In [None]:
import pandas as pd

# Read the cereal data from the course repository and, in the same chain,
# discard the columns this lesson does not use.
cereal = (
    pd.read_csv('https://raw.githubusercontent.com/UBC-MDS/programming-in-python-for-data-science/master/data/cereal.csv')
    .drop(columns=['type', 'shelf', 'weight', 'cups', 'rating'])
)
cereal.head()

In [None]:
# Group the rows by manufacturer. Note that despite its name, `cereal_df`
# holds the grouped object returned by groupby(), not a data frame, so
# displaying it shows only that object's repr.
cereal_df = cereal.groupby(by='mfr')
cereal_df

In [None]:
# groupby() above did not modify `cereal`; show its first rows again.
cereal.head()

# Week 3

- Tidy Data
- Pivot & Melt
- Difference Between Pivot and Pivot Table
- Merging Data Frames

## Tidy Data

A data set is tidy when:

* each row is a single observation,
* each column is a single variable, and
* each value is a single cell (i.e., its entry in the data frame is not shared with another value).

- The concept stems from a paper written by Hadley Wickham in 2014.

- We tidy our data to create a standard across multiple analysis tools. 

![](https://datasciencebook.ca/_main_files/figure-html/02-tidy-image-1.png)

## What to do when data is not tidy...

Data sometimes comes in tables that are laid out to be easy for humans to read and make sense of.

Such tables are not necessarily the best format for computers. Sometimes they also do not present the information in a straightforward way.

## Wide to Long

## Goal

![](https://datasciencebook.ca/img/pivot_functions/pivot_functions.001.jpeg)

## Discussion

Why do we do this? 

- Looking at the table, which city has the largest population and in what year was it?

What do you think we could do to make it even easier to find?

## Long to Wide

![](https://datasciencebook.ca/img/pivot_functions/pivot_functions.002.jpeg)

### How to Manipulate Long and Wide Tables In Python

### Melt

> wide to long format

[Documentation](https://pandas.pydata.org/docs/reference/api/pandas.melt.html)

![](https://pandas.pydata.org/docs/_images/reshaping_melt.png)

In [None]:
import pandas as pd

# Load the cereal data again and keep only the columns used in the
# melt example below.
cereal = (
    pd.read_csv('https://raw.githubusercontent.com/UBC-MDS/programming-in-python-for-data-science/master/data/cereal.csv')
    .drop(columns=['type', 'shelf', 'weight', 'cups', 'rating'])
)
cereal.head()

In [None]:
# In the wide table, all measurements for one cereal sit in a single row.
is_bran = cereal['name'] == '100% Bran'
cereal.loc[is_bran]

In [None]:
# Wide -> long: keep 'name' and 'mfr' as identifiers and stack every other
# column into 'variable' / 'value' pairs.
cereal_long = pd.melt(cereal, id_vars=['name', 'mfr'])
# The same cereal now occupies one row per measured variable.
cereal_long.loc[cereal_long['name'] == '100% Bran']

### Pivot

> long to wide format

[Documentation](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.pivot.html)


![](https://pandas.pydata.org/docs/_images/reshaping_pivot.png)

In [None]:
# Load the untidy (long-format) lego data from the course repository and
# preview the first rows.
lego = pd.read_csv('https://raw.githubusercontent.com/UBC-MDS/programming-in-python-for-data-science/master/data/lego_untidy.csv')
lego.head()

In [None]:
# Long -> wide: one row per set number, one column per distinct 'lego_info'
# label, with cells filled from the 'value' column.
lego_tidy = lego.pivot(index='set_num', columns='lego_info', values='value')
lego_tidy.head()

In [None]:
# pivot() left 'set_num' as the row index; reset_index() turns it back into
# an ordinary column.
lego_tidy2 = lego_tidy.reset_index()
lego_tidy2

In [None]:
# What if I want to use the name?
# NOTE(review): pivot() needs every (index, columns) pair to be unique. If
# several sets share a name this call presumably raises ValueError — confirm,
# and if so say that in the narrative, since the next cell switches to
# pivot_table().
lego.pivot(index='name',
           columns='lego_info',
           values='value')

In [None]:
# Pivot on the (name, set_num) pair so that sets sharing a name stay on
# separate rows. Unlike pivot(), pivot_table() aggregates any duplicate
# entries instead of refusing them.
lego_tidy3 = lego.pivot_table(
    index=['name', 'set_num'],
    columns='lego_info',
    values='value',
)
lego_tidy3

In [None]:
# Count how many rows (sets) share each name.
lego_flat = lego_tidy3.reset_index()
lego_flat.value_counts(subset='name')

In [None]:
# Flatten the (name, set_num) index into columns, then look at every set
# called 'Basic Building Set'.
lt4 = lego_tidy3.reset_index()
is_basic_set = lt4['name'] == 'Basic Building Set'
lt4.loc[is_basic_set]

## Merging Data Frames

In [None]:
# Two small frames that share the key columns 'Animal' and 'id'.
# Written row by row so each observation reads across.
df = pd.DataFrame(
    [
        ('Falcon', 1, 390.0),
        ('Falcon', 2, 370.0),
        ('Parrot', 1, 24.0),
        ('Parrot', 2, 26.0),
    ],
    columns=['Animal', 'id', 'Max Speed'],
)

df_food = pd.DataFrame(
    [
        ('Falcon', 1, 'Dry'),
        ('Falcon', 2, 'Meat'),
        ('Parrot', 1, 'Seeds'),
        ('Parrot', 2, 'Fruits'),
    ],
    columns=['Animal', 'id', 'Food'],
)

In [None]:
# Preview the speed table (left side of the merges below).
df.head()

In [None]:
# The food table (right side of the merges below).
df_food

In [None]:
# Joining on 'Animal' alone pairs each df row with every df_food row for the
# same animal. Both frames also have an 'id' column that is not a join key,
# so it appears twice in the result with suffixes.
pd.merge(df, df_food, on='Animal')

In [None]:
# Joining on both key columns matches each (Animal, id) pair to its
# counterpart in df_food.
join_keys = ['Animal', 'id']
pd.merge(df, df_food, on=join_keys)

In [None]:
# Rebuild the example so the key columns are named differently in each
# frame ('an' / 'Id' in df versus 'Animal' / 'id' in df_food).
# Note: this replaces the `df` and `df_food` defined earlier.
df = pd.DataFrame(
    [
        ('Falcon', 1, 380.0),
        ('Falcon', 2, 370.0),
        ('Parrot', 1, 24.0),
        ('Parrot', 2, 26.0),
    ],
    columns=['an', 'Id', 'Max Speed'],
)
df_food = pd.DataFrame(
    [
        ('Falcon', 1, 'Dry'),
        ('Falcon', 2, 'Meat'),
        ('Parrot', 1, 'Seeds'),
        ('Parrot', 2, 'Fruits'),
    ],
    columns=['Animal', 'id', 'Food'],
)

In [None]:
# Same contents as the `df` built in the previous cell, under a more
# descriptive name.
df_animal = pd.DataFrame(
    [
        ('Falcon', 1, 380.0),
        ('Falcon', 2, 370.0),
        ('Parrot', 1, 24.0),
        ('Parrot', 2, 26.0),
    ],
    columns=['an', 'Id', 'Max Speed'],
)

In [None]:
# Left frame: key columns are named 'an' and 'Id'.
df

In [None]:
# Right frame: key columns are named 'Animal' and 'id'.
df_food

In [None]:
# Join two frames whose key columns have different names: df's ('an', 'Id')
# are matched against df_food's ('Animal', 'id'), position by position.
# Because the names differ, both sets of key columns appear in the result.
# `copy=False` was removed: the keyword is deprecated in pandas 3.0, where
# Copy-on-Write makes it a no-op, and the merged values do not depend on it.
df.merge(df_food, left_on=['an', 'Id'], right_on=['Animal', 'id'])

In [30]:
import pandas as pd
# Speed table in which the key (Falcon, 1) appears twice, so grouping and
# merging on (Animal, id) have something to combine.
df = pd.DataFrame(
    [
        ('Falcon', 1, 390.0),
        ('Falcon', 2, 350.0),
        ('Falcon', 1, 370.0),
        ('Parrot', 1, 24.0),
        ('Parrot', 2, 26.0),
    ],
    columns=['Animal', 'id', 'Max Speed'],
)

# Food table with exactly one row per (Animal, id) pair.
df_food = pd.DataFrame(
    [
        ('Falcon', 1, 'Dry'),
        ('Falcon', 2, 'Meat'),
        ('Parrot', 1, 'Seeds'),
        ('Parrot', 2, 'Fruits'),
    ],
    columns=['Animal', 'id', 'Food'],
)

In [31]:
# Display the speed table defined in the previous cell.
df

Unnamed: 0,Animal,id,Max Speed
0,Falcon,1,390.0
1,Falcon,2,350.0
2,Falcon,1,370.0
3,Parrot,1,24.0
4,Parrot,2,26.0


In [9]:
# Display the food table.
df_food

Unnamed: 0,Animal,id,Food
0,Falcon,1,Dry
1,Falcon,2,Meat
2,Parrot,1,Seeds
3,Parrot,2,Fruits


In [4]:
# Attach each animal's food to its speed record by matching on both
# 'Animal' and 'id'.
pd.merge(df, df_food, on=['Animal', 'id'])

Unnamed: 0,Animal,id,Max Speed,Food
0,Falcon,1,390.0,Dry
1,Falcon,2,370.0,Meat
2,Parrot,1,24.0,Seeds
3,Parrot,2,26.0,Fruits


In [5]:
# concat stacks the frames on top of each other (rows of df, then rows of
# df_food). A column missing from one frame is filled with NaN for that
# frame's rows, and each frame keeps its own row labels, so index values
# repeat in the result.
pd.concat([df, df_food])

Unnamed: 0,Animal,id,Max Speed,Food
0,Falcon,1,390.0,
1,Falcon,2,370.0,
2,Parrot,1,24.0,
3,Parrot,2,26.0,
0,Parrot,1,,Seeds
1,Parrot,2,,Fruits
2,Falcon,1,,Dry
3,Falcon,2,,Meat


In [10]:
# With axis=1 the frames are placed side by side, lined up by row label
# rather than by key values; shared column names ('Animal', 'id') therefore
# appear twice in the result.
pd.concat([df, df_food], axis = 1)

Unnamed: 0,Animal,id,Max Speed,Animal.1,id.1,Food
0,Falcon,1,390.0,Falcon,1,Dry
1,Falcon,2,370.0,Falcon,2,Meat
2,Parrot,1,24.0,Parrot,1,Seeds
3,Parrot,2,26.0,Parrot,2,Fruits


# DataFrames and Series 

Python data objects:

- numbers (integers / floats)
- strings
- lists
- dictionaries


pandas objects:
- Series
- DataFrames

In [11]:
# An integer. The notebook echoes the value of a cell's last expression.
5

5

In [12]:
# Adding two integers evaluates to another integer.
5+6

11

In [13]:
# A string: text wrapped in quotes.
'Hello'

'Hello'

In [14]:
# A list: an ordered collection written in square brackets (here, integers).
[1, 2, 4]

[1, 2, 4]

In [15]:
# A list whose elements are strings.
['hello', 'world']

['hello', 'world']

In [16]:
# A single list may hold elements of different types.
[1, 'hello']

[1, 'hello']

In [32]:
# A pandas DataFrame: the speed table used in the cells below.
df

Unnamed: 0,Animal,id,Max Speed
0,Falcon,1,390.0
1,Falcon,2,350.0
2,Falcon,1,370.0
3,Parrot,1,24.0
4,Parrot,2,26.0


In [24]:
# A plain Python list holding two of df's column names.
my_list = ['Animal', 'Max Speed']
my_list

['Animal', 'Max Speed']

In [27]:
# Select several columns by passing a list of names: the outer brackets
# index the frame, the inner brackets are the list. The result is a DataFrame.
df.loc[:, ['Animal', 'Max Speed']]

Unnamed: 0,Animal,Max Speed
0,Falcon,390.0
1,Falcon,370.0
2,Parrot,24.0
3,Parrot,26.0


In [38]:
# Average the remaining numeric column within each (Animal, id) pair; the
# two grouping columns become the levels of the result's row index.
df.groupby(['Animal', 'id']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Max Speed
Animal,id,Unnamed: 2_level_1
Falcon,1,380.0
Falcon,2,350.0
Parrot,1,24.0
Parrot,2,26.0


In [18]:
# shape is an attribute (no parentheses): a (rows, columns) tuple.
df.shape

(4, 3)

In [19]:
# The column labels, returned as a pandas Index.
df.columns

Index(['Animal', 'id', 'Max Speed'], dtype='object')

In [20]:
# Selecting one column with a single label returns a Series, not a DataFrame.
df['Animal']

0    Falcon
1    Falcon
2    Parrot
3    Parrot
Name: Animal, dtype: object

In [21]:
# Keep only the grouping column and the numeric column to be averaged.
df2 = df.loc[:, ['Animal', 'Max Speed']]
df2

Unnamed: 0,Animal,Max Speed
0,Falcon,390.0
1,Falcon,370.0
2,Parrot,24.0
3,Parrot,26.0


In [22]:

# Mean 'Max Speed' per animal; 'Animal' becomes the row index of the result.
df2.groupby('Animal').mean()

Unnamed: 0_level_0,Max Speed
Animal,Unnamed: 1_level_1
Falcon,380.0
Parrot,25.0
