# 1. Pandas: Series

In [0]:
import numpy as np
import pandas as pd

In [0]:
s = pd.Series(['a', 'b', 'c', 'd', 'e', 'f'])

In [5]:
print (s)

0    a
1    b
2    c
3    d
4    e
5    f
dtype: object


### The left column is the index; the right column is the corresponding data

In [0]:
s = pd.Series([2,4,6,8])

In [7]:
print(s)

0    2
1    4
2    6
3    8
dtype: int64


### The series' index can be specified

In [0]:
# Attach explicit string labels 'a'..'d' instead of the default 0..3 index
s = pd.Series(data=[2, 4, 5, 9], index=list('abcd'))

In [14]:
s

a    2
b    4
c    5
d    9
dtype: int64

In [21]:
# Select the value stored under a single index label
s['a']

2

In [0]:
# Select several values at once by passing a list of index labels;
# the result is itself a Series
s[['a', 'c']]

a    2
c    5
dtype: int64

In [0]:
# Values 0..3 labelled 'a'..'d'
s = pd.Series(data=range(4), index=list('abcd'))

In [25]:
s

a    0
b    1
c    2
d    3
dtype: int64

In [0]:
s['a']

0

# 2.  Series' operations

In [26]:
s

a    0
b    1
c    2
d    3
dtype: int64

In [28]:
# Boolean filtering: keep only the entries whose value is >= 2
s[s>=2]

c    2
d    3
dtype: int64

In [0]:
# Comparing the whole Series with a scalar yields a boolean Series
# (one True/False per index label)
s>4

a    False
b    False
c    False
d    False
dtype: bool

In [0]:
# Multiplying a Series by a constant is applied element-wise
s*2

a    0
b    2
c    4
d    6
dtype: int64

# 3. Incomplete data

In [0]:
# Sometimes the data is incomplete: this dict has no entry for the label 'a'
sdata ={'b':100, 'c':150, 'd':200}

In [30]:
s = pd.Series(sdata)
print(s)

b    100
c    150
d    200
dtype: int64


In [0]:
s = pd.Series(sdata, index=['a','b','c','d'])

In [33]:
print(s)

a      NaN
b    100.0
c    150.0
d    200.0
dtype: float64


### NaN: Not a Number — the label 'a' has no entry in `sdata`, so its value is missing (and the dtype becomes float64)

# 4. Pandas: DataFrame

In [0]:
import numpy as np
import pandas as pd

In [0]:
# Price table: one row per item, identified by its 'id'
item_ids = ['ball', 'pencil', 'pen']
item_prices = [12.33, 11.44, 44]
frame1 = pd.DataFrame({'id': item_ids, 'price': item_prices})

In [36]:
print(frame1)

       id  price
0    ball  12.33
1  pencil  11.44
2     pen  44.00


In [0]:
# Colour table: same items as the price table, listed in a different order
colored_ids = ['pencil', 'ball', 'pen']
item_colors = ['red', 'blue', 'green']
frame2 = pd.DataFrame({'id': colored_ids, 'color': item_colors})

In [0]:
print(frame2)

       id  color
0  pencil    red
1    ball   blue
2     pen  green


# 5.  Merge operation

In [0]:
merge_frame = pd.merge(frame1, frame2)

In [0]:
print(merge_frame)

       id  price  color
0    ball  12.33   blue
1  pencil  11.44    red
2     pen  44.00  green


In [0]:
# The resulting DataFrame consists of all rows that 
# have an ID in common

In [0]:
# Usually you need to specify the merge column

In [0]:
merge_frame = pd.merge(frame1, frame2, on='id')

In [0]:
print(merge_frame)

       id  price  color
0    ball  12.33   blue
1  pencil  11.44    red
2     pen  44.00  green


# 6. Cautions: avoid ambiguity in merging

In [0]:
frame1 = pd.DataFrame({'id':['ball', 'pencil', 'pen'], 
                       'color':['white', 'red', 'blue'],
                       'brand':['OMG', 'ABC', 'POD']})

In [42]:
print(frame1)

       id  color brand
0    ball  white   OMG
1  pencil    red   ABC
2     pen   blue   POD


In [0]:
frame2 = pd.DataFrame({'id':['ball', 'pencil', 'pencil', 'pen'],
                        'brand':['OMG', 'ABC', 'POD', 'POD']})

In [44]:
print(frame2)

       id brand
0    ball   OMG
1  pencil   ABC
2  pencil   POD
3     pen   POD


In [45]:
# With no `on` argument, merge joins on every column the two frames share
# (here both 'id' and 'brand'). The 'pencil'/'POD' row of frame2 has no
# matching row in frame1, so the default inner join leaves it out.
merge_frame = pd.merge(frame1, frame2)
print(merge_frame)

       id  color brand
0    ball  white   OMG
1  pencil    red   ABC
2     pen   blue   POD


In [0]:
# Joining on 'id' only: 'brand' is no longer a key, so both frames' brand
# columns are kept and told apart with the _x / _y suffixes. The 'pencil'
# row of frame1 is repeated once for each matching 'pencil' row in frame2.
merge_frame = pd.merge(frame1, frame2, on='id')
print(merge_frame)

       id  color brand_x brand_y
0    ball  white     OMG     OMG
1  pencil    red     ABC     ABC
2  pencil    red     ABC     POD
3     pen   blue     POD     POD


In [0]:
# Joining on 'brand' only: now the two 'id' columns are kept as id_x / id_y,
# and the 'POD' row of frame1 is repeated once for each 'POD' row in frame2.
merge_frame = pd.merge(frame1, frame2, on='brand')
merge_frame

Unnamed: 0,id_x,color,brand,id_y
0,ball,white,OMG,ball
1,pencil,red,ABC,pencil
2,pen,blue,POD,pencil
3,pen,blue,POD,pen


# 7. What if key columns in two DataFrames do not have the same name

In [0]:
frame1 = pd.DataFrame({'id':['ball', 'pencil', 'pen', 'mug', 'ashtray'],
                      'color':['white', 'red', 'red', 'black', 'green'],
                      'brand':['OMG', 'ABC', 'ABC', 'POD', 'POD']})

In [47]:
frame1

Unnamed: 0,id,color,brand
0,ball,white,OMG
1,pencil,red,ABC
2,pen,red,ABC
3,mug,black,POD
4,ashtray,green,POD


In [48]:
frame2 = pd.DataFrame({'sid':['pencil', 'pencil', 'bal', 'pen'],
                      'brand':['OMG', 'POD', 'ABC', 'POD']})
frame2

Unnamed: 0,sid,brand
0,pencil,OMG
1,pencil,POD
2,bal,ABC
3,pen,POD


In [49]:
pd.merge(frame1, frame2, left_on='id', right_on='sid')

Unnamed: 0,id,color,brand_x,sid,brand_y
0,pencil,red,ABC,pencil,OMG
1,pencil,red,ABC,pencil,POD
2,pen,red,ABC,pen,POD


#### Note: different versions of pandas may give different results
#### Always test before you use

# 8. Merge, continued

In [0]:
frame1 = pd.DataFrame({'id':['ball', 'pencil', 'pen', 'mug', 'ashtray'],
                      'color':['white', 'red', 'red', 'black', 'green'],
                      'brand':['OMG', 'ABC', 'ABC', 'POD', 'POD']})

frame2 = pd.DataFrame({'id':['pencil', 'pencil', 'bal', 'pen'],
                      'brand':['OMG', 'POD', 'ABC', 'POD']})

## (1) left join: all rows from left frame and matched rows from right frame

In [0]:
pd.merge(frame1, frame2, on='id', how='left')

Unnamed: 0,id,color,brand_x,brand_y
0,ball,white,OMG,
1,pencil,red,ABC,OMG
2,pencil,red,ABC,POD
3,pen,red,ABC,POD
4,mug,black,POD,
5,ashtray,green,POD,


## (2) right join: all rows from right frame and matched rows from left frame

In [0]:
pd.merge(frame1, frame2, on='id', how='right')

Unnamed: 0,id,color,brand_x,brand_y
0,pencil,red,ABC,OMG
1,pencil,red,ABC,POD
2,pen,red,ABC,POD
3,bal,,,ABC


## (3) outer join: rows included from both frames

In [0]:
pd.merge(frame1, frame2, on='id', how='outer')

Unnamed: 0,id,color,brand_x,brand_y
0,ball,white,OMG,
1,pencil,red,ABC,OMG
2,pencil,red,ABC,POD
3,pen,red,ABC,POD
4,mug,black,POD,
5,ashtray,green,POD,
6,bal,,,ABC


## (4) To merge on multiple keys, pass a list of column names to the `on` option

In [52]:
pd.merge(frame1, frame2, on=['id', 'brand'], how='outer')

Unnamed: 0,id,color,brand
0,ball,white,OMG
1,pencil,red,ABC
2,pen,red,ABC
3,mug,black,POD
4,ashtray,green,POD
5,pencil,,OMG
6,pencil,,POD
7,bal,,ABC
8,pen,,POD


# 9. Concatenating

In [0]:
# Draw the demo values from a locally seeded generator so that a fresh
# "Restart & Run All" reproduces the same numbers every time, without
# touching NumPy's global random state used by other cells.
rng = np.random.RandomState(0)
ser1 = pd.Series(rng.rand(4), index=[1,2,3,4])  # labels 1..4
ser2 = pd.Series(rng.rand(4), index=[5,6,7,8])  # labels 5..8 (no overlap with ser1)

In [55]:
print(ser1)

1    0.997299
2    0.865165
3    0.580570
4    0.415960
dtype: float64


In [56]:
print(ser2)

5    0.409901
6    0.218293
7    0.517104
8    0.592321
dtype: float64


In [0]:
ser3 = pd.concat([ser1, ser2])
print(ser3)

1    0.997299
2    0.865165
3    0.580570
4    0.415960
5    0.409901
6    0.218293
7    0.517104
8    0.592321
dtype: float64


# 10. Concatenating dataframes

In [0]:
import pandas as pd
import numpy as np
# 3x3 frame of uniform random values in [0, 1); rows labelled 1..3
frame1 = pd.DataFrame(
    np.random.rand(3, 3),
    index=[1, 2, 3],
    columns=list('ABC'),
)

In [65]:
frame1

Unnamed: 0,A,B,C
1,0.994811,0.755574,0.743672
2,0.073688,0.910137,0.18364
3,0.313254,0.375571,0.94183


In [0]:
# Second 3x3 random frame with the same columns; rows labelled 4..6
frame2 = pd.DataFrame(
    np.random.rand(3, 3),
    index=[4, 5, 6],
    columns=list('ABC'),
)

In [67]:
frame2

Unnamed: 0,A,B,C
4,0.547571,0.390271,0.283775
5,0.085853,0.587533,0.235823
6,0.303593,0.194694,0.218747


In [68]:
pd.concat([frame1, frame2])

Unnamed: 0,A,B,C
1,0.994811,0.755574,0.743672
2,0.073688,0.910137,0.18364
3,0.313254,0.375571,0.94183
4,0.547571,0.390271,0.283775
5,0.085853,0.587533,0.235823
6,0.303593,0.194694,0.218747


#### The default concatenation axis is axis 0

In [73]:
pd.concat([frame1, frame2], axis=1)

Unnamed: 0,A,B,C,A.1,B.1,C.1
1,0.994811,0.755574,0.743672,,,
2,0.073688,0.910137,0.18364,,,
3,0.313254,0.375571,0.94183,,,
4,,,,0.547571,0.390271,0.283775
5,,,,0.085853,0.587533,0.235823
6,,,,0.303593,0.194694,0.218747


# 11. Combine

In [0]:
# combine_first is useful when two datasets share the same index, fully or partially

In [75]:
ser1 = pd.Series(np.random.rand(5),index=[1,2,3,4,5])
print(ser1)

1    0.525024
2    0.436127
3    0.106351
4    0.604588
5    0.785601
dtype: float64


In [0]:
ser2 = pd.Series(np.random.rand(5),index=[4,5,6,7,8])

In [77]:
print(ser2)

4    0.807741
5    0.455233
6    0.495536
7    0.046609
8    0.661808
dtype: float64


In [78]:
ser1.combine_first(ser2)

1    0.525024
2    0.436127
3    0.106351
4    0.604588
5    0.785601
6    0.495536
7    0.046609
8    0.661808
dtype: float64

In [0]:
# if only partial overlap is wanted

In [79]:
# Take the first three entries of each Series by position. `.iloc` makes it
# explicit that the slice is positional: the index holds integer labels, so a
# bare `[:3]` is easy to misread as "labels up to 3".
# combine_first then keeps the caller's values and fills in any labels that
# exist only in the argument.
ser1.iloc[:3].combine_first(ser2.iloc[:3])

1    0.525024
2    0.436127
3    0.106351
4    0.807741
5    0.455233
6    0.495536
dtype: float64

# 12. Removing columns and rows

In [0]:
# 3x3 grid holding the integers 0..8: one row per colour, one column per item
row_labels = ['white', 'black', 'red']
col_labels = ['ball', 'pen', 'pencil']
cell_values = np.arange(9).reshape(3, 3)
frame1 = pd.DataFrame(cell_values, index=row_labels, columns=col_labels)

In [0]:
frame1

Unnamed: 0,ball,pen,pencil
white,0,1,2
black,3,4,5
red,6,7,8


In [0]:
# `del` removes the 'ball' column in place, so frame1 itself is changed.
# Re-running this cell raises KeyError because the column is already gone.
del frame1['ball']

In [0]:
frame1

Unnamed: 0,pen,pencil
white,1,2
black,4,5
red,7,8


In [82]:
# drop() returns a new frame without the row labelled 'white';
# frame1 itself is left unchanged unless the result is assigned back.
frame1.drop('white')

Unnamed: 0,pen,pencil
black,4,5
red,7,8


# 13&14. Data aggregation: Groupby

In [0]:
# Sample data for the groupby examples, written one record per row:
# (color, object, price1, price2)
price_records = [
    ('white', 'pen', 5.5, 4.75),
    ('red', 'pencil', 4.2, 4.12),
    ('green', 'pencil', 1.3, 1.60),
    ('red', 'ashtray', 0.56, 0.75),
    ('green', 'pen', 2.75, 3.15),
]
frame = pd.DataFrame(price_records,
                     columns=['color', 'object', 'price1', 'price2'])

In [85]:
frame

Unnamed: 0,color,object,price1,price2
0,white,pen,5.5,4.75
1,red,pencil,4.2,4.12
2,green,pencil,1.3,1.6
3,red,ashtray,0.56,0.75
4,green,pen,2.75,3.15


In [0]:
### If we want to calculate the average of price1 for each color group

In [0]:
# Group the rows by 'color', then pick the 'price1' column from each group;
# the result is a lazy SeriesGroupBy object, nothing is computed yet
group = frame.groupby('color')['price1']

In [88]:
print(group) #This returns a groupby operation object only

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7fb6965ddb38>


In [89]:
group.groups #It contains a dictionary to store the information

{'green': Int64Index([2, 4], dtype='int64'),
 'red': Int64Index([1, 3], dtype='int64'),
 'white': Int64Index([0], dtype='int64')}

In [90]:
group.sum() #Calculate the sum of prices

color
green    4.05
red      4.76
white    5.50
Name: price1, dtype: float64

In [91]:
group.mean() #Calculate the mean of prices

color
green    2.025
red      2.380
white    5.500
Name: price1, dtype: float64

In [92]:
# Iterating over a groupby yields (group key, sub-frame) pairs.
# The sub-frame is bound to `color_rows` rather than `group`, so the loop does
# not overwrite the SeriesGroupBy object named `group` that the sum()/mean()
# cells rely on if they are re-run afterwards.
for name, color_rows in frame.groupby('color'):
    print(name)        # the group key, i.e. the colour
    print(color_rows)  # all rows of `frame` having that colour

green
   color  object  price1  price2
2  green  pencil    1.30    1.60
4  green     pen    2.75    3.15
red
  color   object  price1  price2
1   red   pencil    4.20    4.12
3   red  ashtray    0.56    0.75
white
   color object  price1  price2
0  white    pen     5.5    4.75
