In [16]:
import pandas as pd
import numpy as np

In [6]:
#Convert a list to df
l1 = [1,5,9,4,6]
df = pd.DataFrame(l1)
df.columns = ['col_name']
df

Unnamed: 0,col_name
0,1
1,5
2,9
3,4
4,6


In [15]:
#l2 = {'a':[1,5,9,4,6], 'b':[5,7,6,9,10]}
l2 = [[1,5,9,4,6], [5,7,6,9,10]]
df2 = pd.DataFrame(l2).transpose()
df2.columns = ['col_1', 'col_2']
df2

Unnamed: 0,col_1,col_2
0,1,5
1,5,7
2,9,6
3,4,9
4,6,10


## Handling Null Value

In [17]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocade'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocade
dtype: object

In [18]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [19]:
string_data[0] = None
string_data

0         None
1    artichoke
2          NaN
3      avocade
dtype: object

In [20]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [22]:
#Filter out null value
string_data[string_data.notnull()]

1    artichoke
3      avocade
dtype: object

In [24]:
#dropna in df will drop rows containing a missing value
df = pd.DataFrame([[1,6.5, 3.], [1, np.nan, np.nan], [np.nan,np.nan, np.nan]])
df

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,


In [25]:
#Drop both row 2 and row 3
#Use it with caution
df.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [27]:
#passing how=all will only drop row with all values are na
df.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,


In [30]:
df[df[2] != 3.0]

Unnamed: 0,0,1,2
1,1.0,,
2,,,


In [31]:
df[4] = np.nan
df

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,


In [33]:
#add axis = 1 will drop column
df.dropna(axis = 1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,


In [35]:
df.fillna(0)

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,0.0
1,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0


In [38]:
#fill na can be filled with dictionary which fill different number in different column
df.fillna({0:0.000001, 2:0.00005})

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,5e-05,
2,1e-06,,5e-05,


## Data Transformation

In [40]:
d = {'col1': ['one', 'two'] * 3 + ['two'],  'col2': [1,1,2,3,3,4,4]}
df2 = pd.DataFrame(d)
df2

Unnamed: 0,col1,col2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [43]:
df2.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [44]:
df2.drop_duplicates()

Unnamed: 0,col1,col2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [45]:
df2['v1'] = range(7)
df2

Unnamed: 0,col1,col2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [46]:
#drop duplicated will treat all columns, however you can treat duplicated on only one column
df2.drop_duplicates(['col1'])

Unnamed: 0,col1,col2,v1
0,one,1,0
1,two,1,1


In [None]:
#map function will map a column to its new value (page 199~200)

## Detecting and Filtering OUtliers

In [47]:
data = pd.DataFrame(np.random.randn(1000,4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.019066,0.007153,-0.008693,0.039481
std,0.984276,1.025772,0.983857,1.015938
min,-3.149069,-3.086772,-2.868985,-3.342066
25%,-0.650913,-0.719953,-0.684507,-0.673835
50%,-0.035475,0.019267,0.008403,-0.006694
75%,0.650601,0.68322,0.665861,0.745261
max,3.004278,3.100976,3.021697,3.332053


In [50]:
#in column1, max value is 3.10, and min value is -3.08 so maybe anything larger than 3 or smaller than -3 are outlier
col = data[np.abs(data[1]) > 3]
col

Unnamed: 0,0,1,2,3
406,-0.142205,-3.086772,0.433636,-1.309129
483,-0.207248,-3.04601,-1.180079,1.935232
857,-0.01893,3.100976,-1.68291,-0.664559


In [52]:
#any is sort of ignore columns, and try to find whether any value in any column is greater than 3
data[(np.abs(data) > 3).any(1)]

Unnamed: 0,0,1,2,3
349,-3.149069,0.713289,0.328333,0.258608
379,0.951154,-1.600614,3.017108,-0.469842
406,-0.142205,-3.086772,0.433636,-1.309129
483,-0.207248,-3.04601,-1.180079,1.935232
600,-0.930888,0.68241,3.021697,1.764193
737,0.956958,0.630315,-0.334428,-3.342066
745,0.052103,-1.771868,-0.851649,3.332053
766,3.004278,0.448491,-2.006783,1.512742
857,-0.01893,3.100976,-1.68291,-0.664559


In [53]:
data1 = data.head(5)
data1

Unnamed: 0,0,1,2,3
0,-0.99952,-0.034176,1.64584,-0.927079
1,-0.11528,-0.436119,1.895176,1.795754
2,-0.606994,-0.877873,-0.180882,0.134565
3,0.215386,0.517771,-0.221405,0.812861
4,1.031284,-1.171717,0.892808,0.003435


In [55]:
data_to_add = pd.DataFrame(np.zeros(24).reshape(6,4))
data_to_add

Unnamed: 0,0,1,2,3
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0


In [56]:
#there is also append function to append new rows
data1 + data_to_add

Unnamed: 0,0,1,2,3
0,-0.99952,-0.034176,1.64584,-0.927079
1,-0.11528,-0.436119,1.895176,1.795754
2,-0.606994,-0.877873,-0.180882,0.134565
3,0.215386,0.517771,-0.221405,0.812861
4,1.031284,-1.171717,0.892808,0.003435
5,,,,


In [57]:
## Create dummy data
df = pd.DataFrame({'key': ['b','b', 'a', 'c', 'a', 'b'], 'data1': range(6)})
df

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,b


In [60]:
df.get_dummies(df['key'])

AttributeError: 'DataFrame' object has no attribute 'get_dummies'

## String manipulation

In [63]:
s = 'ab, i lo'
s_split = s.split(',')
s_split

['ab', ' i lo']

In [65]:
s_split2 = [i.strip() for i in s_split]
s_split2

['ab', 'i lo']

In [66]:
#Concatenate
'::'.join(s_split2)

'ab::i lo'

In [67]:
'b,' in s

True

In [68]:
s.index(',')

2

In [69]:
#find will not raise exception
s.find('xxxx')

-1

In [70]:
s.count('a')

1