In [1]:
import pandas as pd
import numpy as np
import re

# generate datetime index

In [2]:
dates = pd.date_range('1/1/2000', periods=7)
dates

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07'],
              dtype='datetime64[ns]', freq='D')

# drop NaN

In [3]:
dropNanDf = pd.DataFrame([
    [1.1, 2.2, 3.3], 
    [4.4, np.NaN, np.NaN],
    [np.NaN, np.NaN, np.NaN], 
    [np.NaN, 11.11, 12.12]
])
dropNanDf

Unnamed: 0,0,1,2
0,1.1,2.2,3.3
1,4.4,,
2,,,
3,,11.11,12.12


In [4]:
dropNanDf.dropna()

Unnamed: 0,0,1,2
0,1.1,2.2,3.3


In [5]:
dropNanDf.dropna(how='all')

Unnamed: 0,0,1,2
0,1.1,2.2,3.3
1,4.4,,
3,,11.11,12.12


In [6]:
dropNanDf.dropna(thresh=2)

Unnamed: 0,0,1,2
0,1.1,2.2,3.3
3,,11.11,12.12


# fill NaN

In [7]:
fillNanDf = pd.DataFrame([
    [1.1, 2.2, 3.3], 
    [4.4, np.NaN, np.NaN],
    [np.NaN, np.NaN, np.NaN], 
    [np.NaN, 11.11, 12.12]
])
fillNanDf

Unnamed: 0,0,1,2
0,1.1,2.2,3.3
1,4.4,,
2,,,
3,,11.11,12.12


In [8]:
fillNanDf.fillna(-1)

Unnamed: 0,0,1,2
0,1.1,2.2,3.3
1,4.4,-1.0,-1.0
2,-1.0,-1.0,-1.0
3,-1.0,11.11,12.12


In [9]:
fillNanDf.fillna({0: - 100, 1: -200, 2: -300})

Unnamed: 0,0,1,2
0,1.1,2.2,3.3
1,4.4,-200.0,-300.0
2,-100.0,-200.0,-300.0
3,-100.0,11.11,12.12


In [10]:
fillNanDf.fillna(method='ffill')

Unnamed: 0,0,1,2
0,1.1,2.2,3.3
1,4.4,2.2,3.3
2,4.4,2.2,3.3
3,4.4,11.11,12.12


In [11]:
fillNanDf.fillna(method='ffill', limit=1)

Unnamed: 0,0,1,2
0,1.1,2.2,3.3
1,4.4,2.2,3.3
2,4.4,,
3,,11.11,12.12


# remove duplicate

In [12]:
removeDuplicateDf = pd.DataFrame({
    'col_0': ['one', 'two'] * 3 + ['two'],
    'col_1': [1, 1, 2, 3, 3, 4, 4]
})
removeDuplicateDf

Unnamed: 0,col_0,col_1
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [13]:
removeDuplicateDf.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [14]:
removeDuplicateDf.drop_duplicates()

Unnamed: 0,col_0,col_1
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [15]:
removeDuplicateDf.drop_duplicates(['col_0'])

Unnamed: 0,col_0,col_1
0,one,1
1,two,1


In [16]:
removeDuplicateDf.drop_duplicates(['col_0'], keep='last')

Unnamed: 0,col_0,col_1
4,one,3
6,two,4


# mapping data

In [17]:
mappingDataDf = pd.DataFrame({
    'col_0': ['one', 'two'] * 3 + ['two'],
    'col_1': [1, 1, 2, 3, 3, 4, 4]
})
mappingDataDf

Unnamed: 0,col_0,col_1
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [18]:
mappingData = {
    'one': '111',
    'two': '222'
}
mappingData

{'one': '111', 'two': '222'}

In [19]:
mappingDataDf['col_2'] = mappingDataDf['col_0'].map(mappingData)
mappingDataDf

Unnamed: 0,col_0,col_1,col_2
0,one,1,111
1,two,1,222
2,one,2,111
3,two,3,222
4,one,3,111
5,two,4,222
6,two,4,222


In [20]:
mappingDataDf['col_3'] = mappingDataDf['col_1'].map(lambda x : x * 10)
mappingDataDf

Unnamed: 0,col_0,col_1,col_2,col_3
0,one,1,111,10
1,two,1,222,10
2,one,2,111,20
3,two,3,222,30
4,one,3,111,30
5,two,4,222,40
6,two,4,222,40


# replace data

In [21]:
replaceDataDf = pd.DataFrame({
    'col_0': [0, 1, 2],
    'col_1': [1, 2, 3],
    'col_2': [2, 3, 4]
})
replaceDataDf

Unnamed: 0,col_0,col_1,col_2
0,0,1,2
1,1,2,3
2,2,3,4


In [22]:
replaceDataDf.replace({1: np.nan, 2: 999})

Unnamed: 0,col_0,col_1,col_2
0,0.0,,999.0
1,,999.0,3.0
2,999.0,3.0,4.0


# binning data

In [23]:
binningDataArr = range(10)
binningDataArr

range(0, 10)

In [24]:
binningCategory = pd.cut(binningDataArr, [0, 3, 7, 10], right=False, labels=['group_0', 'group_1', 'group_2'])
binningCategory.value_counts()

group_0    3
group_1    4
group_2    3
dtype: int64

# quantile binning data

In [25]:
quantileBinningDataArr = range(10)
quantileBinningDataArr

range(0, 10)

In [26]:
quantileBinningCategory = pd.qcut(quantileBinningDataArr, 4, labels=['group_0', 'group_1', 'group_2', 'group_3'])
quantileBinningCategory.value_counts()

group_0    3
group_1    2
group_2    2
group_3    3
dtype: int64

# processing outliers

In [27]:
processingOutliersDf = pd.DataFrame(np.random.randn(1000, 4))
processingOutliersDf.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.02429,-0.013644,-0.007653,-0.022722
std,0.983841,0.963776,1.005689,1.025081
min,-3.184288,-3.392549,-3.006771,-2.75847
25%,-0.649017,-0.665254,-0.692262,-0.745919
50%,-0.009007,-0.049716,-0.054977,-0.024507
75%,0.742625,0.671504,0.648856,0.705392
max,3.020238,2.775171,3.394307,3.465742


In [28]:
processingOutlierColDf = processingOutliersDf[0]
processingOutlierColDf[np.abs(processingOutlierColDf) > 3]

111    3.020238
600   -3.184288
Name: 0, dtype: float64

In [29]:
processingOutliersDf[(np.abs(processingOutliersDf) > 3).any(1)]

Unnamed: 0,0,1,2,3
18,1.117079,0.3103,-0.933655,3.340034
109,-0.59287,-0.519112,-3.006771,1.361079
111,3.020238,0.421884,1.85176,-1.373482
219,-1.445028,-3.099295,0.257384,-2.284819
361,-0.169063,-1.65847,0.60836,3.465742
600,-3.184288,2.146542,-0.881325,-1.189652
624,0.094992,0.336872,3.394307,0.92297
836,0.599736,0.768038,0.525864,3.03339
842,1.020463,-1.986128,3.171229,0.439654
962,0.488081,-3.392549,-0.81885,0.524156


In [30]:
processingOutliersDf[np.abs(processingOutliersDf) > 3]

Unnamed: 0,0,1,2,3
0,,,,
1,,,,
2,,,,
3,,,,
4,,,,
5,,,,
6,,,,
7,,,,
8,,,,
9,,,,


In [31]:
processingOutliersDf[np.abs(processingOutliersDf) > 3] = np.sign(processingOutliersDf) * 3
processingOutliersDf

Unnamed: 0,0,1,2,3
0,1.130670,0.286027,-1.365768,-0.236195
1,0.939172,0.699501,1.189895,-0.979461
2,-1.136555,-0.040395,0.153876,-0.235864
3,-0.722295,-1.529103,-0.897711,-0.344310
4,0.904341,-0.183510,0.488359,1.473038
5,0.274489,-1.093369,0.517638,-1.139070
6,1.661969,0.462754,2.323653,-0.529238
7,-0.010258,1.433307,0.394644,-0.523258
8,-0.363973,-1.398269,1.319785,0.012721
9,0.641540,-0.438663,0.456106,-0.338422


# sampling data

In [32]:
samplingDataDf = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))
samplingDataDf

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [33]:
samplingDataSampler = np.random.permutation(5)
samplingDataSampler

array([0, 4, 3, 1, 2])

In [34]:
samplingDataDf.take(samplingDataSampler)

Unnamed: 0,0,1,2,3
0,0,1,2,3
4,16,17,18,19
3,12,13,14,15
1,4,5,6,7
2,8,9,10,11


In [35]:
samplingDataDf.sample(n=3)

Unnamed: 0,0,1,2,3
4,16,17,18,19
2,8,9,10,11
1,4,5,6,7


# dummy variable

In [36]:
dummyVariableDf = pd.DataFrame({'col_0': ['b', 'b', 'a', 'c', 'a', 'b']})
dummyVariableDf

Unnamed: 0,col_0
0,b
1,b
2,a
3,c
4,a
5,b


In [37]:
pd.get_dummies(dummyVariableDf['col_0'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [38]:
dummyVariable = np.random.rand(10)
dummyVariable

array([0.78821501, 0.53316194, 0.64619846, 0.67087166, 0.68815645,
       0.63843844, 0.12880829, 0.24834596, 0.05333795, 0.89217007])

In [39]:
dummyVariableBins = [0, 0.2, 0.4, 0.6, 0.8, 1]
dummyVariableBins

[0, 0.2, 0.4, 0.6, 0.8, 1]

In [40]:
pd.get_dummies(pd.cut(dummyVariable, dummyVariableBins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,1,0
1,0,0,1,0,0
2,0,0,0,1,0
3,0,0,0,1,0
4,0,0,0,1,0
5,0,0,0,1,0
6,1,0,0,0,0
7,0,1,0,0,0
8,1,0,0,0,0
9,0,0,0,0,1


# string function

In [41]:
stringFunctionDf = pd.DataFrame({
    'col_0': ['abc', 'xyz', '123']
})
stringFunctionDf

Unnamed: 0,col_0
0,abc
1,xyz
2,123


In [42]:
stringFunctionDf['col_0'].str.contains('xy')

0    False
1     True
2    False
Name: col_0, dtype: bool

In [43]:
stringFunctionDf['col_0'].str.findall('[a1]', flags=re.IGNORECASE)

0    [a]
1     []
2    [1]
Name: col_0, dtype: object

In [44]:
stringFunctionDf['col_0'].str.match('[a1]', flags=re.IGNORECASE)

0     True
1    False
2     True
Name: col_0, dtype: bool