In [1]:
import pandas as pd
import numpy as np
import re

# create DataFrame

In [2]:
# Column-oriented sample data reused by the examples below: one text column,
# one integer column and one float column, six values each.
data = dict(
    string=list('abcdef'),
    int=[0, 1, 2, 1, 2, 3],
    float=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)

In [3]:
# Build a DataFrame from the dict of equal-length lists: every key becomes a
# column, and the rows receive the default integer index 0..5.
df = pd.DataFrame(data)
df

Unnamed: 0,float,int,string
0,1.5,0,a
1,1.7,1,b
2,3.6,2,c
3,2.4,1,d
4,2.9,2,e
5,3.2,3,f


In [4]:
dfWithIndex = pd.DataFrame(data, index=["one", "two", "three", "four", "five", "six"])
dfWithIndex

Unnamed: 0,float,int,string
one,1.5,0,a
two,1.7,1,b
three,3.6,2,c
four,2.4,1,d
five,2.9,2,e
six,3.2,3,f


In [5]:
# Build a frame from a dict of dicts: outer keys become columns, inner keys
# become row labels. A label that is missing from a column shows up as NaN.
dfWithDict = pd.DataFrame(
    {
        "float": dict(one=1.1, two=2.2, three=3.3),
        "int": dict(two=2, three=3, four=4),
        "string": dict(three="aaa", four="bbb", five="ccc"),
    }
)
dfWithDict

Unnamed: 0,float,int,string
five,,,ccc
four,,4.0,bbb
one,1.1,,
three,3.3,3.0,aaa
two,2.2,2.0,


# check index is unique

In [6]:
dfWithIndex.index.is_unique

True

# sort by index

In [7]:
dfWithIndex.sort_index()

Unnamed: 0,float,int,string
five,2.9,2,e
four,2.4,1,d
one,1.5,0,a
six,3.2,3,f
three,3.6,2,c
two,1.7,1,b


In [8]:
# axis=1 sorts the column labels (not the rows); ascending=False puts them in
# reverse alphabetical order. The row order is left untouched.
dfWithIndex.sort_index(axis=1, ascending=False)

Unnamed: 0,string,int,float
one,a,0,1.5
two,b,1,1.7
three,c,2,3.6
four,d,1,2.4
five,e,2,2.9
six,f,3,3.2


# sort by value

In [9]:
# Sort rows by the 'int' column first; rows with equal 'int' are then ordered
# by 'string'.
dfWithIndex.sort_values(by=["int", "string"])

Unnamed: 0,float,int,string
one,1.5,0,a
two,1.7,1,b
four,2.4,1,d
three,3.6,2,c
five,2.9,2,e
six,3.2,3,f


# head

In [10]:
df.head()

Unnamed: 0,float,int,string
0,1.5,0,a
1,1.7,1,b
2,3.6,2,c
3,2.4,1,d
4,2.9,2,e


# set columns order

In [11]:
columnsOrderDf = pd.DataFrame(data, columns=['string', 'int', 'float'])
columnsOrderDf

Unnamed: 0,string,int,float
0,a,0,1.5
1,b,1,1.7
2,c,2,3.6
3,d,1,2.4
4,e,2,2.9
5,f,3,3.2


# get specific column

In [12]:
df["string"]

0    a
1    b
2    c
3    d
4    e
5    f
Name: string, dtype: object

In [13]:
df.string

0    a
1    b
2    c
3    d
4    e
5    f
Name: string, dtype: object

# get specific row

In [14]:
# Label-based lookup: the row labelled "three", restricted to two columns.
dfWithIndex.loc["three", ["int", "string"]]

int       2
string    c
Name: three, dtype: object

In [15]:
# Position-based lookup of the same cells: third row (position 2), columns at
# positions 1 and 2.
dfWithIndex.iloc[2, [1, 2]]

int       2
string    c
Name: three, dtype: object

# update value

In [16]:
updateValueDf = pd.DataFrame(data)

In [17]:
updateValueDf["string"] = "x"
updateValueDf

Unnamed: 0,float,int,string
0,1.5,0,x
1,1.7,1,x
2,3.6,2,x
3,2.4,1,x
4,2.9,2,x
5,3.2,3,x


In [18]:
updateValueDf["float"] = np.arange(6.)
updateValueDf

Unnamed: 0,float,int,string
0,0.0,0,x
1,1.0,1,x
2,2.0,2,x
3,3.0,1,x
4,4.0,2,x
5,5.0,3,x


In [19]:
# Assigning a Series aligns on the index rather than on position: only the
# rows labelled 1, 3 and 5 receive a value, every other row becomes NaN.
series = pd.Series([100, 300, 500], index=[1, 3, 5])
updateValueDf["float"] = series
updateValueDf

Unnamed: 0,float,int,string
0,,0,x
1,100.0,1,x
2,,2,x
3,300.0,1,x
4,,2,x
5,500.0,3,x


# add new column

In [20]:
# Assigning to a column name that does not exist yet appends a new column;
# here it holds the boolean result of an element-wise comparison on 'int'.
addNewColumnDf = pd.DataFrame(data)
addNewColumnDf['new_column'] = addNewColumnDf.int > 1
addNewColumnDf

Unnamed: 0,float,int,string,new_column
0,1.5,0,a,False
1,1.7,1,b,False
2,3.6,2,c,True
3,2.4,1,d,False
4,2.9,2,e,True
5,3.2,3,f,True


# delete column

In [21]:
deleteColumnDf_0 = pd.DataFrame(data)
del deleteColumnDf_0["int"]
deleteColumnDf_0

Unnamed: 0,float,string
0,1.5,a
1,1.7,b
2,3.6,c
3,2.4,d
4,2.9,e
5,3.2,f


In [22]:
deleteColumnDf_1 = pd.DataFrame(data)
deleteColumnDf_1.drop(["string"], axis='columns')

Unnamed: 0,float,int
0,1.5,0
1,1.7,1
2,3.6,2
3,2.4,1
4,2.9,2
5,3.2,3


In [23]:
deleteColumnDf_inplace = pd.DataFrame(data)
deleteColumnDf_inplace.drop(["string"], axis='columns', inplace=True)
deleteColumnDf_inplace

Unnamed: 0,float,int
0,1.5,0
1,1.7,1
2,3.6,2
3,2.4,1
4,2.9,2
5,3.2,3


# delete record

In [24]:
deleteRecordDf = pd.DataFrame(data)
deleteRecordDf.drop([2,4])

Unnamed: 0,float,int,string
0,1.5,0,a
1,1.7,1,b
3,2.4,1,d
5,3.2,3,f


In [25]:
deleteRecordDf_inplace = pd.DataFrame(data)
deleteRecordDf_inplace.drop([2,4], inplace=True)
deleteRecordDf_inplace

Unnamed: 0,float,int,string
0,1.5,0,a
1,1.7,1,b
3,2.4,1,d
5,3.2,3,f


# transpose

In [26]:
transposedDf = pd.DataFrame(data)
transposedDf.T

Unnamed: 0,0,1,2,3,4,5
float,1.5,1.7,3.6,2.4,2.9,3.2
int,0,1,2,1,2,3
string,a,b,c,d,e,f


# sub dataframe

In [27]:
# Combine two partial column slices (rows 1..4 of 'int', rows 0..2 of 'float').
# The resulting frame is indexed by the union of both slices' labels, with NaN
# wherever one slice has no value for a label.
subDfByDict = {"subInt" : df.int[1:5], "subFloat" : df.float[:-3]}
pd.DataFrame(subDfByDict)

Unnamed: 0,subFloat,subInt
0,1.5,
1,1.7,1.0
2,3.6,2.0
3,,1.0
4,,2.0


In [28]:
dfWithIndex.loc["two":"five", "float":"string"]

Unnamed: 0,float,int,string
two,1.7,1,b
three,3.6,2,c
four,2.4,1,d
five,2.9,2,e


In [29]:
# Take all rows and the first three columns by position, then keep only the
# rows whose index label is even via a boolean mask.
df.iloc[:, :3][df.index % 2 == 0]

Unnamed: 0,float,int,string
0,1.5,0,a
2,3.6,2,c
4,2.9,2,e


# get value array

In [30]:
df.values

array([[1.5, 0, 'a'],
       [1.7, 1, 'b'],
       [3.6, 2, 'c'],
       [2.4, 1, 'd'],
       [2.9, 2, 'e'],
       [3.2, 3, 'f']], dtype=object)

# get index

In [31]:
dfWithIndex.index

Index(['one', 'two', 'three', 'four', 'five', 'six'], dtype='object')

# get max index

In [32]:
# idxmax() reports, for every column, the row label holding the largest value.
maxIndexDf = pd.DataFrame(
    data=[
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
    ]
)
maxIndexDf.idxmax()

0    2
1    2
2    2
dtype: int64

# reindex

In [33]:
# A 3x3 frame holding 0..8. Reindexing the columns keeps 'Texas' and
# 'California', drops 'Ohio', and introduces the unknown 'Utah' as all-NaN.
column_reindex = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=list('acd'),
    columns=['Ohio', 'Texas', 'California'],
)
print(column_reindex)
print("-------------------")
print(column_reindex.reindex(columns=['Texas', 'Utah', 'California']))

   Ohio  Texas  California
a     0      1           2
c     3      4           5
d     6      7           8
-------------------
   Texas  Utah  California
a      1   NaN           2
c      4   NaN           5
d      7   NaN           8


In [34]:
# Reindex rows and columns at once: keep rows 1, 3, 5 and the listed columns.
# A requested column that does not exist ('not_exist') is filled with NaN.
df.reindex(index=[1,3,5], columns=["float", "string", "not_exist"])

Unnamed: 0,float,string,not_exist
1,1.7,b,
3,2.4,d,
5,3.2,f,


# add

In [35]:
# Arithmetic between frames aligns on the index: only labels present in both
# operands ('b', 'c') produce a value, the rest become NaN.
addDf_0_0 = pd.DataFrame([1, 2, 3], index=list("abc"))
addDf_0_1 = pd.DataFrame([20, 30, 40], index=list("bcd"))
addDf_0_0 + addDf_0_1

Unnamed: 0,0
a,
b,22.0
c,33.0
d,


In [36]:
# Same operands as the previous example, but .add(..., fill_value=0) treats a
# label missing from one side as 0, so 'a' and 'd' keep their one-sided value
# instead of turning into NaN.
addDf_1_0 = pd.DataFrame([1,2,3], index=["a", "b", "c"])
addDf_1_1 = pd.DataFrame([20,30,40], index=["b", "c", "d"])
addDf_1_0.add(addDf_1_1, fill_value=0)

Unnamed: 0,0
a,1.0
b,22.0
c,33.0
d,40.0


# sum

In [37]:
sumDf = pd.DataFrame([[1,2,3], [4,None,6], [7,8,9]])
sumDf

Unnamed: 0,0,1,2
0,1,2.0,3
1,4,,6
2,7,8.0,9


In [38]:
sumDf.sum()

0    12.0
1    10.0
2    18.0
dtype: float64

In [39]:
# With skipna=False a NaN is not ignored: any column containing one sums to NaN.
sumDf.sum(skipna=False)

0    12.0
1     NaN
2    18.0
dtype: float64

# accumulation

In [40]:
accumulationDf = pd.DataFrame([[1,2,3], [4,5,6], [7,8,9]])
accumulationDf.cumsum()

Unnamed: 0,0,1,2
0,1,2,3
1,5,7,9
2,12,15,18


# describe

In [41]:
describeDf = pd.DataFrame([[1,2,3], [4,5,6], [7,8,9]])
describeDf.describe()

Unnamed: 0,0,1,2
count,3.0,3.0,3.0
mean,4.0,5.0,6.0
std,3.0,3.0,3.0
min,1.0,2.0,3.0
25%,2.5,3.5,4.5
50%,4.0,5.0,6.0
75%,5.5,6.5,7.5
max,7.0,8.0,9.0


# apply function

In [42]:
applyFunctionDf = pd.DataFrame([[1,2,3], [4,5,6], [7,8,9]])
applyFunctionDf

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [43]:
applyFunctionDf.apply(lambda x : x.max())

0    7
1    8
2    9
dtype: int64

In [44]:
applyFunctionDf.apply(lambda x : x.max(), axis='columns')

0    3
1    6
2    9
dtype: int64

In [45]:
# When the applied function returns a Series, the per-column results are
# assembled into a DataFrame whose rows are the Series labels ('min', 'max').
applyFunctionDf.apply(lambda x : pd.Series([x.min(), x.max()], index=['min', 'max']))

Unnamed: 0,0,1,2
min,1,2,3
max,7,8,9


# apply map

In [46]:
applyMapDf = pd.DataFrame([[1,2,3], [4,5,6], [7,8,9]])
# Apply a function to every single element of the frame.
# `DataFrame.applymap` was renamed to `DataFrame.map` in pandas 2.1 and the old
# name is deprecated, so prefer `map` when this pandas provides it and fall
# back to `applymap` on older versions.
applyElementwise = getattr(applyMapDf, "map", None) or applyMapDf.applymap
applyElementwise(lambda x : x * 10)

Unnamed: 0,0,1,2
0,10,20,30
1,40,50,60
2,70,80,90


# rank

In [47]:
rankDf = pd.DataFrame([[1,2,3], [4,5,6], [7,8,9]])

In [48]:
rankDf.rank()

Unnamed: 0,0,1,2
0,1.0,1.0,1.0
1,2.0,2.0,2.0
2,3.0,3.0,3.0


In [49]:
rankDf.rank(axis='columns')

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,1.0,2.0,3.0
2,1.0,2.0,3.0


# generate datetime index

In [50]:
# Seven consecutive calendar days starting on 2000-01-01 (daily frequency is
# the default when only a start and a number of periods are given).
dates = pd.date_range(start='1/1/2000', periods=7)
dates

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07'],
              dtype='datetime64[ns]', freq='D')

# drop NaN

In [51]:
# Sample frame with a complete row, two partially missing rows and one row
# that is entirely NaN, used by the dropna() examples below.
# Note: use the canonical `np.nan`; the `np.NaN` alias was removed in NumPy 2.0.
dropNanDf = pd.DataFrame([
    [1.1, 2.2, 3.3],
    [4.4, np.nan, np.nan],
    [np.nan, np.nan, np.nan],
    [np.nan, 11.11, 12.12]
])
dropNanDf

Unnamed: 0,0,1,2
0,1.1,2.2,3.3
1,4.4,,
2,,,
3,,11.11,12.12


In [52]:
dropNanDf.dropna()

Unnamed: 0,0,1,2
0,1.1,2.2,3.3


In [53]:
dropNanDf.dropna(how='all')

Unnamed: 0,0,1,2
0,1.1,2.2,3.3
1,4.4,,
3,,11.11,12.12


In [54]:
# thresh=2 keeps only the rows that have at least two non-NaN values.
dropNanDf.dropna(thresh=2)

Unnamed: 0,0,1,2
0,1.1,2.2,3.3
3,,11.11,12.12


# fill NaN

In [55]:
# Sample frame with scattered NaN values, used by the fillna() examples below.
# Note: use the canonical `np.nan`; the `np.NaN` alias was removed in NumPy 2.0.
fillNanDf = pd.DataFrame([
    [1.1, 2.2, 3.3],
    [4.4, np.nan, np.nan],
    [np.nan, np.nan, np.nan],
    [np.nan, 11.11, 12.12]
])
fillNanDf

Unnamed: 0,0,1,2
0,1.1,2.2,3.3
1,4.4,,
2,,,
3,,11.11,12.12


In [56]:
fillNanDf.fillna(-1)

Unnamed: 0,0,1,2
0,1.1,2.2,3.3
1,4.4,-1.0,-1.0
2,-1.0,-1.0,-1.0
3,-1.0,11.11,12.12


In [57]:
# A dict gives each column its own replacement value (keys are column labels).
fillNanDf.fillna({0: - 100, 1: -200, 2: -300})

Unnamed: 0,0,1,2
0,1.1,2.2,3.3
1,4.4,-200.0,-300.0
2,-100.0,-200.0,-300.0
3,-100.0,11.11,12.12


In [58]:
# Forward-fill: every NaN takes the last valid value above it in its column.
# `fillna(method=...)` is deprecated since pandas 2.1; `ffill()` is the
# dedicated method and produces the same result.
fillNanDf.ffill()

Unnamed: 0,0,1,2
0,1.1,2.2,3.3
1,4.4,2.2,3.3
2,4.4,2.2,3.3
3,4.4,11.11,12.12


In [59]:
# Forward-fill, but propagate each valid value across at most one consecutive
# NaN (limit=1); longer gaps stay partially unfilled.
# `fillna(method=...)` is deprecated since pandas 2.1; `ffill()` replaces it.
fillNanDf.ffill(limit=1)

Unnamed: 0,0,1,2
0,1.1,2.2,3.3
1,4.4,2.2,3.3
2,4.4,,
3,,11.11,12.12


In [60]:
# remove duplicates
# Sample frame in which the last row (two, 4) repeats the row just before it.
removeDuplicateDf = pd.DataFrame(
    data={
        'col_0': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
        'col_1': [1, 1, 2, 3, 3, 4, 4],
    }
)
removeDuplicateDf

Unnamed: 0,col_0,col_1
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [61]:
removeDuplicateDf.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [62]:
removeDuplicateDf.drop_duplicates()

Unnamed: 0,col_0,col_1
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [63]:
removeDuplicateDf.drop_duplicates(['col_0'])

Unnamed: 0,col_0,col_1
0,one,1
1,two,1


In [64]:
# Deduplicate on 'col_0' only; keep='last' retains the final occurrence of
# each value instead of the first one.
removeDuplicateDf.drop_duplicates(['col_0'], keep='last')

Unnamed: 0,col_0,col_1
4,one,3
6,two,4


# mapping data

In [65]:
mappingDataDf = pd.DataFrame({
    'col_0': ['one', 'two'] * 3 + ['two'],
    'col_1': [1, 1, 2, 3, 3, 4, 4]
})
mappingDataDf

Unnamed: 0,col_0,col_1
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [66]:
# Lookup table translating the values of 'col_0' into their replacement strings.
mappingData = dict(one='111', two='222')
mappingData

{'one': '111', 'two': '222'}

In [67]:
mappingDataDf['col_2'] = mappingDataDf['col_0'].map(mappingData)
mappingDataDf

Unnamed: 0,col_0,col_1,col_2
0,one,1,111
1,two,1,222
2,one,2,111
3,two,3,222
4,one,3,111
5,two,4,222
6,two,4,222


In [68]:
mappingDataDf['col_3'] = mappingDataDf['col_1'].map(lambda x : x * 10)
mappingDataDf

Unnamed: 0,col_0,col_1,col_2,col_3
0,one,1,111,10
1,two,1,222,10
2,one,2,111,20
3,two,3,222,30
4,one,3,111,30
5,two,4,222,40
6,two,4,222,40


# replace data

In [69]:
# Three columns of three consecutive integers, each column starting one higher
# than the previous one (col_0: 0..2, col_1: 1..3, col_2: 2..4).
replaceDataDf = pd.DataFrame(
    {
        'col_{}'.format(offset): list(range(offset, offset + 3))
        for offset in range(3)
    }
)
replaceDataDf

Unnamed: 0,col_0,col_1,col_2
0,0,1,2
1,1,2,3
2,2,3,4


In [70]:
# A dict maps old values to new ones across the whole frame: every 1 becomes
# NaN and every 2 becomes 999. The columns become float because of the NaN.
replaceDataDf.replace({1: np.nan, 2: 999})

Unnamed: 0,col_0,col_1,col_2
0,0.0,,999.0
1,,999.0,3.0
2,999.0,3.0,4.0


# processing outliers

In [71]:
# Seed NumPy's global generator so the 1000x4 standard-normal sample, and the
# outlier rows derived from it, are identical on every Restart & Run All.
np.random.seed(12345)
processingOutliersDf = pd.DataFrame(np.random.randn(1000, 4))
processingOutliersDf.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.034921,-0.017842,0.014417,0.038461
std,1.031438,1.022711,0.980768,1.004684
min,-3.993676,-3.288373,-2.892725,-3.477569
25%,-0.643743,-0.705336,-0.632616,-0.636609
50%,0.033528,-0.017096,0.018789,0.040806
75%,0.695115,0.68616,0.630422,0.709802
max,3.550214,3.150908,3.276822,4.361362


In [72]:
processingOutlierColDf = processingOutliersDf[0]
processingOutlierColDf[np.abs(processingOutlierColDf) > 3]

31    -3.058485
381   -3.014617
542   -3.343107
775    3.073383
816   -3.034427
874   -3.993676
927   -3.122839
956    3.550214
Name: 0, dtype: float64

In [73]:
# Select every row that contains at least one value with magnitude above 3.
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
processingOutliersDf[(np.abs(processingOutliersDf) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
31,-3.058485,-0.069171,0.073978,-0.347657
320,-0.889191,-3.288373,0.567122,-1.119371
381,-3.014617,-0.337616,0.134533,-0.095705
414,-0.43945,-1.744088,3.042223,0.384761
437,1.173877,-1.151593,-0.184127,-3.053476
468,-0.436459,-0.505213,3.276822,0.201469
477,-1.665472,-1.245927,0.265301,-3.477569
542,-3.343107,-2.265821,-1.210241,1.396967
676,-1.378098,1.710112,-1.137601,4.361362
696,0.601909,1.023248,2.363619,-3.057646


In [74]:
processingOutliersDf[np.abs(processingOutliersDf) > 3]

Unnamed: 0,0,1,2,3
0,,,,
1,,,,
2,,,,
3,,,,
4,,,,
5,,,,
6,,,,
7,,,,
8,,,,
9,,,,


In [75]:
# Cap the outliers: every value whose magnitude exceeds 3 is overwritten with
# +3 or -3, matching its sign. This modifies processingOutliersDf in place, so
# earlier displays of the frame no longer reflect its contents.
processingOutliersDf[np.abs(processingOutliersDf) > 3] = np.sign(processingOutliersDf) * 3
processingOutliersDf

Unnamed: 0,0,1,2,3
0,0.508033,0.526342,-0.763859,-0.345797
1,1.017197,-0.117751,-0.731588,-0.659518
2,2.034641,-0.106188,1.257125,0.409174
3,-0.815705,-1.979807,0.340740,0.505188
4,-1.273810,-0.727365,0.193305,1.166887
5,0.700998,0.140015,0.915945,-1.322417
6,-1.142149,-1.684615,-1.158275,1.324193
7,-0.442781,-1.728015,0.687426,-1.402684
8,-1.235615,1.039554,-1.724768,0.822804
9,0.452813,-0.096094,1.285378,1.750083


# sampling data

In [76]:
# 5x4 frame filled row by row with the integers 0..19.
samplingDataDf = pd.DataFrame(data=np.arange(20).reshape(5, 4))
samplingDataDf

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [77]:
samplingDataSampler = np.random.permutation(5)
samplingDataSampler

array([3, 0, 2, 1, 4])

In [78]:
# take() picks rows by integer position, so passing the random permutation
# returns all rows in shuffled order.
samplingDataDf.take(samplingDataSampler)

Unnamed: 0,0,1,2,3
3,12,13,14,15
0,0,1,2,3
2,8,9,10,11
1,4,5,6,7
4,16,17,18,19


In [79]:
samplingDataDf.sample(n=3)

Unnamed: 0,0,1,2,3
3,12,13,14,15
1,4,5,6,7
4,16,17,18,19


# dummy variable

In [80]:
dummyVariableDf = pd.DataFrame({'col_0': ['b', 'b', 'a', 'c', 'a', 'b']})
dummyVariableDf

Unnamed: 0,col_0
0,b
1,b
2,a
3,c
4,a
5,b


In [81]:
pd.get_dummies(dummyVariableDf['col_0'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [82]:
dummyVariable = np.random.rand(10)
dummyVariable

array([0.88026046, 0.76808572, 0.84455519, 0.47158661, 0.54280147,
       0.55388574, 0.95287893, 0.9181998 , 0.53634656, 0.01008164])

In [83]:
dummyVariableBins = [0, 0.2, 0.4, 0.6, 0.8, 1]
dummyVariableBins

[0, 0.2, 0.4, 0.6, 0.8, 1]

In [84]:
# Bin each value into one of the five intervals defined by the bin edges, then
# one-hot encode the bin membership (one indicator column per interval).
pd.get_dummies(pd.cut(dummyVariable, dummyVariableBins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,0,0,1,0
2,0,0,0,0,1
3,0,0,1,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,0,1
8,0,0,1,0,0
9,1,0,0,0,0


# string function

In [85]:
stringFunctionDf = pd.DataFrame({
    'col_0': ['abc', 'xyz', '123']
})
stringFunctionDf

Unnamed: 0,col_0
0,abc
1,xyz
2,123


In [86]:
stringFunctionDf['col_0'].str.contains('xy')

0    False
1     True
2    False
Name: col_0, dtype: bool

In [87]:
stringFunctionDf['col_0'].str.findall('[a1]', flags=re.IGNORECASE)

0    [a]
1     []
2    [1]
Name: col_0, dtype: object

In [88]:
# str.match tests the pattern at the start of each string (like re.match), so
# only values beginning with 'a' or '1' (case-insensitively) are True.
stringFunctionDf['col_0'].str.match('[a1]', flags=re.IGNORECASE)

0     True
1    False
2     True
Name: col_0, dtype: bool

# hierarchical indexing

In [89]:
# Frame with a two-level row index (letter, number) and a two-level column
# index (group, column) over a 4x3 block holding 0..11.
hierarchicalIndexing = pd.DataFrame(
    data=np.arange(12).reshape(4, 3),
    index=[list('aabb'), [1, 2] * 2],
    columns=[['col_A'] * 2 + ['col_B'], ['col_0', 'col_1', 'col_2']],
)
hierarchicalIndexing

Unnamed: 0_level_0,Unnamed: 1_level_0,col_A,col_A,col_B
Unnamed: 0_level_1,Unnamed: 1_level_1,col_0,col_1,col_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [90]:
# Read one cell with a single label-based access: row ('a', 2) and column
# ('col_A', 'col_0'). This replaces four chained [] lookups, each of which
# builds an intermediate object and hides which axis is being indexed.
hierarchicalIndexing.loc[("a", 2), ("col_A", "col_0")]

3

In [91]:
# Name the levels of the row index and of the column index so that later
# operations can refer to a level by name instead of by position.
hierarchicalIndexing.index.names = ['idx_0', 'idx_1']
hierarchicalIndexing.columns.names = ['c_0', 'c_1']
hierarchicalIndexing

Unnamed: 0_level_0,c_0,col_A,col_A,col_B
Unnamed: 0_level_1,c_1,col_0,col_1,col_2
idx_0,idx_1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [92]:
hierarchicalIndexing.swaplevel("idx_0", "idx_1")

Unnamed: 0_level_0,c_0,col_A,col_A,col_B
Unnamed: 0_level_1,c_1,col_0,col_1,col_2
idx_1,idx_0,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [93]:
hierarchicalIndexing.sort_index(level=1)

Unnamed: 0_level_0,c_0,col_A,col_A,col_B
Unnamed: 0_level_1,c_1,col_0,col_1,col_2
idx_0,idx_1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [94]:
# Total the rows within each 'idx_0' group. `DataFrame.sum(level=...)` was
# deprecated in pandas 1.3 and removed in 2.0; grouping on the index level is
# the equivalent, supported spelling.
hierarchicalIndexing.groupby(level="idx_0").sum()

c_0,col_A,col_A,col_B
c_1,col_0,col_1,col_2
idx_0,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
a,3,5,7
b,15,17,19


# indexing with columns

In [95]:
# Sample frame whose 'c' and 'd' columns together identify each row; they are
# promoted to a two-level index in the following examples.
indexingWithColumns = pd.DataFrame(
    data={
        'a': range(7),
        'b': range(7, 0, -1),
        'c': ['one'] * 3 + ['two'] * 4,
        'd': list(range(3)) + list(range(4)),
    }
)
indexingWithColumns

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [96]:
indexingWithColumns.set_index(['c', 'd'])

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [97]:
# drop=False promotes 'c' and 'd' to the index but also keeps them as regular
# columns.
indexingWithColumns.set_index(['c', 'd'], drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


# reset index

In [98]:
# Frame with two-level row and column indexes over the values 0..11; the row
# index levels are unnamed on purpose (reset_index names them level_0/level_1).
resetIndex = pd.DataFrame(
    data=np.arange(12).reshape(4, 3),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=[['col_A', 'col_A', 'col_B'], ['col_0', 'col_1', 'col_2']],
)
resetIndex

Unnamed: 0_level_0,Unnamed: 1_level_0,col_A,col_A,col_B
Unnamed: 0_level_1,Unnamed: 1_level_1,col_0,col_1,col_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [99]:
resetIndexResult = resetIndex.reset_index()
resetIndexResult

Unnamed: 0_level_0,level_0,level_1,col_A,col_A,col_B
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,col_0,col_1,col_2
0,a,1,0,1,2
1,a,2,3,4,5
2,b,1,6,7,8
3,b,2,9,10,11


# join

In [118]:
# Left-hand frame for the merge examples: a repeating 'key' column plus a
# running 'data' counter 0..6.
joinDf_0 = pd.DataFrame(
    data={
        'key': list('bbacaab'),
        'data': range(7),
    }
)
joinDf_0

Unnamed: 0,data,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,a
6,6,b


In [119]:
joinDf_1 = pd.DataFrame({
    'key': ['a', 'b', 'd'],
    'data': range(3)
})
joinDf_1

Unnamed: 0,data,key
0,0,a
1,1,b
2,2,d


In [121]:
# Outer join on the shared 'key' column: keys found on only one side ('c' on
# the left, 'd' on the right) are kept, with NaN for the other side's data.
# Both frames have a 'data' column, so the suffixes tell the two apart.
pd.merge(joinDf_0, joinDf_1, on="key", suffixes=('_left', '_right'), how="outer")

Unnamed: 0,data_left,key,data_right
0,0.0,b,1.0
1,1.0,b,1.0
2,6.0,b,1.0
3,2.0,a,0.0
4,4.0,a,0.0
5,5.0,a,0.0
6,3.0,c,
7,,d,2.0


In [109]:
joinDf_2 = pd.DataFrame({
    'key_2': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
    'data_2': range(7)
})
joinDf_2

Unnamed: 0,data_2,key_2
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,a
6,6,b


In [110]:
joinDf_3 = pd.DataFrame({
    'key_3': ['a', 'b', 'd'],
    'data_3': range(3)
})
joinDf_3

Unnamed: 0,data_3,key_3
0,0,a
1,1,b
2,2,d


In [112]:
pd.merge(joinDf_2, joinDf_3, left_on='key_2', right_on='key_3', how="outer")

Unnamed: 0,data_2,key_2,data_3,key_3
0,0.0,b,1.0,b
1,1.0,b,1.0,b
2,6.0,b,1.0,b
3,2.0,a,0.0,a
4,4.0,a,0.0,a
5,5.0,a,0.0,a
6,3.0,c,,
7,,,2.0,d


In [113]:
# Left-hand frame for the two-column-key merge: (key_4_0, key_4_1) together
# identify a row, and 'val_4' is its payload.
joinDf_4 = pd.DataFrame(
    data={
        'key_4_0': list('aab'),
        'key_4_1': [1, 2, 1],
        'val_4': list('ABC'),
    }
)
joinDf_4

Unnamed: 0,key_4_0,key_4_1,val_4
0,a,1,A
1,a,2,B
2,b,1,C


In [114]:
joinDf_5 = pd.DataFrame({
    'key_5_0': ['a', 'a', 'b', 'b'],
    'key_5_1': [1, 1, 1, 2],
    'val_5': ['D', 'E', 'F', 'G']
})
joinDf_5

Unnamed: 0,key_5_0,key_5_1,val_5
0,a,1,D
1,a,1,E
2,b,1,F
3,b,2,G


In [117]:
pd.merge(joinDf_4, joinDf_5, left_on=['key_4_0', 'key_4_1'], right_on=['key_5_0', 'key_5_1'], how='outer')

Unnamed: 0,key_4_0,key_4_1,val_4,key_5_0,key_5_1,val_5
0,a,1.0,A,a,1.0,D
1,a,1.0,A,a,1.0,E
2,a,2.0,B,,,
3,b,1.0,C,b,1.0,F
4,,,,b,2.0,G


In [122]:
joinDf_6 = pd.DataFrame({
    'key_6': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
    'data_6': range(7)
})
joinDf_6

Unnamed: 0,data_6,key_6
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,a
6,6,b


In [124]:
# Right-hand frame whose join key lives in the index ('a', 'b', 'd') rather
# than in a column.
joinDf_7 = pd.DataFrame(
    data={'data_7': range(3)},
    index=['a', 'b', 'd'],
)
joinDf_7

Unnamed: 0,data_7
a,0
b,1
d,2


In [127]:
# Outer join matching the left frame's 'key_6' column against the right
# frame's index (right_index=True) instead of against a column.
pd.merge(joinDf_6, joinDf_7, left_on='key_6', right_index=True, how='outer')

Unnamed: 0,data_6,key_6,data_7
0,0.0,b,1.0
1,1.0,b,1.0
6,6.0,b,1.0
2,2.0,a,0.0
4,4.0,a,0.0
5,5.0,a,0.0
3,3.0,c,
6,,d,2.0


In [128]:
joinDf_8 = pd.DataFrame({
    'key_8_0': ['a', 'a', 'b'],
    'key_8_1': [1, 2, 1],
    'val_8': ['A', 'B', 'C']
})
joinDf_8

Unnamed: 0,key_8_0,key_8_1,val_8
0,a,1,A
1,a,2,B
2,b,1,C


In [130]:
joinDf_9 = pd.DataFrame(
    data={
        'val_9': ['D', 'E', 'F', 'G']
    },
    index=[
        ['a', 'a', 'b', 'b'],
        [1, 1, 1, 2]
    ]
)
joinDf_9

Unnamed: 0,Unnamed: 1,val_9
a,1,D
a,1,E
b,1,F
b,2,G


In [137]:
# Outer join matching two left-hand key columns against the two levels of the
# right frame's MultiIndex; the key columns are listed in level order.
pd.merge(joinDf_8, joinDf_9, left_on=['key_8_0', 'key_8_1'], right_index=True, how="outer")

Unnamed: 0,key_8_0,key_8_1,val_8,val_9
0,a,1,A,D
0,a,1,A,E
1,a,2,B,
2,b,1,C,F
2,b,2,,G


In [135]:
joinDf_10 = pd.DataFrame(
    data={
        'val_10': ['A', 'B', 'C']
    },
    index=[
        ['a', 'a', 'b'],
        [1, 2, 1]
    ]
)
joinDf_10

Unnamed: 0,Unnamed: 1,val_10
a,1,A
a,2,B
b,1,C


In [136]:
joinDf_11 = pd.DataFrame(
    data={
        'val_11': ['D', 'E', 'F', 'G']
    },
    index=[
        ['a', 'a', 'b', 'b'],
        [1, 1, 1, 2]
    ]
)
joinDf_11

Unnamed: 0,Unnamed: 1,val_11
a,1,D
a,1,E
b,1,F
b,2,G


In [138]:
# Outer join on the indexes of both frames: rows are matched by their
# (letter, number) index labels and no key column is involved.
pd.merge(joinDf_10, joinDf_11, left_index=True, right_index=True, how="outer")

Unnamed: 0,Unnamed: 1,val_10,val_11
a,1,A,D
a,1,A,E
a,2,B,
b,1,C,F
b,2,,G


# concatenate

In [139]:
# First operand for the concat examples: 3x2 frame holding 0..5, rows a/b/c.
concateDf_0 = pd.DataFrame(
    np.arange(6).reshape(3, 2),
    index=list('abc'),
    columns=['one', 'two'],
)
concateDf_0

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [140]:
# Second operand: 2x2 frame holding 5..8 whose rows ('a', 'c') are only a
# subset of the first operand's rows.
concateDf_1 = pd.DataFrame(
    np.arange(4).reshape(2, 2) + 5,
    index=list('ac'),
    columns=['three', 'four'],
)
concateDf_1

Unnamed: 0,three,four
a,5,6
c,7,8


In [142]:
# Concatenate side by side (axis=1), aligning rows on the index; row 'b' is
# absent from the second frame and therefore NaN there. `keys` adds an outer
# column level recording which input frame each column came from.
pd.concat([concateDf_0, concateDf_1], axis=1, keys=['df_0', 'df_1'])

Unnamed: 0_level_0,df_0,df_0,df_1,df_1
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [143]:
concateDf_2 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
concateDf_2

Unnamed: 0,a,b,c,d
0,2.304217,-2.472724,0.065083,-0.198273
1,-1.186353,0.726974,0.761043,1.531056
2,1.703795,-0.04221,-1.078099,0.182274


In [144]:
concateDf_3 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
concateDf_3

Unnamed: 0,b,d,a
0,-0.649204,0.030192,2.320603
1,-0.416082,-0.048962,-0.185832


In [147]:
# Stack the frames vertically, matching columns by name (the second frame has
# no 'c', so it is NaN there). ignore_index=True discards the original row
# labels and renumbers the result 0..n-1.
pd.concat([concateDf_2, concateDf_3], ignore_index=True)

Unnamed: 0,a,b,c,d
0,2.304217,-2.472724,0.065083,-0.198273
1,-1.186353,0.726974,0.761043,1.531056
2,1.703795,-0.04221,-1.078099,0.182274
3,2.320603,-0.649204,,0.030192
4,-0.185832,-0.416082,,-0.048962
