## Table of Content
1. Concatenate
2. Merging and Joining
3. Reshaping
4. Pivot Table
5. Duplicate
6. Map and Replace
7. Groupby in Pandas
8. Summary Statistics

In [1]:
import pandas as pd
import numpy as np

# Series

In [2]:
x = np.linspace(0,20,11,dtype=int)
y = np.linspace(1,21,11,dtype=int)

In [3]:
x, y

(array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20]),
 array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21]))

In [4]:
x_series = pd.Series(x)
y_series = pd.Series(y)

pd.concat([x_series, y_series])

0      0
1      2
2      4
3      6
4      8
5     10
6     12
7     14
8     16
9     18
10    20
0      1
1      3
2      5
3      7
4      9
5     11
6     13
7     15
8     17
9     19
10    21
dtype: int32

In [5]:
pd.concat([x_series, y_series], keys= ['Even','Odd'], names=['Category','Index'])

Category  Index
Even      0         0
          1         2
          2         4
          3         6
          4         8
          5        10
          6        12
          7        14
          8        16
          9        18
          10       20
Odd       0         1
          1         3
          2         5
          3         7
          4         9
          5        11
          6        13
          7        15
          8        17
          9        19
          10       21
dtype: int32

In [11]:
data = pd.concat([x_series, y_series], keys= ['Even','Odd'], names=['Category','Index'])
a = data.unstack().T

In [12]:
a

Category,Even,Odd
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,1
1,2,3
2,4,5
3,6,7
4,8,9
5,10,11
6,12,13
7,14,15
8,16,17
9,18,19


In [16]:
type(a)

pandas.core.frame.DataFrame

In [17]:
a.columns

Index(['Even', 'Odd'], dtype='object', name='Category')

In [18]:
b = data.unstack()
b

Index,0,1,2,3,4,5,6,7,8,9,10
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Even,0,2,4,6,8,10,12,14,16,18,20
Odd,1,3,5,7,9,11,13,15,17,19,21


In [19]:
b.stack().unstack(level=-2)

Category,Even,Odd
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,1
1,2,3
2,4,5
3,6,7
4,8,9
5,10,11
6,12,13
7,14,15
8,16,17
9,18,19


# Data Frames

In [20]:
df1 = pd.read_excel('example.xlsx',sheet_name=0)
df2 = pd.read_excel('example.xlsx',sheet_name=1)

In [21]:
df1.shape

(5, 4)

In [22]:
df2.shape

(6, 4)

In [23]:
df1

Unnamed: 0,Age,Gender,Salary,City_residence
0,45.0,Male,40000,
1,12.0,Male,0,Bangalore
2,,Female,150000,Bangalore
3,26.0,Male,30000,Chennai
4,64.0,Female,15000,Chennai


In [24]:
df2

Unnamed: 0,Age,Gender,Salary,City_residence
0,45,Male,88900,
1,23,,18000,Mumbai
2,67,Male,92000,Mumbai
3,34,Male,180000,Delhi
4,67,Male,92000,Mumbai
5,34,Male,180000,Delhi


In [25]:
pd.concat([df1,df2])

Unnamed: 0,Age,Gender,Salary,City_residence
0,45.0,Male,40000,
1,12.0,Male,0,Bangalore
2,,Female,150000,Bangalore
3,26.0,Male,30000,Chennai
4,64.0,Female,15000,Chennai
0,45.0,Male,88900,
1,23.0,,18000,Mumbai
2,67.0,Male,92000,Mumbai
3,34.0,Male,180000,Delhi
4,67.0,Male,92000,Mumbai


In [26]:
pd.concat([df1,df2,df1])

Unnamed: 0,Age,Gender,Salary,City_residence
0,45.0,Male,40000,
1,12.0,Male,0,Bangalore
2,,Female,150000,Bangalore
3,26.0,Male,30000,Chennai
4,64.0,Female,15000,Chennai
0,45.0,Male,88900,
1,23.0,,18000,Mumbai
2,67.0,Male,92000,Mumbai
3,34.0,Male,180000,Delhi
4,67.0,Male,92000,Mumbai


In [27]:
pd.concat([df1,df2],axis=1)   # Will concatenate matching the index

Unnamed: 0,Age,Gender,Salary,City_residence,Age.1,Gender.1,Salary.1,City_residence.1
0,45.0,Male,40000.0,,45,Male,88900,
1,12.0,Male,0.0,Bangalore,23,,18000,Mumbai
2,,Female,150000.0,Bangalore,67,Male,92000,Mumbai
3,26.0,Male,30000.0,Chennai,34,Male,180000,Delhi
4,64.0,Female,15000.0,Chennai,67,Male,92000,Mumbai
5,,,,,34,Male,180000,Delhi


In [28]:
pd.concat([df1,df2], ignore_index=True)    # will reset the index

Unnamed: 0,Age,Gender,Salary,City_residence
0,45.0,Male,40000,
1,12.0,Male,0,Bangalore
2,,Female,150000,Bangalore
3,26.0,Male,30000,Chennai
4,64.0,Female,15000,Chennai
5,45.0,Male,88900,
6,23.0,,18000,Mumbai
7,67.0,Male,92000,Mumbai
8,34.0,Male,180000,Delhi
9,67.0,Male,92000,Mumbai


In [29]:
pd.merge(df1,df2, on = 'City_residence')   # By default it is inner join

Unnamed: 0,Age_x,Gender_x,Salary_x,City_residence,Age_y,Gender_y,Salary_y
0,45.0,Male,40000,,45,Male,88900


In [30]:
pd.merge(df1,df2, on = 'City_residence', how='outer')

Unnamed: 0,Age_x,Gender_x,Salary_x,City_residence,Age_y,Gender_y,Salary_y
0,45.0,Male,40000.0,,45.0,Male,88900.0
1,12.0,Male,0.0,Bangalore,,,
2,,Female,150000.0,Bangalore,,,
3,26.0,Male,30000.0,Chennai,,,
4,64.0,Female,15000.0,Chennai,,,
5,,,,Mumbai,23.0,,18000.0
6,,,,Mumbai,67.0,Male,92000.0
7,,,,Mumbai,67.0,Male,92000.0
8,,,,Delhi,34.0,Male,180000.0
9,,,,Delhi,34.0,Male,180000.0


In [31]:
pd.merge(df1,df2, right_index=True, left_index=True)

Unnamed: 0,Age_x,Gender_x,Salary_x,City_residence_x,Age_y,Gender_y,Salary_y,City_residence_y
0,45.0,Male,40000,,45,Male,88900,
1,12.0,Male,0,Bangalore,23,,18000,Mumbai
2,,Female,150000,Bangalore,67,Male,92000,Mumbai
3,26.0,Male,30000,Chennai,34,Male,180000,Delhi
4,64.0,Female,15000,Chennai,67,Male,92000,Mumbai


## Hierarchical Indexing

In [32]:
dataframe = pd.DataFrame(
    np.arange(12).reshape((4,3)),
    index = [['Class_Test','Class_Test','Sem_Exam','Sem_Exam'],[1,2,1,2]],
    columns = [['Aria','Aria','John'],['Maths','English','Maths']]
)

In [33]:
dataframe

Unnamed: 0_level_0,Unnamed: 1_level_0,Aria,Aria,John
Unnamed: 0_level_1,Unnamed: 1_level_1,Maths,English,Maths
Class_Test,1,0,1,2
Class_Test,2,3,4,5
Sem_Exam,1,6,7,8
Sem_Exam,2,9,10,11


In [34]:
dataframe.index.names = ['Key1','Key2']
dataframe.columns.names = ['Name','Subject']

In [35]:
dataframe

Unnamed: 0_level_0,Name,Aria,Aria,John
Unnamed: 0_level_1,Subject,Maths,English,Maths
Key1,Key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Class_Test,1,0,1,2
Class_Test,2,3,4,5
Sem_Exam,1,6,7,8
Sem_Exam,2,9,10,11


In [36]:
dataframe.unstack()

Name,Aria,Aria,Aria,Aria,John,John
Subject,Maths,Maths,English,English,Maths,Maths
Key2,1,2,1,2,1,2
Key1,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
Class_Test,0,3,1,4,2,5
Sem_Exam,6,9,7,10,8,11


In [37]:
dataframe.unstack(-2)

Name,Aria,Aria,Aria,Aria,John,John
Subject,Maths,Maths,English,English,Maths,Maths
Key1,Class_Test,Sem_Exam,Class_Test,Sem_Exam,Class_Test,Sem_Exam
Key2,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
1,0,6,1,7,2,8
2,3,9,4,10,5,11


In [38]:
dataframe.stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Aria,John
Key1,Key2,Subject,Unnamed: 3_level_1,Unnamed: 4_level_1
Class_Test,1,English,1,
Class_Test,1,Maths,0,2.0
Class_Test,2,English,4,
Class_Test,2,Maths,3,5.0
Sem_Exam,1,English,7,
Sem_Exam,1,Maths,6,8.0
Sem_Exam,2,English,10,
Sem_Exam,2,Maths,9,11.0


In [39]:
dataframe.stack(-2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Subject,Maths,English
Key1,Key2,Name,Unnamed: 3_level_1,Unnamed: 4_level_1
Class_Test,1,Aria,0,1.0
Class_Test,1,John,2,
Class_Test,2,Aria,3,4.0
Class_Test,2,John,5,
Sem_Exam,1,Aria,6,7.0
Sem_Exam,1,John,8,
Sem_Exam,2,Aria,9,10.0
Sem_Exam,2,John,11,


In [40]:
dataframe

Unnamed: 0_level_0,Name,Aria,Aria,John
Unnamed: 0_level_1,Subject,Maths,English,Maths
Key1,Key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Class_Test,1,0,1,2
Class_Test,2,3,4,5
Sem_Exam,1,6,7,8
Sem_Exam,2,9,10,11


In [41]:
dataframe

Unnamed: 0_level_0,Name,Aria,Aria,John
Unnamed: 0_level_1,Subject,Maths,English,Maths
Key1,Key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Class_Test,1,0,1,2
Class_Test,2,3,4,5
Sem_Exam,1,6,7,8
Sem_Exam,2,9,10,11


In [42]:
a = dataframe.stack()

In [43]:
a

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Aria,John
Key1,Key2,Subject,Unnamed: 3_level_1,Unnamed: 4_level_1
Class_Test,1,English,1,
Class_Test,1,Maths,0,2.0
Class_Test,2,English,4,
Class_Test,2,Maths,3,5.0
Sem_Exam,1,English,7,
Sem_Exam,1,Maths,6,8.0
Sem_Exam,2,English,10,
Sem_Exam,2,Maths,9,11.0


In [44]:
a.melt(id_vars=['Aria'])

Unnamed: 0,Aria,Name,value
0,1,John,
1,0,John,2.0
2,4,John,
3,3,John,5.0
4,7,John,
5,6,John,8.0
6,10,John,
7,9,John,11.0


In [45]:
data_cust = pd.DataFrame(
    {
        "customerID" : ["101", "102", "103", "104"],
        "category" : ["Medium", "Medium", "High", "Low"],
        "first_visit" : ["yes", "no", "yes", "yes"],
        "sales": [123,52,214,663]
    },
    index=[0,1,2,3]
)

In [46]:
data_cust

Unnamed: 0,customerID,category,first_visit,sales
0,101,Medium,yes,123
1,102,Medium,no,52
2,103,High,yes,214
3,104,Low,yes,663


In [47]:
pd.pivot_table(data_cust, index=['category'],values=['sales'])   # Default aggregate function applied is mean

Unnamed: 0_level_0,sales
category,Unnamed: 1_level_1
High,214.0
Low,663.0
Medium,87.5


In [48]:
pd.pivot_table(data_cust, index=['category'],values=['sales'], aggfunc='sum')

Unnamed: 0_level_0,sales
category,Unnamed: 1_level_1
High,214
Low,663
Medium,175


In [49]:
df3 = pd.concat([df1,df2])
df3

Unnamed: 0,Age,Gender,Salary,City_residence
0,45.0,Male,40000,
1,12.0,Male,0,Bangalore
2,,Female,150000,Bangalore
3,26.0,Male,30000,Chennai
4,64.0,Female,15000,Chennai
0,45.0,Male,88900,
1,23.0,,18000,Mumbai
2,67.0,Male,92000,Mumbai
3,34.0,Male,180000,Delhi
4,67.0,Male,92000,Mumbai


In [50]:
df3.duplicated()

0    False
1    False
2    False
3    False
4    False
0    False
1    False
2    False
3    False
4     True
5     True
dtype: bool

In [51]:
df3.duplicated().value_counts()

False    9
True     2
Name: count, dtype: int64

In [52]:
df3.drop_duplicates()

Unnamed: 0,Age,Gender,Salary,City_residence
0,45.0,Male,40000,
1,12.0,Male,0,Bangalore
2,,Female,150000,Bangalore
3,26.0,Male,30000,Chennai
4,64.0,Female,15000,Chennai
0,45.0,Male,88900,
1,23.0,,18000,Mumbai
2,67.0,Male,92000,Mumbai
3,34.0,Male,180000,Delhi


In [53]:
genmap = {'Male':'M','Female':'F'}

In [54]:
df3['Gen1'] = df3['Gender'].map(genmap)

In [55]:
df3

Unnamed: 0,Age,Gender,Salary,City_residence,Gen1
0,45.0,Male,40000,,M
1,12.0,Male,0,Bangalore,M
2,,Female,150000,Bangalore,F
3,26.0,Male,30000,Chennai,M
4,64.0,Female,15000,Chennai,F
0,45.0,Male,88900,,M
1,23.0,,18000,Mumbai,
2,67.0,Male,92000,Mumbai,M
3,34.0,Male,180000,Delhi,M
4,67.0,Male,92000,Mumbai,M


In [56]:
genmap1 = {'M':'m','F':'f',np.nan:'Others'}

In [57]:
df3['Gen1'].replace(genmap1, inplace=True)

In [58]:
df3

Unnamed: 0,Age,Gender,Salary,City_residence,Gen1
0,45.0,Male,40000,,m
1,12.0,Male,0,Bangalore,m
2,,Female,150000,Bangalore,f
3,26.0,Male,30000,Chennai,m
4,64.0,Female,15000,Chennai,f
0,45.0,Male,88900,,m
1,23.0,,18000,Mumbai,Others
2,67.0,Male,92000,Mumbai,m
3,34.0,Male,180000,Delhi,m
4,67.0,Male,92000,Mumbai,m


In [59]:
df3.groupby('City_residence')['Gender'].nunique()

City_residence
Bangalore    2
Chennai      2
Delhi        1
Mumbai       1
Name: Gender, dtype: int64

In [60]:
df3.groupby('City_residence')['Gender'].nunique().to_frame()

Unnamed: 0_level_0,Gender
City_residence,Unnamed: 1_level_1
Bangalore,2
Chennai,2
Delhi,1
Mumbai,1


In [61]:
df3.groupby('City_residence')['Gender'].nunique().to_frame().reset_index()

Unnamed: 0,City_residence,Gender
0,Bangalore,2
1,Chennai,2
2,Delhi,1
3,Mumbai,1


In [62]:
df4 = df3[['Age','Salary']]

In [63]:
df4

Unnamed: 0,Age,Salary
0,45.0,40000
1,12.0,0
2,,150000
3,26.0,30000
4,64.0,15000
0,45.0,88900
1,23.0,18000
2,67.0,92000
3,34.0,180000
4,67.0,92000


In [64]:
df4.skew()

Age       0.114251
Salary    0.447557
dtype: float64

In [65]:
df4.kurtosis()

Age      -1.219850
Salary   -1.264961
dtype: float64

In [66]:
df3

Unnamed: 0,Age,Gender,Salary,City_residence,Gen1
0,45.0,Male,40000,,m
1,12.0,Male,0,Bangalore,m
2,,Female,150000,Bangalore,f
3,26.0,Male,30000,Chennai,m
4,64.0,Female,15000,Chennai,f
0,45.0,Male,88900,,m
1,23.0,,18000,Mumbai,Others
2,67.0,Male,92000,Mumbai,m
3,34.0,Male,180000,Delhi,m
4,67.0,Male,92000,Mumbai,m


In [67]:
df4

Unnamed: 0,Age,Salary
0,45.0,40000
1,12.0,0
2,,150000
3,26.0,30000
4,64.0,15000
0,45.0,88900
1,23.0,18000
2,67.0,92000
3,34.0,180000
4,67.0,92000


In [70]:
pd.crosstab(values=[df3,df4])

TypeError: crosstab() missing 2 required positional arguments: 'index' and 'columns'

In [74]:
a = np.array(["foo",  "foo",  "foo",   "foo",  "bar",  "bar",   "bar",   "bar",  "foo",   "foo",   "foo"], dtype=object)
b = np.array(["one",  "one",  "one",   "two",  "one",  "one",   "one",   "two",  "two",   "two",   "one"], dtype=object)
c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny", "shiny", "dull", "shiny", "shiny", "shiny"], dtype=object)

# form the cross tab
pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])

b,one,one,two,two
c,dull,shiny,dull,shiny
a,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,1,2,1,0
foo,2,2,1,2
