Series

In [1]:
import pandas as pd 
import numpy as np

In [2]:
data = pd.Series(['a','b','c','d'])

In [3]:
data

0    a
1    b
2    c
3    d
dtype: object

In [4]:
data[1]

'b'

In [5]:
data.values

array(['a', 'b', 'c', 'd'], dtype=object)

In [6]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
num = pd.Series([0.2,0.3,0.1,0.7],index = ['a','b','c','d'])

In [8]:
num

a    0.2
b    0.3
c    0.1
d    0.7
dtype: float64

In [9]:
num['b']

0.3

In [10]:
covid_dict = {'usa':10,'india':6,'china':7,'russia':8,'brazil':9}

In [11]:
codata = pd.Series(covid_dict)

In [12]:
codata

usa       10
india      6
china      7
russia     8
brazil     9
dtype: int64

In [13]:
codata['usa':'china']

usa      10
india     6
china     7
dtype: int64

Dataframes

In [14]:
covid_area = {'usa':'texas','india':'mumbai','china':'wuham','russia':'ussr','brazil':'rio'}

In [15]:
coarea = pd.Series(covid_area)

In [16]:
codataframe = pd.DataFrame({'covid_pop':codata,'covid_location':coarea})

In [17]:
codataframe

Unnamed: 0,covid_pop,covid_location
usa,10,texas
india,6,mumbai
china,7,wuham
russia,8,ussr
brazil,9,rio


In [18]:
codataframe.index

Index(['usa', 'india', 'china', 'russia', 'brazil'], dtype='object')

In [19]:
codataframe.columns

Index(['covid_pop', 'covid_location'], dtype='object')

In [20]:
#Constructing dataframe
#1.from a single series data

In [21]:
pd.DataFrame(coarea,columns=['areas'])

Unnamed: 0,areas
usa,texas
india,mumbai
china,wuham
russia,ussr
brazil,rio


In [22]:
#2.From a numpy structured array

In [23]:
A = np.zeros(3,dtype = [('a','i8'),('b','f8')] )

In [24]:
A

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('a', '<i8'), ('b', '<f8')])

In [25]:
A = pd.DataFrame(A)

In [26]:
A

Unnamed: 0,a,b
0,0,0.0
1,0,0.0
2,0,0.0


In [27]:
#HANDLING MISSING DATA

In [28]:
#nan- NOT A NUMBER with numpy

In [29]:
import numpy as np 
1 + np.nan

nan

In [30]:
0*np.nan

nan

In [31]:
val = np.array([1,np.nan,2,5])

In [32]:
val.sum()

nan

In [33]:
val.max()

  return umr_maximum(a, axis, None, out, keepdims, initial)


nan

In [34]:
#so if you want to add the data in a array containing nan
np.nansum(val)

8.0

In [35]:
np.nanmax(val)

5.0

In [36]:
#NaN and None in pandas

In [37]:
import pandas as pd 
pd.Series([1,np.nan,2,None]) #Pandas automatically converts None to Nan values

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [38]:
x = pd.Series(range(2),dtype = int)
x

0    0
1    1
dtype: int32

In [39]:
x[0] = None
x

0    NaN
1    1.0
dtype: float64

Operation on missing values

isnull()
notnull()
dropna()
fillna()


In [40]:
#Detecting null values

In [41]:
data = pd.Series([1,np.nan,'hello',None])

In [42]:
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [43]:
data[data.notnull()]

0        1
2    hello
dtype: object

In [44]:
#Dropping nan values

In [45]:
df = pd.DataFrame([[1,np.nan,15],
                 [13,2,np.nan],
                 [5,3,9]])
df

Unnamed: 0,0,1,2
0,1,,15.0
1,13,2.0,
2,5,3.0,9.0


In [46]:
df.dropna() #drops every row that contains at least one NaN (default axis=0, how="any"); columns are kept

Unnamed: 0,0,1,2
2,5,3.0,9.0


In [47]:
 df.dropna(axis=1) #drops every column that contains at least one NaN

Unnamed: 0,0
0,1
1,13
2,5


In [48]:
df[3] = np.nan
df

Unnamed: 0,0,1,2,3
0,1,,15.0,
1,13,2.0,,
2,5,3.0,9.0,


In [49]:
df.dropna(axis=1,how="all")#drop columns which contains all nan

Unnamed: 0,0,1,2
0,1,,15.0
1,13,2.0,
2,5,3.0,9.0


In [50]:
df.dropna(axis=1,how="any")#drop columns which contains atleast one nan

Unnamed: 0,0
0,1
1,13
2,5


In [51]:
df.dropna(axis=0,thresh = 3)#thresh defines minimum number of non-null values to be kept 

Unnamed: 0,0,1,2,3
2,5,3.0,9.0,


In [52]:
#fill the nan values

In [53]:
data = pd.Series([1,np.nan,23,9,np.nan])
data

0     1.0
1     NaN
2    23.0
3     9.0
4     NaN
dtype: float64

In [54]:
data.fillna(0) #fills the nan values with 0 

0     1.0
1     0.0
2    23.0
3     9.0
4     0.0
dtype: float64

In [55]:
data.ffill() # forward fill: each NaN takes the previous valid value (replaces the deprecated fillna(method="ffill"))

0     1.0
1     1.0
2    23.0
3     9.0
4     9.0
dtype: float64

In [56]:
data.bfill() # backward fill: each NaN takes the next valid value; a trailing NaN stays NaN (replaces the deprecated fillna(method="bfill"))

0     1.0
1    23.0
2    23.0
3     9.0
4     NaN
dtype: float64

In [57]:
#For dataframes, simillar options but can use which axis also
df

Unnamed: 0,0,1,2,3
0,1,,15.0,
1,13,2.0,,
2,5,3.0,9.0,


In [58]:
df.ffill(axis=1) # forward fill along each row, left to right (replaces the deprecated fillna(method="ffill"))

Unnamed: 0,0,1,2,3
0,1.0,1.0,15.0,15.0
1,13.0,2.0,2.0,2.0
2,5.0,3.0,9.0,9.0


Hierarchical Indexing (or) Multi-Indexing

A dataset with three or more dimensions can be stored in a Series or DataFrame by giving it more than one index level; this is called hierarchical indexing (or multi-indexing). Each index level acts as one dimension here.

In [59]:
import pandas as pd
import numpy as np

In [60]:
index = [('Kerala',2000),('Kerala',2010),
        ('Tamilnadu',2000),('Tamilnadu',2010),
        ('Goa',2000),('Goa',2010)]
pops = [2198612,173728911,
      42718627,9172892718,
      1212123,1625372512]

In [61]:
pop = pd.Series(pops,index=index) #Bad way of representing data
pop

(Kerala, 2000)          2198612
(Kerala, 2010)        173728911
(Tamilnadu, 2000)      42718627
(Tamilnadu, 2010)    9172892718
(Goa, 2000)             1212123
(Goa, 2010)          1625372512
dtype: int64

In [62]:
index = pd.MultiIndex.from_tuples(index)
index

MultiIndex(levels=[['Goa', 'Kerala', 'Tamilnadu'], [2000, 2010]],
           codes=[[1, 1, 2, 2, 0, 0], [0, 1, 0, 1, 0, 1]])

In [63]:
pop = pop.reindex(index)
pop

Kerala     2000       2198612
           2010     173728911
Tamilnadu  2000      42718627
           2010    9172892718
Goa        2000       1212123
           2010    1625372512
dtype: int64

In [64]:
pop[:,2010]

Kerala        173728911
Tamilnadu    9172892718
Goa          1625372512
dtype: int64

In [65]:
#unstack() -- converts multiply indexed series into a conventionally indexed Dataframe

In [66]:
pop_df = pop.unstack()
pop_df

Unnamed: 0,2000,2010
Goa,1212123,1625372512
Kerala,2198612,173728911
Tamilnadu,42718627,9172892718


In [67]:
#stack() -- opposite of unstack(): folds the column labels back into an inner index level
#Bound to a new name so pop_df is not overwritten and this cell can be re-run safely
pop_stacked = pop_df.stack()
pop_stacked

Goa        2000       1212123
           2010    1625372512
Kerala     2000       2198612
           2010     173728911
Tamilnadu  2000      42718627
           2010    9172892718
dtype: int64

In [68]:
#Adding another colunm to multiindexed dataframe
pop_df = pd.DataFrame({'total':pop,
                      'under18':[12345,233423,
                                23412,742861,
                                12342,9328139]})
pop_df

Unnamed: 0,Unnamed: 1,total,under18
Kerala,2000,2198612,12345
Kerala,2010,173728911,233423
Tamilnadu,2000,42718627,23412
Tamilnadu,2010,9172892718,742861
Goa,2000,1212123,12342
Goa,2010,1625372512,9328139


In [69]:
#Methods of MultiIndex Creation : 
#Explicit MultiIndex constructors, MultiIndex level names, MultiIndex for columns

In [70]:
#Simple creation of MultiIndex 
data = pd.DataFrame(np.random.rand(4,2),
                   index=[['a','a','b','b'],[1,2,1,2]],
                   columns=['data1','data2'])
data

Unnamed: 0,Unnamed: 1,data1,data2
a,1,0.233744,0.174777
a,2,0.306478,0.990979
b,1,0.740409,0.223061
b,2,0.811444,0.61528


In [71]:
#Use the class method constructor 
pd.MultiIndex.from_arrays([['a','a','b','b'],[1,2,1,2]])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [72]:
pd.MultiIndex.from_tuples([('a',1),('a',2),('b',1),('b',2)])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [73]:
pd.MultiIndex.from_product([['a','b'],[1,2]])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

In [74]:
#Explicit MultiIndex constructors
#construct a MultiIndex directly from its internal encoding - levels & codes
#(codes are integer positions into the corresponding levels list)
#NOTE(review): these codes pair up as (a,1),(b,2),(a,1),(b,2), i.e. duplicated entries,
#unlike the from_* examples above; codes=[[0,0,1,1],[0,1,0,1]] would reproduce those - confirm intent
pd.MultiIndex(levels=[['a','b'],[1,2]],
             codes = [[0,1,0,1],[0,1,0,1]])

MultiIndex(levels=[['a', 'b'], [1, 2]],
           codes=[[0, 1, 0, 1], [0, 1, 0, 1]])

In [75]:
#MultiIndex Level names - specifying names to indexes.
pop.index.names = ['STATES','YEAR']
pop

STATES     YEAR
Kerala     2000       2198612
           2010     173728911
Tamilnadu  2000      42718627
           2010    9172892718
Goa        2000       1212123
           2010    1625372512
dtype: int64

In [76]:
#MultiIndex for columns - here both the row index and the column index are hierarchical

#row index: every (YEAR, VISITS) combination; note the years are strings in this example
index = pd.MultiIndex.from_product([['2000','2010'],[1,2]],
                                  names=['YEAR','VISITS'])

#column index: every (SUBJECT, TYPE) combination, TYPE varying fastest
columns = pd.MultiIndex.from_product([['Stark','Thor','Hulk'],['HR','Temp']],
                                    names=['SUBJECT','TYPE'])

#mock some data: random values in [0,1) rounded to one decimal
#NOTE(review): no seed is set in this cell, so the displayed numbers change on every run
data = np.round(np.random.rand(4,6),1)
data[:,::2]*= 10 #scale every other column (positions 0,2,4 = the HR columns) by 10
data+= 37 #shift every value up by 37

health_data = pd.DataFrame(data,index = index,columns=columns)
health_data

Unnamed: 0_level_0,SUBJECT,Stark,Stark,Thor,Thor,Hulk,Hulk
Unnamed: 0_level_1,TYPE,HR,Temp,HR,Temp,HR,Temp
YEAR,VISITS,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2000,1,41.0,37.6,40.0,37.1,42.0,37.6
2000,2,44.0,37.8,37.0,37.4,47.0,37.9
2010,1,42.0,37.2,41.0,37.5,46.0,37.4
2010,2,46.0,37.6,42.0,37.9,39.0,37.7


In [77]:
health_data['Thor']

Unnamed: 0_level_0,TYPE,HR,Temp
YEAR,VISITS,Unnamed: 2_level_1,Unnamed: 3_level_1
2000,1,40.0,37.1
2000,2,37.0,37.4
2010,1,41.0,37.5
2010,2,42.0,37.9


Indexing and Slicing a MultiIndex

In [78]:
#Multiple indexed Series 

In [79]:
pop

STATES     YEAR
Kerala     2000       2198612
           2010     173728911
Tamilnadu  2000      42718627
           2010    9172892718
Goa        2000       1212123
           2010    1625372512
dtype: int64

In [80]:
pop['Kerala',2010]

173728911

In [81]:
pop['Kerala']

YEAR
2000      2198612
2010    173728911
dtype: int64

In [82]:
#Partial slicing is available as long as Multiindex index is sorted.
pop[:,2000]


STATES
Kerala        2198612
Tamilnadu    42718627
Goa           1212123
dtype: int64

In [83]:
#Indexing and selection based on boolean masks
pop[pop> 2000000]

STATES     YEAR
Kerala     2000       2198612
           2010     173728911
Tamilnadu  2000      42718627
           2010    9172892718
Goa        2010    1625372512
dtype: int64

In [84]:
#Selection based on fancy indexing 
pop[['Kerala','Goa']]

STATES  YEAR
Kerala  2000       2198612
        2010     173728911
Goa     2000       1212123
        2010    1625372512
dtype: int64

In [85]:
#Multiple indexed DataFrames

In [86]:
health_data

Unnamed: 0_level_0,SUBJECT,Stark,Stark,Thor,Thor,Hulk,Hulk
Unnamed: 0_level_1,TYPE,HR,Temp,HR,Temp,HR,Temp
YEAR,VISITS,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2000,1,41.0,37.6,40.0,37.1,42.0,37.6
2000,2,44.0,37.8,37.0,37.4,47.0,37.9
2010,1,42.0,37.2,41.0,37.5,46.0,37.4
2010,2,46.0,37.6,42.0,37.9,39.0,37.7


In [87]:
health_data['Stark','HR']

YEAR  VISITS
2000  1         41.0
      2         44.0
2010  1         42.0
      2         46.0
Name: (Stark, HR), dtype: float64

In [88]:
health_data.iloc[:2,:2]

Unnamed: 0_level_0,SUBJECT,Stark,Stark
Unnamed: 0_level_1,TYPE,HR,Temp
YEAR,VISITS,Unnamed: 2_level_2,Unnamed: 3_level_2
2000,1,41.0,37.6
2000,2,44.0,37.8


In [89]:
idx= pd.IndexSlice
health_data.loc[idx[:,1],idx[:,'HR']]

Unnamed: 0_level_0,SUBJECT,Stark,Thor,Hulk
Unnamed: 0_level_1,TYPE,HR,HR,HR
YEAR,VISITS,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
2000,1,41.0,40.0,42.0
2010,1,42.0,41.0,46.0


In [90]:
#Rearranging Multi-Indices

In [91]:
#sorted and unsorted indices - Slicing operations will fail if the index is not sorted

In [92]:
index = pd.MultiIndex.from_product([['a','c','b'],[1,2]])
data = pd.Series(np.random.rand(6),index = index)
data.index.names= ['char','int']
data #unsorted 

char  int
a     1      0.042433
      2      0.053607
c     1      0.180310
      2      0.347467
b     1      0.625459
      2      0.621910
dtype: float64

In [93]:
# Demonstrates that label slicing fails on the unsorted MultiIndex built above.
try:
    data['a':'b'] #error when slicing a unsorted data
except KeyError as e:
    # The printed type is pandas.errors.UnsortedIndexError; it is caught by this
    # KeyError handler, so the demo reports the failure instead of aborting the cell.
    print(type(e))
    print(e)
    print('The data is not sorted')


<class 'pandas.errors.UnsortedIndexError'>
'Key length (1) was greater than MultiIndex lexsort depth (0)'
The data is not sorted


In [94]:
data = data.sort_index()
data #sorted

char  int
a     1      0.042433
      2      0.053607
b     1      0.625459
      2      0.621910
c     1      0.180310
      2      0.347467
dtype: float64

In [95]:
data['a':'b']

char  int
a     1      0.042433
      2      0.053607
b     1      0.625459
      2      0.621910
dtype: float64

In [96]:
#Index Setting and Resetting

#reset_index() --> Turns the index labels into columns
#set_index() --> columns to multi-index dataframe

In [97]:
pop #STATES & YEAR are the multi index

STATES     YEAR
Kerala     2000       2198612
           2010     173728911
Tamilnadu  2000      42718627
           2010    9172892718
Goa        2000       1212123
           2010    1625372512
dtype: int64

In [98]:
pop_flat = pop.reset_index(name='population')
pop_flat

Unnamed: 0,STATES,YEAR,population
0,Kerala,2000,2198612
1,Kerala,2010,173728911
2,Tamilnadu,2000,42718627
3,Tamilnadu,2010,9172892718
4,Goa,2000,1212123
5,Goa,2010,1625372512


In [99]:
pop_flat.set_index(['STATES','YEAR'])

Unnamed: 0_level_0,Unnamed: 1_level_0,population
STATES,YEAR,Unnamed: 2_level_1
Kerala,2000,2198612
Kerala,2010,173728911
Tamilnadu,2000,42718627
Tamilnadu,2010,9172892718
Goa,2000,1212123
Goa,2010,1625372512


In [100]:
#Data aggregation on Multi-Indices
#Pandas has built-in aggregation methods like sum(),max(),min() etc

In [101]:
health_data

Unnamed: 0_level_0,SUBJECT,Stark,Stark,Thor,Thor,Hulk,Hulk
Unnamed: 0_level_1,TYPE,HR,Temp,HR,Temp,HR,Temp
YEAR,VISITS,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2000,1,41.0,37.6,40.0,37.1,42.0,37.6
2000,2,44.0,37.8,37.0,37.4,47.0,37.9
2010,1,42.0,37.2,41.0,37.5,46.0,37.4
2010,2,46.0,37.6,42.0,37.9,39.0,37.7


In [102]:
#Mean per YEAR (averaging over the VISITS level). The level= argument of DataFrame.mean was
#deprecated in pandas 1.3 and removed in 2.0; groupby(level=...) is the supported equivalent.
data_mean = health_data.groupby(level='YEAR').mean()
data_mean

SUBJECT,Stark,Stark,Thor,Thor,Hulk,Hulk
TYPE,HR,Temp,HR,Temp,HR,Temp
YEAR,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2000,42.5,37.7,38.5,37.25,44.5,37.75
2010,44.0,37.4,41.5,37.7,42.5,37.55


In [103]:
#Mean per TYPE across subjects, i.e. along the columns. mean(axis=1, level=...) was removed in
#pandas 2.0, so group the transposed frame by the TYPE column level and transpose back.
data_mean.T.groupby(level='TYPE').mean().T

TYPE,HR,Temp
YEAR,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,41.833333,37.566667
2010,42.666667,37.55


Combining Datasets: Concat and Append

In [105]:
#Function to create a dataset
def make_df(col,ind):
    """Build a small demo DataFrame.

    Each cell holds the column label followed by the row label, e.g. 'A0'.

    col: iterable of column labels (a string such as 'ABC' yields one column per character).
    ind: iterable of row labels, also used as the index of the result.
    """
    cells = {}
    for label in col:
        cells[label] = [str(label) + str(row) for row in ind]
    return pd.DataFrame(cells, index=ind)

In [106]:
#example dataframe
make_df('ABC',range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [107]:
#Using simple pd.concat 

In [118]:
# Two Series with disjoint integer indices
ser1 = pd.Series(['A','B','C'],index =[1,2,3])
ser2 = pd.Series(['D','E','F'],index = [4,5,6])

# Show both inputs, separated by blank lines, then the heading for the result
for shown in (ser1, "\n", ser2, "\n", "Concatenated"):
    print(shown)

# Default axis=0: ser2's rows are placed below ser1's rows
_ser = pd.concat([ser1,ser2])
print(_ser)

1    A
2    B
3    C
dtype: object


4    D
5    E
6    F
dtype: object


Concatenated
1    A
2    B
3    C
4    D
5    E
6    F
dtype: object


In [112]:
pd.concat([ser1,ser2],axis = 1) #concat along columns

Unnamed: 0,0,1
1,A,
2,B,
3,C,
4,,D
5,,E
6,,F


In [127]:
df1 = make_df('AB',[1,2])
df2 = make_df('AB',[3,4])
print(df1);print("\n");print(df2)

_df = pd.concat([df1,df2],sort=False) #Sort comes with newer pd version ; mine is 24.0
print("\n")
print(_df)

    A   B
1  A1  B1
2  A2  B2


    A   B
3  A3  B3
4  A4  B4


    A   B
1  A1  B1
2  A2  B2
3  A3  B3
4  A4  B4


In [130]:
df3 = make_df('AB',[0,1])
df4 = make_df('CD',[0,1])
print(df3);print("\n");print(df4)

_df = pd.concat([df3,df4],axis=1,sort=False) #Concate along columns
print("\n")
print(_df)

    A   B
0  A0  B0
1  A1  B1


    C   D
0  C0  D0
1  C1  D1


    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1


In [131]:
#Duplicate indices

#pd.concat preserves the indices of its inputs, so the result can contain duplicate index labels!


In [132]:
x = make_df('AB',[0,1])
y = make_df('AB',[2,3])
y.index = x.index #make duplicate indices!
print(x);print('\n');print(y);print('\n');print(pd.concat([x,y]))

    A   B
0  A0  B0
1  A1  B1


    A   B
0  A2  B2
1  A3  B3


    A   B
0  A0  B0
1  A1  B1
0  A2  B2
1  A3  B3


In [133]:
#Ways to handle duplicates in pd.concat
# 1) Catching the repeats as an error
# 2) Ignoring the index
# 3) Adding MultiIndex keys

In [134]:
#1) Catching the repeats as an error - verify_integrity flag set to True will raise an exception if there are duplicate indices

try:
    pd.concat([x,y],verify_integrity=True)
except ValueError as e:
    print('ValueError: ',e)

ValueError:  Indexes have overlapping values: Int64Index([0, 1], dtype='int64')


In [135]:
#2) Ignoring the index - ignore_index flag set to True will create a new integer index for the resulting Series,
print(x);print('\n');print(y);print('\n');print(pd.concat([x,y],ignore_index= True))


    A   B
0  A0  B0
1  A1  B1


    A   B
0  A2  B2
1  A3  B3


    A   B
0  A0  B0
1  A1  B1
2  A2  B2
3  A3  B3


In [136]:
#3) Adding MultiIndex keys - hierarchically indexed series or multiply indexed DataFrame
print(x);print('\n');print(y);print('\n');print(pd.concat([x,y],keys=['x','y']))

    A   B
0  A0  B0
1  A1  B1


    A   B
0  A2  B2
1  A3  B3


      A   B
x 0  A0  B0
  1  A1  B1
y 0  A2  B2
  1  A3  B3


In [137]:
#Concatenation with joins - join & join_axes

In [138]:
df5 = make_df('ABC',[1,2])
df6 = make_df('BCD',[3,4])
print(df5);print("\n");print(df6)

_df = pd.concat([df5,df6],sort=False)
print("\n")
print(_df)

    A   B   C
1  A1  B1  C1
2  A2  B2  C2


    B   C   D
3  B3  C3  D3
4  B4  C4  D4


     A   B   C    D
1   A1  B1  C1  NaN
2   A2  B2  C2  NaN
3  NaN  B3  C3   D3
4  NaN  B4  C4   D4


In [139]:
df5 = make_df('ABC',[1,2])
df6 = make_df('BCD',[3,4])
print(df5);print("\n");print(df6)

_df = pd.concat([df5,df6],join='inner',sort=False) #join='inner' keeps only the columns present in every input (here B and C), so no NaN padding appears
print("\n")
print(_df)

    A   B   C
1  A1  B1  C1
2  A2  B2  C2


    B   C   D
3  B3  C3  D3
4  B4  C4  D4


    B   C
1  B1  C1
2  B2  C2
3  B3  C3
4  B4  C4


In [140]:
df5 = make_df('ABC',[1,2])
df6 = make_df('BCD',[3,4])
print(df5);print("\n");print(df6)

#Keep exactly df5's columns in the result. The join_axes argument of pd.concat was removed in
#pandas 1.0; reindexing the concatenated frame on df5.columns gives the same table.
_df = pd.concat([df5,df6],sort=False).reindex(columns=df5.columns)
print("\n")
print(_df)

    A   B   C
1  A1  B1  C1
2  A2  B2  C2


    B   C   D
3  B3  C3  D3
4  B4  C4  D4


     A   B   C
1   A1  B1  C1
2   A2  B2  C2
3  NaN  B3  C3
4  NaN  B4  C4


In [141]:
#The append() method

In [142]:
#DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat gives the same row-wise result
print(df1);print("\n");print(df2);print("\n");print(pd.concat([df1,df2]))

    A   B
1  A1  B1
2  A2  B2


    A   B
3  A3  B3
4  A4  B4


    A   B
1  A1  B1
2  A2  B2
3  A3  B3
4  A4  B4


Combining Datasets: MERGE AND JOIN

In [143]:
#Categories of joins - one-to-one, many-to-one,many-to-many joins

In [149]:
#one-to-one joins,

df1 = pd.DataFrame({'employee':['sai','saran','naveen','ranjith'],'group':['Developer','Designer','Marketing','Developer']})
df2 = pd.DataFrame({'employee':['saran','naveen','sai','ranjith'],'hire_date':[2008,2009,2010,2012]})
print(df1);print('\n');print(df2)

  employee      group
0      sai  Developer
1    saran   Designer
2   naveen  Marketing
3  ranjith  Developer


  employee  hire_date
0    saran       2008
1   naveen       2009
2      sai       2010
3  ranjith       2012


In [150]:
df3 = pd.merge(df1,df2) #the result of merge is a new dataframe. Merge in general discards the index, except in the special case of merges by index.
print(df3)

  employee      group  hire_date
0      sai  Developer       2010
1    saran   Designer       2008
2   naveen  Marketing       2009
3  ranjith  Developer       2012


In [151]:
#many-to-one joins

df4 = pd.DataFrame({'group':['Developer','Designer','Marketing'],'supervisor':['carly','guido','steve']})
print(df3);print('\n');print(df4)

  employee      group  hire_date
0      sai  Developer       2010
1    saran   Designer       2008
2   naveen  Marketing       2009
3  ranjith  Developer       2012


       group supervisor
0  Developer      carly
1   Designer      guido
2  Marketing      steve


In [152]:
print(pd.merge(df3,df4)) #The resulting dataframe will have additional column with the 'supervisor' information where the information is repeated in one or more locations as required by the inputs

  employee      group  hire_date supervisor
0      sai  Developer       2010      carly
1  ranjith  Developer       2012      carly
2    saran   Designer       2008      guido
3   naveen  Marketing       2009      steve


In [154]:
#Many-to-many joins - If the key in both left and right array contains duplicates, then the result is many-to-many merge.
df5 = pd.DataFrame({'group':['Developer','Developer','Designer','Designer','Marketing','Marketing'],'skills':['python','datascience','figma','photoshop','excel','spreadsheets']})
print(df1);print('\n');print(df5)

  employee      group
0      sai  Developer
1    saran   Designer
2   naveen  Marketing
3  ranjith  Developer


       group        skills
0  Developer        python
1  Developer   datascience
2   Designer         figma
3   Designer     photoshop
4  Marketing         excel
5  Marketing  spreadsheets


In [155]:
print(pd.merge(df1,df5))

  employee      group        skills
0      sai  Developer        python
1      sai  Developer   datascience
2  ranjith  Developer        python
3  ranjith  Developer   datascience
4    saran   Designer         figma
5    saran   Designer     photoshop
6   naveen  Marketing         excel
7   naveen  Marketing  spreadsheets
