In [1]:
import pandas as pd
import numpy as np

from IPython.display import display

## Data Structures in Pandas:
1. Series
2. DataFrame
3. Panel (deprecated in pandas 0.20 and removed in 0.25; use a DataFrame with a MultiIndex instead)

### Series
1. Empty Series
2. Converting arrays, lists, dictionaries etc to series


#### 1. Empty Series

In [None]:
# Constructor signature, for reference only. It is kept as a comment because
# `data`, `index` and `dtype` are placeholders, not defined names, so running
# the line as code raises a NameError.
# pd.Series(data, index, dtype)

In [2]:
# Give the dtype explicitly: a bare pd.Series() makes pandas 1.x warn that the
# default dtype of an empty Series is changing from float64 to object, so the
# result would otherwise depend on the installed pandas version.
s = pd.Series(dtype='float64')
print(s, type(s))

Series([], dtype: float64) <class 'pandas.core.series.Series'>


  s = pd.Series()


#### 2. Converting arrays, lists, dictionaries etc to series

In [8]:
# The data contains 1.5, so it can only be stored as float. Requesting
# dtype=int was silently ignored by the pandas version that produced the
# recorded output (the result was float64 anyway), and newer pandas versions
# reject it with a ValueError. Ask for the float dtype that is actually produced.
s = pd.Series([1.5, 2, 3, 4], index=['a', 'b', 'c', 'd'], dtype=float)
print(s, type(s))

a    1.5
b    2.0
c    3.0
d    4.0
dtype: float64 <class 'pandas.core.series.Series'>


In [10]:
# Same constructor, three kinds of input: a list, a NumPy array and a dict.

# List input with explicit labels and an explicit float dtype.
s = pd.Series(data=[1, 2, 3, 4], index=list('abcd'), dtype=float)
print('Series from list ')
print(s, type(s))

# NumPy array input; no index is given, so the rows are labelled 0..3.
s = pd.Series(data=np.array([1, 2, 3, 4]))
print('\nSeries from numpy array ')
print(s, type(s))

# Dictionary input; the keys become the index labels.
d = dict(a=1, b=2, c=3)
s = pd.Series(data=d)
print('\nSeries from dictionary ')
print(s, type(s))

Series from list 
a    1.0
b    2.0
c    3.0
d    4.0
dtype: float64 <class 'pandas.core.series.Series'>

Series from numpy array 
0    1
1    2
2    3
3    4
dtype: int64 <class 'pandas.core.series.Series'>

Series from dictionary 
a    1
b    2
c    3
dtype: int64 <class 'pandas.core.series.Series'>


### DataFrame

In [None]:
#pd.DataFrame(data, index, columns, dtype)

In [None]:
# Calling the constructor without any data yields a frame with no rows and
# no columns.
df = pd.DataFrame(data=None)
print(df, df.__class__)

#### Creating a dataframe:
1. Array to dataframe
2. List to dataframe
3. Dictionary to dataframe
4. Series to dataframe

In [18]:
# Series to dataframe.
# Other Series sources that could be used here instead of the dictionary:
#   s = pd.Series([1,2,3,4], index=['a','b','c','d'], dtype = float)
#   s = pd.Series(np.array([1,2,3,4]))
d = dict(a=1, b=2, c=3)
s = pd.Series(data=d)

# The Series index becomes the row index and its values fill the column 'A'.
df = pd.DataFrame(data=s, columns=['A'])
display(df)
print(df.__class__)

Unnamed: 0,A
a,1
b,2
c,3


<class 'pandas.core.frame.DataFrame'>


In [38]:
# Alternative inputs (list of lists / 2-D NumPy array), kept for reference:
#   arr = [['Andy', 25], ['John', 30], ['Robert', 35]]
#   arr_1 = np.array([['Andy', 25], ['John', 30], ['Robert', 35]], dtype='str')
#   print(arr_1, arr_1.shape)
#   df = pd.DataFrame(arr, columns=['Name','Age'], index = ['P1', 'P2', 'P3'])
#   display(df)

# Dictionary to dataframe: each key is a column name, each value holds that
# column's data (a tuple and a list are both accepted).
d = dict(
    Show=('BB', 'BCS', 'Narcos', 'GOT'),
    Rating=[10, 20, 30, -40],
)
print(d)
df = pd.DataFrame(data=d)
display(df)

{'Show': ('BB', 'BCS', 'Narcos', 'GOT'), 'Rating': [10, 20, 30, -40]}


Unnamed: 0,Show,Rating
0,BB,10
1,BCS,20
2,Narcos,30
3,GOT,-40


### Merge, Join and Concatenate

#### 1. Merge

In [45]:
# Two frames that share the key column 'Alphabet': a-d appear in both,
# e and f only in the left frame, g only in the right frame.
left_df = pd.DataFrame({
    'Alphabet': ['a', 'b', 'c', 'd', 'e', 'f'],
    'Number_left': ['1', '2', '3', '4', '5', '6'],
    'Alpha_numeric_left': ['a1', 'b1', 'c1', 'd1', 'e5', 'f6'],
})
display(left_df)

right_df = pd.DataFrame({
    'Alphabet': ['a', 'b', 'c', 'd', 'g'],
    'Number_right': ['5', '6', '7', '8', '10'],
    'Alpha_numeric_right': ['e5', 'f6', 'g7', 'h8', 'i10'],
})
display(right_df)

# Run the same merge once per join type; only the `how` argument changes.
join_headings = [
    ('inner', 'Inner join'),
    ('left', '\nLeft join'),
    ('right', '\nRight join'),
    ('outer', '\nOuter join'),
]
for how, heading in join_headings:
    merged_df = pd.merge(left_df, right_df, on='Alphabet', how=how)
    print(heading)
    display(merged_df)

Unnamed: 0,Alphabet,Number_left,Alpha_numeric_left
0,a,1,a1
1,b,2,b1
2,c,3,c1
3,d,4,d1
4,e,5,e5
5,f,6,f6


Unnamed: 0,Alphabet,Number_right,Alpha_numeric_right
0,a,5,e5
1,b,6,f6
2,c,7,g7
3,d,8,h8
4,g,10,i10


Inner join


Unnamed: 0,Alphabet,Number_left,Alpha_numeric_left,Number_right,Alpha_numeric_right
0,a,1,a1,5,e5
1,b,2,b1,6,f6
2,c,3,c1,7,g7
3,d,4,d1,8,h8



Left join


Unnamed: 0,Alphabet,Number_left,Alpha_numeric_left,Number_right,Alpha_numeric_right
0,a,1,a1,5.0,e5
1,b,2,b1,6.0,f6
2,c,3,c1,7.0,g7
3,d,4,d1,8.0,h8
4,e,5,e5,,
5,f,6,f6,,



Right join


Unnamed: 0,Alphabet,Number_left,Alpha_numeric_left,Number_right,Alpha_numeric_right
0,a,1.0,a1,5,e5
1,b,2.0,b1,6,f6
2,c,3.0,c1,7,g7
3,d,4.0,d1,8,h8
4,g,,,10,i10



Outer join


Unnamed: 0,Alphabet,Number_left,Alpha_numeric_left,Number_right,Alpha_numeric_right
0,a,1.0,a1,5.0,e5
1,b,2.0,b1,6.0,f6
2,c,3.0,c1,7.0,g7
3,d,4.0,d1,8.0,h8
4,e,5.0,e5,,
5,f,6.0,f6,,
6,g,,,10.0,i10


In [51]:
# Three frames with the same columns but different row labels (a*, b*, c*).
df1 = pd.DataFrame(
    {'name': ['Julie'] * 3, 'age': np.arange(3) + 10},
    index=[f'a{i}' for i in range(3)],
)
print('1st dataframe')
display(df1)

df2 = pd.DataFrame(
    {'name': ['John', 'Andy'] * 2, 'age': np.arange(4) + 20},
    index=[f'b{i}' for i in range(4)],
)
print('\n2nd dataframe')
display(df2)

df3 = pd.DataFrame(
    {'name': ['Williams'] * 5, 'age': np.arange(5) + 30},
    index=[f'c{i}' for i in range(5)],
)
print('\n3rd dataframe')
display(df3)

# Stack the frames row-wise; ignore_index=True discards the a*/b*/c* labels
# and numbers the rows of the result from 0.
print('\nAfter concatenating')
stacked = pd.concat(objs=[df1, df2, df3], axis=0, ignore_index=True)
display(stacked)

1st dataframe


Unnamed: 0,name,age
a0,Julie,10
a1,Julie,11
a2,Julie,12



2nd dataframe


Unnamed: 0,name,age
b0,John,20
b1,Andy,21
b2,John,22
b3,Andy,23



3rd dataframe


Unnamed: 0,name,age
c0,Williams,30
c1,Williams,31
c2,Williams,32
c3,Williams,33
c4,Williams,34



After concatenating


Unnamed: 0,name,age
0,Julie,10
1,Julie,11
2,Julie,12
3,John,20
4,Andy,21
5,John,22
6,Andy,23
7,Williams,30
8,Williams,31
9,Williams,32


In [58]:
# Two frames that share only the 'Key' column.
df1 = pd.DataFrame({'Key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)})
print('1st dataframe')
display(df1)
df2 = pd.DataFrame({'Key': ['a', 'b', 'd'], 'data2': range(3)})
print('\n2nd dataframe')
display(df2)
# Row-wise concatenation keeps the union of the columns; cells that a frame did
# not provide are filled with NaN. ignore_index=False keeps each frame's own
# row labels, so the labels 0-2 occur twice in the result.
# (A second, near-identical heading used to be printed here; it was redundant.)
print('\nAfter concatenating')
display(pd.concat([df1, df2], axis = 0, ignore_index = False))

1st dataframe


Unnamed: 0,Key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6



2nd dataframe


Unnamed: 0,Key,data2
0,a,0
1,b,1
2,d,2



After concatenating dataframe

After concatenating


Unnamed: 0,Key,data1,data2
0,b,0.0,
1,b,1.0,
2,a,2.0,
3,c,3.0,
4,a,4.0,
5,a,5.0,
6,b,6.0,
0,a,,0.0
1,b,,1.0
2,d,,2.0


## Hands on with Real Life Data

In [62]:
# Load the car data set; the relative path is resolved against the working directory.
df_cars = pd.read_csv(filepath_or_buffer='mtcars2.csv')
# display(df_cars)  # left disabled: shows the entire frame

### Getting idea about the data

In [69]:
# Check the type of the object returned by read_csv
print(type(df_cars))

# View the number of rows and columns in the dataframe, as (rows, columns)
print(df_cars.shape)

# View only the first n records
# here n is 5
display(df_cars.head(5))

# View only the last n records
# here n is 5
display(df_cars.tail(5))

# View n randomly chosen records
# here n is 7; no random_state is passed, so the selected rows change on
# every run
display(df_cars.sample(7))

<class 'pandas.core.frame.DataFrame'>
(32, 13)


Unnamed: 0,S.No,Unnamed: 1,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,1,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,2,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,3,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,4,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,5,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


Unnamed: 0,S.No,Unnamed: 1,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
27,28,Lotus Europa,30.4,4,95.1,113,3.77,1.513,16.9,1,1,5,2
28,29,Ford Pantera L,15.8,8,351.0,264,4.22,3.17,14.5,0,1,5,4
29,30,Ferrari Dino,19.7,6,145.0,175,3.62,2.77,15.5,0,1,5,6
30,31,Maserati Bora,15.0,8,301.0,335,3.54,3.57,14.6,0,1,5,8
31,32,Volvo 142E,21.4,4,121.0,109,4.11,2.78,18.6,1,1,4,2


Unnamed: 0,S.No,Unnamed: 1,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
6,7,Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
21,22,Dodge Challenger,15.5,8,318.0,150,2.76,3.52,16.87,0,0,3,2
25,26,Fiat X1-9,27.3,4,79.0,66,4.08,1.935,,1,1,4,1
12,13,Merc 450SL,17.3,8,275.8,180,3.07,3.73,17.6,0,0,3,3
8,9,Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
30,31,Maserati Bora,15.0,8,301.0,335,3.54,3.57,14.6,0,1,5,8
26,27,Porsche 914-2,26.0,4,120.3,91,4.43,2.14,16.7,0,1,5,2


In [65]:
# Print a concise summary of the columns, including the non-null count of each.
# `show_counts` replaces the deprecated `null_counts` keyword. info() prints
# its report itself and returns None, so it is not wrapped in display().
df_cars.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   S.No        32 non-null     int64  
 1   Unnamed: 1  32 non-null     object 
 2   mpg         32 non-null     float64
 3   cyl         32 non-null     int64  
 4   disp        32 non-null     float64
 5   hp          32 non-null     int64  
 6   drat        32 non-null     float64
 7   wt          32 non-null     float64
 8   qsec        29 non-null     float64
 9   vs          32 non-null     int64  
 10  am          32 non-null     int64  
 11  gear        32 non-null     int64  
 12  carb        32 non-null     int64  
dtypes: float64(5), int64(7), object(1)
memory usage: 3.4+ KB


  df_cars.info(null_counts = True)


In [68]:
## List the column labels, then show the type of the object that holds them
column_labels = df_cars.columns
print(column_labels)
column_labels.__class__

Index(['S.No', 'Unnamed: 1', 'mpg', 'cyl', 'disp', 'hp', 'drat', 'wt', 'qsec',
       'vs', 'am', 'gear', 'carb'],
      dtype='object')


pandas.core.indexes.base.Index

In [71]:
## Select a dataframe with a subset of the columns
## (every column listed by df_cars.columns except 'Unnamed: 1')
subset_col_list = [
    'S.No',
    'mpg', 'cyl', 'disp', 'hp',
    'drat', 'wt', 'qsec',
    'vs', 'am', 'gear', 'carb',
]
# Indexing with a list of labels returns a new frame holding just those columns.
df_cars_sub = df_cars[subset_col_list]
df_cars_sub.shape

(32, 12)