## Pandas

In [1]:
import numpy as np
import pandas as pd

### You can create a series object with a list or numpy array or a dictionary

### Series with List

In [2]:
my_list = [i for i in range(0,5)]
my_index =[i for i in 'risha']

In [3]:
my_list, my_index

([0, 1, 2, 3, 4], ['r', 'i', 's', 'h', 'a'])

In [4]:
pd.Series(data=my_list, index=my_index)

r    0
i    1
s    2
h    3
a    4
dtype: int64

In [5]:
pd.Series(my_list)

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [6]:
pd.Series(my_list,my_index)

r    0
i    1
s    2
h    3
a    4
dtype: int64

### Series with NumPy array

In [7]:
array = np.arange(0,5)

In [8]:
array, my_index

(array([0, 1, 2, 3, 4]), ['r', 'i', 's', 'h', 'a'])

In [9]:
pd.Series(data = array)

0    0
1    1
2    2
3    3
4    4
dtype: int32

In [10]:
pd.Series(data = array, index = my_index)

r    0
i    1
s    2
h    3
a    4
dtype: int32

### Series with Dictionary

In [11]:
dict1 = {'one':1,'two':2,'three':3,'four':4}

In [12]:
dict1

{'one': 1, 'two': 2, 'three': 3, 'four': 4}

In [13]:
ser1 = pd.Series(dict1)
ser1

one      1
two      2
three    3
four     4
dtype: int64

## Series can hold wide variety of datatypes

In [14]:
## strings
list1 = ['a','b','c','d']
list2 = [1,2,3,4]
ser2 = pd.Series(data= list1 , index= list2)
ser2

1    a
2    b
3    c
4    d
dtype: object

In [15]:
## holding functions
list1 = [sum,min,max,type,range]
list2 = ['sum','min','max','type','range']
ser3 = pd.Series(data=list1, index=list2)
ser3

sum      <built-in function sum>
min      <built-in function min>
max      <built-in function max>
type              <class 'type'>
range            <class 'range'>
dtype: object

## Grabbing data from series

In [16]:
ser2[1]

'a'

In [17]:
dict1 = {'Mumbai':100,'Delhi':200,'Kolkata':300,'Chennai':400,'Bengaluru':500}
ser1 = pd.Series(dict1)
ser2 = pd.Series({'Mumbai':100,'Kolkata':300,'Chennai':400,'Bengaluru':500})
ser3 = pd.Series({'Mumbai':100,'Delhi':200,'Kolkata':300,'Bengaluru':500})

In [18]:
ser1['Mumbai']

100

In [19]:
ser1

Mumbai       100
Delhi        200
Kolkata      300
Chennai      400
Bengaluru    500
dtype: int64

In [20]:
ser2

Mumbai       100
Kolkata      300
Chennai      400
Bengaluru    500
dtype: int64

In [21]:
ser3

Mumbai       100
Delhi        200
Kolkata      300
Bengaluru    500
dtype: int64

### Basic operations on Series

In [22]:
ser1 +  ser2

Bengaluru    1000.0
Chennai       800.0
Delhi           NaN
Kolkata       600.0
Mumbai        200.0
dtype: float64

In [23]:
ser2 + ser3

Bengaluru    1000.0
Chennai         NaN
Delhi           NaN
Kolkata       600.0
Mumbai        200.0
dtype: float64

## Basic functions and attributes

In [24]:
ser = ser2 + ser3
ser

Bengaluru    1000.0
Chennai         NaN
Delhi           NaN
Kolkata       600.0
Mumbai        200.0
dtype: float64

In [25]:
ser.isnull()

Bengaluru    False
Chennai       True
Delhi         True
Kolkata      False
Mumbai       False
dtype: bool

In [26]:
ser.notnull()

Bengaluru     True
Chennai      False
Delhi        False
Kolkata       True
Mumbai        True
dtype: bool

In [27]:
ser.size

5

In [28]:
ser.axes

[Index(['Bengaluru', 'Chennai', 'Delhi', 'Kolkata', 'Mumbai'], dtype='object')]

In [29]:
ser.values

array([1000.,   nan,   nan,  600.,  200.])

In [30]:
ser.empty

False

In [31]:
ser.max()

1000.0

In [32]:
ser.min()

200.0

In [33]:
ser.std()

400.0

In [34]:
ser.var()

160000.0

In [35]:
ser.sum()

1800.0

In [36]:
ser.mean()

600.0

In [37]:
ser.head(2)

Bengaluru    1000.0
Chennai         NaN
dtype: float64

In [38]:
ser.tail(3)

Delhi        NaN
Kolkata    600.0
Mumbai     200.0
dtype: float64

## Dataframes

### Creating a dataframe

In [39]:
rows = 'India Australia Japan US Russia England France Germany Bhutan Fiji'.split()
cols = 'c1 c2 c3 c4 c5 c6 c7 c8 c9 c10'.split()
data = np.random.randint(0,100,100).reshape(10,10)

In [40]:
rows

['India',
 'Australia',
 'Japan',
 'US',
 'Russia',
 'England',
 'France',
 'Germany',
 'Bhutan',
 'Fiji']

In [41]:
cols

['c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9', 'c10']

In [42]:
data

array([[26, 34, 72,  3, 59, 52, 31, 20, 80, 50],
       [69, 77, 14,  8, 92, 22, 41,  9,  9, 35],
       [75, 83, 10, 81, 20, 12, 51, 69, 15, 19],
       [39, 29, 81, 82, 99,  3, 68, 42, 18, 72],
       [75, 71, 16, 78, 19, 70, 83, 51, 84, 94],
       [27,  0, 33, 60,  2,  8, 55, 33, 39, 42],
       [60, 38, 60, 85, 88, 18, 76, 55,  6, 36],
       [61, 12, 28, 45, 19, 17, 52,  8, 29, 64],
       [72, 25, 78, 96,  8, 16, 86, 35, 89, 22],
       [12, 90, 44, 50, 10, 82, 53, 55, 36, 87]])

In [43]:
df = pd.DataFrame(data=data, index=rows, columns=cols)

In [44]:
df

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10
India,26,34,72,3,59,52,31,20,80,50
Australia,69,77,14,8,92,22,41,9,9,35
Japan,75,83,10,81,20,12,51,69,15,19
US,39,29,81,82,99,3,68,42,18,72
Russia,75,71,16,78,19,70,83,51,84,94
England,27,0,33,60,2,8,55,33,39,42
France,60,38,60,85,88,18,76,55,6,36
Germany,61,12,28,45,19,17,52,8,29,64
Bhutan,72,25,78,96,8,16,86,35,89,22
Fiji,12,90,44,50,10,82,53,55,36,87


 ## Getting data out of DataFrames

### Selecting columns

In [45]:
df['c1']

India        26
Australia    69
Japan        75
US           39
Russia       75
England      27
France       60
Germany      61
Bhutan       72
Fiji         12
Name: c1, dtype: int32

In [46]:
df[['c1','c2']]

Unnamed: 0,c1,c2
India,26,34
Australia,69,77
Japan,75,83
US,39,29
Russia,75,71
England,27,0
France,60,38
Germany,61,12
Bhutan,72,25
Fiji,12,90


### Selecting rows

In [47]:
df.loc['India']

c1     26
c2     34
c3     72
c4      3
c5     59
c6     52
c7     31
c8     20
c9     80
c10    50
Name: India, dtype: int32

In [48]:
df.iloc[3]

c1     39
c2     29
c3     81
c4     82
c5     99
c6      3
c7     68
c8     42
c9     18
c10    72
Name: US, dtype: int32

In [49]:
df.loc[['India','England','Russia']]

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10
India,26,34,72,3,59,52,31,20,80,50
England,27,0,33,60,2,8,55,33,39,42
Russia,75,71,16,78,19,70,83,51,84,94


### Selecting subset of a dataframe

In [50]:
df

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10
India,26,34,72,3,59,52,31,20,80,50
Australia,69,77,14,8,92,22,41,9,9,35
Japan,75,83,10,81,20,12,51,69,15,19
US,39,29,81,82,99,3,68,42,18,72
Russia,75,71,16,78,19,70,83,51,84,94
England,27,0,33,60,2,8,55,33,39,42
France,60,38,60,85,88,18,76,55,6,36
Germany,61,12,28,45,19,17,52,8,29,64
Bhutan,72,25,78,96,8,16,86,35,89,22
Fiji,12,90,44,50,10,82,53,55,36,87


In [51]:
df.loc[['Japan','US','Russia'],['c5','c6','c7']]

Unnamed: 0,c5,c6,c7
Japan,20,12,51
US,99,3,68
Russia,19,70,83


### Adding a new column

In [52]:
df['c11'] = np.arange(0,10)

In [53]:
df

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11
India,26,34,72,3,59,52,31,20,80,50,0
Australia,69,77,14,8,92,22,41,9,9,35,1
Japan,75,83,10,81,20,12,51,69,15,19,2
US,39,29,81,82,99,3,68,42,18,72,3
Russia,75,71,16,78,19,70,83,51,84,94,4
England,27,0,33,60,2,8,55,33,39,42,5
France,60,38,60,85,88,18,76,55,6,36,6
Germany,61,12,28,45,19,17,52,8,29,64,7
Bhutan,72,25,78,96,8,16,86,35,89,22,8
Fiji,12,90,44,50,10,82,53,55,36,87,9


### Adding a new row

In [54]:
df.loc['Poland'] = np.random.randint(0,100,11)

In [55]:
df

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11
India,26,34,72,3,59,52,31,20,80,50,0
Australia,69,77,14,8,92,22,41,9,9,35,1
Japan,75,83,10,81,20,12,51,69,15,19,2
US,39,29,81,82,99,3,68,42,18,72,3
Russia,75,71,16,78,19,70,83,51,84,94,4
England,27,0,33,60,2,8,55,33,39,42,5
France,60,38,60,85,88,18,76,55,6,36,6
Germany,61,12,28,45,19,17,52,8,29,64,7
Bhutan,72,25,78,96,8,16,86,35,89,22,8
Fiji,12,90,44,50,10,82,53,55,36,87,9


### Dropping a column

In [56]:
df.drop('c11', axis=1)

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10
India,26,34,72,3,59,52,31,20,80,50
Australia,69,77,14,8,92,22,41,9,9,35
Japan,75,83,10,81,20,12,51,69,15,19
US,39,29,81,82,99,3,68,42,18,72
Russia,75,71,16,78,19,70,83,51,84,94
England,27,0,33,60,2,8,55,33,39,42
France,60,38,60,85,88,18,76,55,6,36
Germany,61,12,28,45,19,17,52,8,29,64
Bhutan,72,25,78,96,8,16,86,35,89,22
Fiji,12,90,44,50,10,82,53,55,36,87


### Dropping a row

In [57]:
df.drop('Poland',axis=0)

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11
India,26,34,72,3,59,52,31,20,80,50,0
Australia,69,77,14,8,92,22,41,9,9,35,1
Japan,75,83,10,81,20,12,51,69,15,19,2
US,39,29,81,82,99,3,68,42,18,72,3
Russia,75,71,16,78,19,70,83,51,84,94,4
England,27,0,33,60,2,8,55,33,39,42,5
France,60,38,60,85,88,18,76,55,6,36,6
Germany,61,12,28,45,19,17,52,8,29,64,7
Bhutan,72,25,78,96,8,16,86,35,89,22,8
Fiji,12,90,44,50,10,82,53,55,36,87,9


## Conditional Selection

In [58]:
df >=50

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11
India,False,False,True,False,True,True,False,False,True,True,False
Australia,True,True,False,False,True,False,False,False,False,False,False
Japan,True,True,False,True,False,False,True,True,False,False,False
US,False,False,True,True,True,False,True,False,False,True,False
Russia,True,True,False,True,False,True,True,True,True,True,False
England,False,False,False,True,False,False,True,False,False,False,False
France,True,False,True,True,True,False,True,True,False,False,False
Germany,True,False,False,False,False,False,True,False,False,True,False
Bhutan,True,False,True,True,False,False,True,False,True,False,False
Fiji,False,True,False,True,False,True,True,True,False,True,False


In [59]:
df[df>=50]

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11
India,,,72.0,,59.0,52.0,,,80.0,50.0,
Australia,69.0,77.0,,,92.0,,,,,,
Japan,75.0,83.0,,81.0,,,51.0,69.0,,,
US,,,81.0,82.0,99.0,,68.0,,,72.0,
Russia,75.0,71.0,,78.0,,70.0,83.0,51.0,84.0,94.0,
England,,,,60.0,,,55.0,,,,
France,60.0,,60.0,85.0,88.0,,76.0,55.0,,,
Germany,61.0,,,,,,52.0,,,64.0,
Bhutan,72.0,,78.0,96.0,,,86.0,,89.0,,
Fiji,,90.0,,50.0,,82.0,53.0,55.0,,87.0,


In [60]:
df[df['c1']>=50]

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11
Australia,69,77,14,8,92,22,41,9,9,35,1
Japan,75,83,10,81,20,12,51,69,15,19,2
Russia,75,71,16,78,19,70,83,51,84,94,4
France,60,38,60,85,88,18,76,55,6,36,6
Germany,61,12,28,45,19,17,52,8,29,64,7
Bhutan,72,25,78,96,8,16,86,35,89,22,8


In [61]:
df[df['c1']>=50].loc[['India','Bhutan']]

KeyError: 'Passing list-likes to .loc or [] with any missing labels is no longer supported, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike'

In [None]:
 df[(df>50) | (df%20==0)]

### Useful Methods

In [None]:
### RESET INDEX
df.reset_index()

In [None]:
df['new_ind'] = 'a b c d e f g h i j k'.split()

In [None]:
df

In [None]:
df.set_index('new_ind',inplace=True)

In [None]:
df

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.info()

In [None]:
df.describe()

## Hierarichal Indexing

In [None]:
index = [['Asia','Asia','Asia','Europe','Europe','Europe','Africa','Africa','Africa'],
         ['Population','GDP','Gini','Population','GDP','Gini','Population','GDP','Gini'],
         [2010,2011,2012,2010,2011,2012,2010,2011,2012]]
data = np.random.randint(0,10,27).reshape(9,3)
cols = ['UN','World Bank','IMF']

In [None]:
df = pd.DataFrame(data=data,index=index,columns=cols)

In [None]:
df

In [None]:
df.index.names

In [None]:
df.index.names = ['Region','Parameter','Year']

In [None]:
df

In [None]:
df.loc['Asia'].loc[['GDP','Gini'],['UN','World Bank']]

In [None]:
df['UN']

In [None]:
df.xs(('Asia','GDP'))

In [None]:
df.xs(key=('GDP',2011),level=['Parameter','Year'])

## Handling missing data

In [None]:
## Creating a dataframe with a dictionary
df = pd.DataFrame({'India':[0,np.nan,2,3,4], 'China':[10,np.nan,12,13,np.nan], 'US':[np.nan,np.nan,np.nan,np.nan,np.nan],'Canada':[20,21,22,23,24]})

In [None]:
df

In [None]:
df.isnull()

In [None]:
df.notnull()

In [None]:
df.dropna()

In [None]:
df.dropna(axis=1)

In [None]:
df.dropna(thresh=2)

In [None]:
df.fillna(20)

In [None]:
df.fillna(method='ffill')

In [None]:
df.fillna(method='bfill')

In [None]:
df.fillna(method='pad')

## Merging and Concatenating

### Merging

In [None]:
df1 = pd.DataFrame({'key':'a b c d e'.split(), 'X1':[1,2,3,4,5], 'X2':range(6,11)})
df2 = pd.DataFrame({'key':'a b c f g'.split(), 'X1':[1,2,3,11,12], 'X2':range(4,9)})

In [None]:
df1

In [None]:
df2

In [None]:
pd.merge(df1,df2,how='inner',on='key')

In [None]:
pd.merge(df1,df2,how='left',on='key')

In [None]:
pd.merge(df1,df2,how='right',on='key')

In [None]:
pd.merge(df1,df2,how='outer',on='key')

In [None]:
left = pd.DataFrame({'key1': ['a', 'a', 'b', 'c'],
                     'key2': ['a', 'b', 'a', 'b'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
    
right = pd.DataFrame({'key1': ['a', 'b', 'b', 'c'],
                      'key2': ['a', 'b', 'a', 'a'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})

In [None]:
pd.merge(left,right,how='inner',on=['key1','key2'])

In [None]:
pd.merge(left,right,how='right',on=['key1','key2'])

In [None]:
pd.merge(left,right,how='left',on=['key1','key2'])

In [None]:
pd.merge(left,right,how='outer',on=['key1','key2'])

### Concatenation

In [None]:
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3'],
                        'C': ['C0', 'C1', 'C2', 'C3'],
                        'D': ['D0', 'D1', 'D2', 'D3']},
                        index=[0, 1, 2, 3])

In [None]:
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
                        'B': ['B4', 'B5', 'B6', 'B7'],
                        'C': ['C4', 'C5', 'C6', 'C7'],
                        'D': ['D4', 'D5', 'D6', 'D7']},
                         index=[4,5,6,7]) 

In [None]:
df1

In [None]:
df2

In [None]:
pd.concat([df1,df2],axis=1)

In [None]:
pd.concat([df1,df2],axis=0)

## Group By

In [None]:
# Create a dataframe
data = {'Store':['Walmart','Walmart','Costco','Costco','Target','Target'],
       'Customer':['Tim','Jermy','Mark','Denice','Ray','Sam'],
       'Sales':[150,200,550,90,430,120]}
df = pd.DataFrame(data)
df

In [None]:
gb = df.groupby('Store')

In [None]:
gb

In [None]:
gb.sum()

In [None]:
gb.min()

In [None]:
gb.max()

In [None]:
gb.std()

In [None]:
gb.var()

In [None]:
gb.count()

In [None]:
gb.mean()

In [None]:
gb.describe()

In [None]:
gb.describe().transpose()

In [None]:
gb.describe().transpose()['Target']

## Useful methods and operations

In [62]:
data_dic = {'col_1':[1,2,3,4,5],
           'col_2':[111,222,333,111,555],
           'col_3':['alpha','bravo','charlie',np.nan,np.nan],
           }

In [63]:
df = pd.DataFrame(data_dic)

In [64]:
df

Unnamed: 0,col_1,col_2,col_3
0,1,111,alpha
1,2,222,bravo
2,3,333,charlie
3,4,111,
4,5,555,


In [66]:
## dropna
print(df.dropna(axis=0))
print(df.dropna(axis=1))

   col_1  col_2    col_3
0      1    111    alpha
1      2    222    bravo
2      3    333  charlie
   col_1  col_2
0      1    111
1      2    222
2      3    333
3      4    111
4      5    555


In [67]:
## fillna
print(df.fillna('Rishabh'))

   col_1  col_2    col_3
0      1    111    alpha
1      2    222    bravo
2      3    333  charlie
3      4    111  Rishabh
4      5    555  Rishabh


In [68]:
print(df.fillna(method='ffill'))

   col_1  col_2    col_3
0      1    111    alpha
1      2    222    bravo
2      3    333  charlie
3      4    111  charlie
4      5    555  charlie


In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   col_1   5 non-null      int64 
 1   col_2   5 non-null      int64 
 2   col_3   3 non-null      object
dtypes: int64(2), object(1)
memory usage: 248.0+ bytes


In [71]:
df.head(2)

Unnamed: 0,col_1,col_2,col_3
0,1,111,alpha
1,2,222,bravo


In [72]:
df.tail(2)

Unnamed: 0,col_1,col_2,col_3
3,4,111,
4,5,555,


In [73]:
df.isnull()

Unnamed: 0,col_1,col_2,col_3
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,True
4,False,False,True


In [74]:
df.notnull()

Unnamed: 0,col_1,col_2,col_3
0,True,True,True
1,True,True,True
2,True,True,True
3,True,True,False
4,True,True,False


In [79]:
##Unique
print(df['col_1'].unique())
print(df['col_2'].unique())
print(df['col_3'].unique())

[1 2 3 4 5]
[111 222 333 555]
['alpha' 'bravo' 'charlie' nan]


In [81]:
##nunique
print(df['col_1'].nunique())
print(df['col_2'].nunique())
print(df['col_3'].nunique())

5
4
3


In [82]:
##value_counts()
print(df['col_1'].value_counts())
print(df['col_2'].value_counts())
print(df['col_3'].value_counts())

5    1
4    1
3    1
2    1
1    1
Name: col_1, dtype: int64
111    2
222    1
333    1
555    1
Name: col_2, dtype: int64
bravo      1
charlie    1
alpha      1
Name: col_3, dtype: int64
