## Pandas

In [1]:
import numpy as np
import pandas as pd

### You can create a Series object from a list, a NumPy array, or a dictionary

### Series with List

In [2]:
# Sample data (0..4) and one single-character label per value.
my_list = list(range(5))
my_index = list('risha')

In [3]:
my_list, my_index

([0, 1, 2, 3, 4], ['r', 'i', 's', 'h', 'a'])

In [4]:
pd.Series(data=my_list, index=my_index)

r    0
i    1
s    2
h    3
a    4
dtype: int64

In [5]:
pd.Series(my_list)

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [6]:
# Positional form: the first argument is the data, the second is the index,
# so this builds the same Series as pd.Series(data=my_list, index=my_index).
pd.Series(my_list,my_index)

r    0
i    1
s    2
h    3
a    4
dtype: int64

### Series with NumPy array

In [7]:
# NumPy array holding the integers 0 through 4.
array = np.arange(5)

In [8]:
array, my_index

(array([0, 1, 2, 3, 4]), ['r', 'i', 's', 'h', 'a'])

In [9]:
pd.Series(data = array)

0    0
1    1
2    2
3    3
4    4
dtype: int32

In [10]:
pd.Series(data = array, index = my_index)

r    0
i    1
s    2
h    3
a    4
dtype: int32

### Series with Dictionary

In [11]:
# Word -> number mapping; the keys become the Series index further down.
dict1 = dict(one=1, two=2, three=3, four=4)

In [12]:
dict1

{'one': 1, 'two': 2, 'three': 3, 'four': 4}

In [13]:
# Building a Series from a dict: the keys become the index labels and the
# values become the data, in the dict's insertion order.
ser1 = pd.Series(dict1)
ser1

one      1
two      2
three    3
four     4
dtype: int64

## A Series can hold a wide variety of data types

In [14]:
## strings
# String values labelled with the integers 1..4 (labels, not positions).
list1 = list('abcd')
list2 = list(range(1, 5))
ser2 = pd.Series(list1, index=list2)
ser2

1    a
2    b
3    c
4    d
dtype: object

In [15]:
## holding functions
# A Series can store arbitrary Python objects; here each builtin is labelled
# with its own name.
list1 = [sum, min, max, type, range]
list2 = [builtin.__name__ for builtin in list1]
ser3 = pd.Series(list1, index=list2)
ser3

sum      <built-in function sum>
min      <built-in function min>
max      <built-in function max>
type              <class 'type'>
range            <class 'range'>
dtype: object

## Grabbing data from series

In [16]:
# ser2 is indexed by the integer labels 1..4, so 1 is looked up as a label
# (the first element, 'a'), not as position 1.
ser2.loc[1]

'a'

In [17]:
# Three city Series with partially overlapping labels:
#   ser1 has all five cities, ser2 lacks Delhi, ser3 lacks Chennai.
dict1 = {
    'Mumbai': 100,
    'Delhi': 200,
    'Kolkata': 300,
    'Chennai': 400,
    'Bengaluru': 500,
}
ser1 = pd.Series(dict1)
ser2 = ser1.drop('Delhi')
ser3 = ser1.drop('Chennai')

In [18]:
ser1['Mumbai']

100

In [19]:
ser1

Mumbai       100
Delhi        200
Kolkata      300
Chennai      400
Bengaluru    500
dtype: int64

In [20]:
ser2

Mumbai       100
Kolkata      300
Chennai      400
Bengaluru    500
dtype: int64

In [21]:
ser3

Mumbai       100
Delhi        200
Kolkata      300
Bengaluru    500
dtype: int64

### Basic operations on Series

In [22]:
# Addition aligns on index labels; a label present in only one operand
# (Delhi) yields NaN, which is why the result is float64.
ser1.add(ser2)

Bengaluru    1000.0
Chennai       800.0
Delhi           NaN
Kolkata       600.0
Mumbai        200.0
dtype: float64

In [23]:
# Each operand is missing a different city (ser2 lacks Delhi, ser3 lacks
# Chennai), so both labels appear in the result with NaN.
ser2 + ser3

Bengaluru    1000.0
Chennai         NaN
Delhi           NaN
Kolkata       600.0
Mumbai        200.0
dtype: float64

## Basic functions and attributes

In [24]:
# Keep the label-aligned sum (it contains NaN for Chennai and Delhi) so the
# cells below have missing values to inspect.
ser = ser2.add(ser3)
ser

Bengaluru    1000.0
Chennai         NaN
Delhi           NaN
Kolkata       600.0
Mumbai        200.0
dtype: float64

In [25]:
ser.isnull()

Bengaluru    False
Chennai       True
Delhi         True
Kolkata      False
Mumbai       False
dtype: bool

In [26]:
ser.notnull()

Bengaluru     True
Chennai      False
Delhi        False
Kolkata       True
Mumbai        True
dtype: bool

In [27]:
ser.size

5

In [28]:
ser.axes

[Index(['Bengaluru', 'Chennai', 'Delhi', 'Kolkata', 'Mumbai'], dtype='object')]

In [29]:
ser.values

array([1000.,   nan,   nan,  600.,  200.])

In [30]:
ser.empty

False

In [31]:
ser.max()

1000.0

In [32]:
ser.min()

200.0

In [33]:
# Sample standard deviation (ddof=1 by default). NaN entries are skipped,
# so only 1000, 600 and 200 contribute.
ser.std()

400.0

In [34]:
ser.var()

160000.0

In [35]:
ser.sum()

1800.0

In [36]:
ser.mean()

600.0

In [37]:
ser.head(2)

Bengaluru    1000.0
Chennai         NaN
dtype: float64

In [38]:
ser.tail(3)

Delhi        NaN
Kolkata    600.0
Mumbai     200.0
dtype: float64

## Dataframes

### Creating a dataframe

In [39]:
# Row labels, column labels and a 10x10 grid of random integers in [0, 100).
# NOTE(review): the generator is not seeded, so the values (and every output
# derived from them) change on each run.
rows = ['India', 'Australia', 'Japan', 'US', 'Russia',
        'England', 'France', 'Germany', 'Bhutan', 'Fiji']
cols = [f'c{k}' for k in range(1, 11)]
data = np.random.randint(0, 100, size=(10, 10))

In [40]:
rows

['India',
 'Australia',
 'Japan',
 'US',
 'Russia',
 'England',
 'France',
 'Germany',
 'Bhutan',
 'Fiji']

In [41]:
cols

['c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9', 'c10']

In [42]:
data

array([[95, 13, 48, 58, 82, 87, 24, 87, 62, 85],
       [87, 86, 37, 36, 26, 23, 26, 64, 69,  6],
       [45, 42, 83, 72,  8, 47, 44, 65,  3, 60],
       [93, 18, 78, 83, 75, 15, 38, 84, 51, 58],
       [37,  6, 48, 83, 38, 93, 96, 91,  6, 26],
       [23, 85, 33, 69, 52,  6, 40, 62, 81,  8],
       [12, 23, 35, 68, 53, 41, 11, 40, 28, 76],
       [34, 71, 68, 98, 67, 53, 52,  9, 59, 92],
       [38, 96, 77, 37, 80,  7, 69, 34, 71, 23],
       [16, 42, 96, 26, 29, 59,  4, 67, 13, 65]])

In [43]:
# Wrap the random grid in a DataFrame: countries label the rows, c1..c10 the columns.
df = pd.DataFrame(data, index=rows, columns=cols)

In [44]:
df

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10
India,95,13,48,58,82,87,24,87,62,85
Australia,87,86,37,36,26,23,26,64,69,6
Japan,45,42,83,72,8,47,44,65,3,60
US,93,18,78,83,75,15,38,84,51,58
Russia,37,6,48,83,38,93,96,91,6,26
England,23,85,33,69,52,6,40,62,81,8
France,12,23,35,68,53,41,11,40,28,76
Germany,34,71,68,98,67,53,52,9,59,92
Bhutan,38,96,77,37,80,7,69,34,71,23
Fiji,16,42,96,26,29,59,4,67,13,65


## Getting data out of DataFrames

### Selecting columns

In [45]:
df['c1']

India        95
Australia    87
Japan        45
US           93
Russia       37
England      23
France       12
Germany      34
Bhutan       38
Fiji         16
Name: c1, dtype: int32

In [46]:
# Indexing with a list of column names returns a DataFrame
# (a single name, as in df['c1'], returns a Series).
df[['c1','c2']]

Unnamed: 0,c1,c2
India,95,13
Australia,87,86
Japan,45,42
US,93,18
Russia,37,6
England,23,85
France,12,23
Germany,34,71
Bhutan,38,96
Fiji,16,42


### Selecting rows

In [47]:
# .loc selects by index label; a single label returns that row as a Series
# whose index is the column names.
df.loc['India']

c1     95
c2     13
c3     48
c4     58
c5     82
c6     87
c7     24
c8     87
c9     62
c10    85
Name: India, dtype: int32

In [48]:
# .iloc selects by zero-based position: 3 is the fourth row ('US').
df.iloc[3]

c1     93
c2     18
c3     78
c4     83
c5     75
c6     15
c7     38
c8     84
c9     51
c10    58
Name: US, dtype: int32

In [49]:
df.loc[['India','England','Russia']]

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10
India,95,13,48,58,82,87,24,87,62,85
England,23,85,33,69,52,6,40,62,81,8
Russia,37,6,48,83,38,93,96,91,6,26


### Selecting subset of a dataframe

In [50]:
df

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10
India,95,13,48,58,82,87,24,87,62,85
Australia,87,86,37,36,26,23,26,64,69,6
Japan,45,42,83,72,8,47,44,65,3,60
US,93,18,78,83,75,15,38,84,51,58
Russia,37,6,48,83,38,93,96,91,6,26
England,23,85,33,69,52,6,40,62,81,8
France,12,23,35,68,53,41,11,40,28,76
Germany,34,71,68,98,67,53,52,9,59,92
Bhutan,38,96,77,37,80,7,69,34,71,23
Fiji,16,42,96,26,29,59,4,67,13,65


In [51]:
# .loc[row_labels, column_labels] selects the intersection of the listed
# rows and columns as a smaller DataFrame.
df.loc[['Japan','US','Russia'],['c5','c6','c7']]

Unnamed: 0,c5,c6,c7
Japan,8,47,44
US,75,15,38
Russia,38,93,96


### Adding a new column

In [52]:
# Add a running counter as column c11. The length is taken from the frame
# itself instead of a hard-coded 10, so the cell still works if it is re-run
# after rows have been added (a fixed-length array would raise a length
# mismatch error).
df['c11'] = np.arange(len(df))

In [53]:
df

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11
India,95,13,48,58,82,87,24,87,62,85,0
Australia,87,86,37,36,26,23,26,64,69,6,1
Japan,45,42,83,72,8,47,44,65,3,60,2
US,93,18,78,83,75,15,38,84,51,58,3
Russia,37,6,48,83,38,93,96,91,6,26,4
England,23,85,33,69,52,6,40,62,81,8,5
France,12,23,35,68,53,41,11,40,28,76,6
Germany,34,71,68,98,67,53,52,9,59,92,7
Bhutan,38,96,77,37,80,7,69,34,71,23,8
Fiji,16,42,96,26,29,59,4,67,13,65,9


### Adding a new row

In [54]:
# Assigning to a new label with .loc appends a row (or overwrites it if the
# label already exists). One random value is drawn per existing column, so
# the cell does not depend on a hard-coded column count.
df.loc['Poland'] = np.random.randint(0, 100, size=df.shape[1])

In [55]:
df

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11
India,95,13,48,58,82,87,24,87,62,85,0
Australia,87,86,37,36,26,23,26,64,69,6,1
Japan,45,42,83,72,8,47,44,65,3,60,2
US,93,18,78,83,75,15,38,84,51,58,3
Russia,37,6,48,83,38,93,96,91,6,26,4
England,23,85,33,69,52,6,40,62,81,8,5
France,12,23,35,68,53,41,11,40,28,76,6
Germany,34,71,68,98,67,53,52,9,59,92,7
Bhutan,38,96,77,37,80,7,69,34,71,23,8
Fiji,16,42,96,26,29,59,4,67,13,65,9


### Dropping a column

In [56]:
# axis=1 drops a column. drop() returns a new DataFrame; df itself is left
# unchanged because the result is not assigned back.
df.drop('c11', axis=1)

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10
India,95,13,48,58,82,87,24,87,62,85
Australia,87,86,37,36,26,23,26,64,69,6
Japan,45,42,83,72,8,47,44,65,3,60
US,93,18,78,83,75,15,38,84,51,58
Russia,37,6,48,83,38,93,96,91,6,26
England,23,85,33,69,52,6,40,62,81,8
France,12,23,35,68,53,41,11,40,28,76
Germany,34,71,68,98,67,53,52,9,59,92
Bhutan,38,96,77,37,80,7,69,34,71,23
Fiji,16,42,96,26,29,59,4,67,13,65


### Dropping a row

In [57]:
# axis=0 drops a row by label. As with columns, this returns a copy and
# leaves df untouched (Poland is still present in later cells).
df.drop('Poland',axis=0)

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11
India,95,13,48,58,82,87,24,87,62,85,0
Australia,87,86,37,36,26,23,26,64,69,6,1
Japan,45,42,83,72,8,47,44,65,3,60,2
US,93,18,78,83,75,15,38,84,51,58,3
Russia,37,6,48,83,38,93,96,91,6,26,4
England,23,85,33,69,52,6,40,62,81,8,5
France,12,23,35,68,53,41,11,40,28,76,6
Germany,34,71,68,98,67,53,52,9,59,92,7
Bhutan,38,96,77,37,80,7,69,34,71,23,8
Fiji,16,42,96,26,29,59,4,67,13,65,9


## Conditional Selection

In [58]:
df >=50

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11
India,True,False,False,True,True,True,False,True,True,True,False
Australia,True,True,False,False,False,False,False,True,True,False,False
Japan,False,False,True,True,False,False,False,True,False,True,False
US,True,False,True,True,True,False,False,True,True,True,False
Russia,False,False,False,True,False,True,True,True,False,False,False
England,False,True,False,True,True,False,False,True,True,False,False
France,False,False,False,True,True,False,False,False,False,True,False
Germany,False,True,True,True,True,True,True,False,True,True,False
Bhutan,False,True,True,False,True,False,True,False,True,False,False
Fiji,False,False,True,False,False,True,False,True,False,True,False


In [59]:
# Indexing with a boolean DataFrame keeps the shape: cells where the
# condition is False become NaN, so the integer columns are shown as floats.
df[df>=50]

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11
India,95.0,,,58.0,82.0,87.0,,87.0,62.0,85.0,
Australia,87.0,86.0,,,,,,64.0,69.0,,
Japan,,,83.0,72.0,,,,65.0,,60.0,
US,93.0,,78.0,83.0,75.0,,,84.0,51.0,58.0,
Russia,,,,83.0,,93.0,96.0,91.0,,,
England,,85.0,,69.0,52.0,,,62.0,81.0,,
France,,,,68.0,53.0,,,,,76.0,
Germany,,71.0,68.0,98.0,67.0,53.0,52.0,,59.0,92.0,
Bhutan,,96.0,77.0,,80.0,,69.0,,71.0,,
Fiji,,,96.0,,,59.0,,67.0,,65.0,


In [60]:
# Indexing with a boolean Series filters rows: only rows whose c1 value is
# at least 50 are kept, with all of their columns.
df[df['c1']>=50]

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11
India,95,13,48,58,82,87,24,87,62,85,0
Australia,87,86,37,36,26,23,26,64,69,6,1
US,93,18,78,83,75,15,38,84,51,58,3
Poland,68,3,29,33,6,78,69,73,48,51,32


In [61]:
# Rows that satisfy c1 >= 50 AND are one of the listed countries.
# Chaining .loc[['India','Bhutan']] after the c1 filter raises KeyError as
# soon as one of the labels has been filtered out, because .loc rejects
# missing labels. Combining both conditions in a single boolean mask works
# regardless of which rows pass the filter.
df[(df['c1'] >= 50) & df.index.isin(['India', 'Bhutan'])]

In [None]:
 df[(df>50) | (df%20==0)]

### Useful Methods

In [None]:
### RESET INDEX
# Returns a new frame with a default 0..n-1 integer index; the old index
# (the country names) is moved into a regular column. df is not modified
# because the result is not assigned.
df.reset_index()

In [None]:
# One single-letter label per row: df has 11 rows at this point
# (the 10 original countries plus Poland), hence the letters a..k.
df['new_ind'] = list('abcdefghijk')

In [None]:
df

In [None]:
# Promote the new_ind column to the index. inplace=True modifies df directly,
# and the previous index (the country names) is discarded. The cell cannot be
# run twice, because new_ind is no longer a column after the first run.
df.set_index('new_ind',inplace=True)

In [None]:
df

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.info()

In [None]:
df.describe()

## Hierarchical Indexing

In [None]:
# Three parallel label lists (region, parameter, year), one entry per row,
# plus a 9x3 grid of random integers in [0, 10) and the column names.
index = [
    np.repeat(['Asia', 'Europe', 'Africa'], 3).tolist(),
    ['Population', 'GDP', 'Gini'] * 3,
    [2010, 2011, 2012] * 3,
]
data = np.random.randint(0, 10, size=(9, 3))
cols = ['UN', 'World Bank', 'IMF']

In [None]:
# Passing a list of equal-length label lists as the index produces a
# three-level MultiIndex (one level per inner list). This rebinds df,
# replacing the country frame used in the earlier sections.
df = pd.DataFrame(data=data,index=index,columns=cols)

In [None]:
df

In [None]:
df.index.names

In [None]:
# Name the three index levels so they can be referred to by name
# (e.g. in the level= argument of xs) instead of by position.
df.index.names = ['Region','Parameter','Year']

In [None]:
df

In [None]:
# Step 1: df.loc['Asia'] keeps the Asia rows and drops the Region level.
# Step 2: the second .loc picks the GDP and Gini rows (Parameter level)
#         and the UN and World Bank columns.
df.loc['Asia'].loc[['GDP','Gini'],['UN','World Bank']]

In [None]:
df['UN']

In [None]:
# Cross-section on the two outermost levels: Region == 'Asia' and
# Parameter == 'GDP'. The matched levels are dropped from the result.
df.xs(('Asia','GDP'))

In [None]:
# Cross-section on inner levels, addressed by name: rows with
# Parameter == 'GDP' and Year == 2011 from every region.
df.xs(key=('GDP',2011),level=['Parameter','Year'])

## Handling missing data

In [62]:
## Creating a dataframe with a dictionary
# Each key is a column. India and China have some gaps, US is entirely
# missing, and Canada is complete.
df = pd.DataFrame({
    'India': [0, np.nan, 2, 3, 4],
    'China': [10, np.nan, 12, 13, np.nan],
    'US': [np.nan] * 5,
    'Canada': list(range(20, 25)),
})

In [63]:
df

Unnamed: 0,India,China,US,Canada
0,0.0,10.0,,20
1,,,,21
2,2.0,12.0,,22
3,3.0,13.0,,23
4,4.0,,,24


In [64]:
df.isnull()

Unnamed: 0,India,China,US,Canada
0,False,False,True,False
1,True,True,True,False
2,False,False,True,False
3,False,False,True,False
4,False,True,True,False


In [65]:
df.notnull()

Unnamed: 0,India,China,US,Canada
0,True,True,False,True
1,False,False,False,True
2,True,True,False,True
3,True,True,False,True
4,True,False,False,True


In [66]:
# Default dropna removes every row that contains at least one NaN. The US
# column is NaN in every row, so the result is empty.
df.dropna()

Unnamed: 0,India,China,US,Canada


In [67]:
# axis=1 drops columns instead of rows: every column containing a NaN is
# removed, which leaves only Canada.
df.dropna(axis=1)

Unnamed: 0,Canada
0,20
1,21
2,22
3,23
4,24


In [68]:
# thresh=2 keeps only rows with at least two non-NaN values. Row 1 has a
# single non-NaN value (Canada), so it is the only row dropped.
df.dropna(thresh=2)

Unnamed: 0,India,China,US,Canada
0,0.0,10.0,,20
2,2.0,12.0,,22
3,3.0,13.0,,23
4,4.0,,,24


In [69]:
df.fillna(20)

Unnamed: 0,India,China,US,Canada
0,0.0,10.0,20.0,20
1,20.0,20.0,20.0,21
2,2.0,12.0,20.0,22
3,3.0,13.0,20.0,23
4,4.0,20.0,20.0,24


In [70]:
# Forward fill: each NaN takes the last valid value above it in the same
# column. A column with no valid value at all (US) stays NaN.
# ffill() replaces the deprecated fillna(method='ffill') spelling.
df.ffill()

Unnamed: 0,India,China,US,Canada
0,0.0,10.0,,20
1,0.0,10.0,,21
2,2.0,12.0,,22
3,3.0,13.0,,23
4,4.0,13.0,,24


In [71]:
# Backward fill: each NaN takes the next valid value below it in the same
# column. A trailing NaN (China, row 4) has nothing below it and stays NaN.
# bfill() replaces the deprecated fillna(method='bfill') spelling.
df.bfill()

Unnamed: 0,India,China,US,Canada
0,0.0,10.0,,20
1,2.0,12.0,,21
2,2.0,12.0,,22
3,3.0,13.0,,23
4,4.0,,,24


In [72]:
# 'pad' was an alias for forward fill in the deprecated
# fillna(method=...) API, so ffill() gives the same result.
df.ffill()

Unnamed: 0,India,China,US,Canada
0,0.0,10.0,,20
1,0.0,10.0,,21
2,2.0,12.0,,22
3,3.0,13.0,,23
4,4.0,13.0,,24
