# Pandas 12 Utils

In [94]:
import pandas as pd
import numpy as np
import scipy.stats as stat

# Read the loan training set; each row is keyed by its Loan_ID.
data = pd.read_csv('./res/train.csv', index_col='Loan_ID')

# Preview the first ten rows.
data.head(10)

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


## 1. boolean index

In [95]:
# Row mask: female applicants who are not graduates and whose loan status is 'Y'.
conds = (
    data['Gender'].eq('Female')
    & data['Education'].eq('Not Graduate')
    & data['Loan_Status'].eq('Y')
)
print(conds.head(10))

# .loc takes the boolean mask for rows and a label list for columns.
data.loc[conds, ['Gender', 'Education', 'Loan_Status']]

Loan_ID
LP001002    False
LP001003    False
LP001005    False
LP001006    False
LP001008    False
LP001011    False
LP001013    False
LP001014    False
LP001018    False
LP001020    False
dtype: bool


Unnamed: 0_level_0,Gender,Education,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LP001155,Female,Not Graduate,Y
LP001669,Female,Not Graduate,Y
LP001692,Female,Not Graduate,Y
LP001908,Female,Not Graduate,Y
LP002300,Female,Not Graduate,Y
LP002314,Female,Not Graduate,Y
LP002407,Female,Not Graduate,Y
LP002489,Female,Not Graduate,Y
LP002502,Female,Not Graduate,Y
LP002534,Female,Not Graduate,Y


## 2. apply function

In [96]:
def num_null(x):
    """Return the number of missing (NaN/None) entries in ``x``.

    Intended for use with ``DataFrame.apply``, where ``x`` is one column
    (``axis=0``) or one row (``axis=1``) passed in as a Series.

    The boolean mask from ``pd.isnull`` is reduced with its own vectorized
    ``.sum()`` instead of the builtin ``sum``, which would iterate the mask
    element by element in Python.
    """
    return pd.isnull(x).sum()

# axis=0 hands each column to num_null; axis=1 hands it each row.
nulls_per_column = data.apply(num_null, axis=0)
nulls_per_row = data.apply(num_null, axis=1)

print('column null count:')
print(nulls_per_column)
print('\nrow null count:')
print(nulls_per_row.head())

column null count:
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

row null count:
Loan_ID
LP001002    1
LP001003    0
LP001005    0
LP001006    0
LP001008    0
dtype: int64


## 3. fill null value

In [99]:
# Impute missing categorical values with each column's most frequent value.
s1 = data['Gender']
g = s1.mode()  # mode() returns a Series; its first entry is used as the fill value
print("type:", type(g), " col0:", g[0])
# Alternative way to obtain the mode: stat.mode(s1.dropna())

# Assign the filled column back rather than calling fillna(..., inplace=True)
# on data[col]. The in-place form mutates an intermediate object: it raises a
# chained-assignment warning in pandas 2.x and leaves `data` untouched when
# Copy-on-Write is enabled.
data['Gender'] = s1.fillna(g[0])
data['Married'] = data['Married'].fillna(data['Married'].mode()[0])
data['Self_Employed'] = data['Self_Employed'].fillna(data['Self_Employed'].mode()[0])

# Sanity check: all three columns should now report zero missing values.
print(data['Gender'].isnull().sum(), data['Married'].isnull().sum(), data['Self_Employed'].isnull().sum())

type: <class 'pandas.core.series.Series'>  col0: Male
0 0 0


## 4. pivot table

In [98]:
# Mean LoanAmount for every (Gender, Married, Self_Employed) combination,
# i.e. 2 x 2 x 2 groups; the result is indexed by a three-level MultiIndex.
# The aggregation is named by the string 'mean' instead of passing np.mean:
# recent pandas versions emit a FutureWarning for the numpy callable, while
# the string form yields the same result without the warning.
v1 = data.pivot_table(values=['LoanAmount'], index=['Gender', 'Married', 'Self_Employed'], aggfunc='mean')
print(v1)

                              LoanAmount
Gender Married Self_Employed            
Female No      No             110.596774
               Yes            125.800000
       Yes     No             135.480000
               Yes            282.250000
Male   No      No             127.500000
               Yes            180.588235
       Yes     No             154.017182
               Yes            169.395833


## 5. composite index

In [112]:
# Rows whose LoanAmount is still missing.
loan_missing = data['LoanAmount'].isnull()
print(loan_missing.sum())

# Fill each missing LoanAmount with the mean stored in the pivot table v1,
# looked up by the row's (Gender, Married, Self_Employed) tuple.
group_keys = ['Gender', 'Married', 'Self_Employed']
for i, row in data.loc[loan_missing, :].iterrows():
    t = tuple(row[group_keys])
    # v1.loc[t] is the single-column row for that group; take its one value.
    data.loc[i, 'LoanAmount'] = v1.loc[t].values[0]

print("Again check loanAmount na count:")
data.apply(num_null, axis=0)

0
Again check loanAmount na count:


Gender                0
Married               0
Dependents           15
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

## 6. crosstab

In [117]:
# Contingency table of Credit_History (rows) against Loan_Status (columns);
# margins=True appends the 'All' row and column totals.
credit_history = data['Credit_History']
loan_status = data['Loan_Status']
ct = pd.crosstab(credit_history, loan_status, margins=True)
ct

Loan_Status,N,Y,All
Credit_History,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,82,7,89
1.0,97,378,475
All,179,385,564


In [105]:
# Scratch cell: write the integers 0 through 29 to the output, one per line.
# The loop variable stays `i` so the notebook namespace ends up the same.
for i in range(0, 30, 1):
    print(f"{i}")

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
