In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [2]:
# Read the two raw CSV files; these frames are kept unmodified and copied below.
raw_train, raw_test = (pd.read_csv(path) for path in ("train.csv", "test.csv"))

In [3]:
# Work on copies so the raw frames stay available for comparison.
train_df, test_df = raw_train.copy(), raw_test.copy()

In [4]:
# Keep the target column as its own Series before it is removed from the features.
train_y = train_df.loc[:, 'Loan_Status'].copy()

In [5]:
# Remove the target from the feature frame (already saved separately as train_y).
train_df = train_df.drop(columns=['Loan_Status'])

In [6]:
# Drop the Loan_ID column, which is not used as a feature, from both frames.
train_df = train_df.drop(columns="Loan_ID")
test_df = test_df.drop(columns="Loan_ID")

In [7]:
# List the feature columns that remain in the training frame.
train_df.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')

In [8]:
# Show training rows whose values repeat an earlier row across every column.
train_df[train_df.duplicated()]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area


In [9]:
# Show test rows whose values repeat an earlier row across every column.
test_df[test_df.duplicated()]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
195,Male,No,0,Graduate,Yes,5833,0,116.0,360.0,1.0,Urban


In [10]:
# Keep only the first occurrence of each fully identical test row
# (the original index labels are preserved).
# NOTE(review): Loan_ID was dropped before this check, so a "duplicate" here may
# be a different applicant with identical feature values. Removing it leaves
# test_df with fewer rows than raw_test - confirm this is intended before
# aligning predictions with the raw test IDs.
test_df = test_df.drop_duplicates()

In [11]:
# Re-check after drop_duplicates: this should now display no rows.
test_df[test_df.duplicated()]

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area


In [12]:
# Columns handled as continuous numeric features.
num_cols = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
]

# Columns handled as categorical features. Credit_History is stored as a float
# column in the data but is grouped with the categoricals here.
cat_cols = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Credit_History',
    'Property_Area',
]

In [54]:
# Summarise dtypes and non-null counts to see which columns contain missing values.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
dtypes: float64(4), int64(1), object(6)
memory usage: 52.9+ KB


In [13]:
from sklearn.base import BaseEstimator, TransformerMixin

In [28]:
class MeanImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in selected columns with the column mean learned in ``fit``.

    Parameters
    ----------
    variables : list of str, str or None, default=None
        Columns to impute. A single column name may be given as a plain string.
        ``None`` selects every numeric column of the frame passed to ``fit``.

    Attributes
    ----------
    mean_dict : dict
        Set by ``fit``. Maps each imputed column name to the mean of that column
        in the frame given to ``fit``.
    """

    def __init__(self, variables=None):
        # Stored untouched; resolution of None / str happens in fit so the
        # constructor argument is kept exactly as the caller passed it.
        self.variables = variables

    def _resolve_variables(self, X):
        """Return the list of column names to impute for frame ``X``."""
        if self.variables is None:
            return list(X.select_dtypes(include="number").columns)
        if isinstance(self.variables, str):
            # Iterating a bare string would yield single characters.
            return [self.variables]
        return list(self.variables)

    def fit(self, X, y=None):
        """Compute and store the mean of every selected column of ``X``.

        ``y`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        self.mean_dict = {
            col: X[col].mean() for col in self._resolve_variables(X)
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with NaNs in the fitted columns replaced by their means.

        The input frame is not modified. Must be called after ``fit``.
        """
        X = X.copy()
        # Iterate the fitted mapping so transform always uses exactly the
        # columns whose means were learned.
        for col, mean_value in self.mean_dict.items():
            X[col] = X[col].fillna(mean_value)
        return X

In [29]:
# Mean imputer restricted to the columns listed in num_cols.
mean_imputer = MeanImputer(variables=num_cols)

In [22]:
# Display the estimator to check how it was configured.
mean_imputer

In [30]:
# Learn the per-column means from the numeric training columns.
mean_imputer.fit(train_df[num_cols])

In [31]:
# Inspect the means stored by fit (one entry per column in num_cols).
mean_imputer.mean_dict

{'ApplicantIncome': np.float64(5403.459283387622),
 'CoapplicantIncome': np.float64(1621.2457980271008),
 'LoanAmount': np.float64(146.41216216216216),
 'Loan_Amount_Term': np.float64(342.0)}

In [55]:
# Apply the fitted mean imputer to the numeric *training* columns.
# NOTE(review): despite its name, test_d holds imputed training data, not test data.
test_d= mean_imputer.transform(train_df[num_cols])

In [56]:
# Count remaining missing values per numeric column; every count should be zero.
test_d.isnull().sum()

ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
dtype: int64

In [141]:
class ModeImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in selected columns with the most frequent value learned in ``fit``.

    Parameters
    ----------
    variables : list of str, str or None, default=None
        Columns to impute. A single column name may be given as a plain string.
        ``None`` selects every column of the frame passed to ``fit``.

    Attributes
    ----------
    mode_dict : dict
        Set by ``fit``. Maps each imputed column name to the Series returned by
        ``Series.mode()`` for that column; its first element is the fill value.
    """

    def __init__(self, variables=None):
        # Stored untouched; resolution of None / str happens in fit so the
        # constructor argument is kept exactly as the caller passed it.
        self.variables = variables

    def _resolve_variables(self, X):
        """Return the list of column names to impute for frame ``X``."""
        if self.variables is None:
            return list(X.columns)
        if isinstance(self.variables, str):
            # Iterating a bare string would yield single characters.
            return [self.variables]
        return list(self.variables)

    def fit(self, X, y=None):
        """Compute and store the mode(s) of every selected column of ``X``.

        ``y`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        self.mode_dict = {
            col: X[col].mode() for col in self._resolve_variables(X)
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with NaNs in the fitted columns replaced by their mode.

        The input frame is not modified. Must be called after ``fit``.
        """
        X = X.copy()
        for col, modes in self.mode_dict.items():
            if modes.empty:
                # The column had no non-missing values at fit time, so there
                # is no mode to fill with; leave the column unchanged.
                continue
            # The fill value must be a scalar: handing the whole mode Series to
            # fillna would align on index labels and fill almost nothing.
            X[col] = X[col].fillna(modes.iloc[0])
        return X


In [128]:
# Scratch check of what Series.mode() returns: build the per-column modes by
# hand and read the first mode of 'Gender'.
model_dict = {name: train_df[name].mode() for name in train_df[cat_cols].columns}
model_dict['Gender'][0]

'Male'

In [136]:
# Mode imputer restricted to the columns listed in cat_cols.
mode_imputer = ModeImputer(variables=cat_cols)

In [137]:
# Learn the per-column modes from the categorical training columns.
mode_imputer.fit(train_df[cat_cols])

In [138]:
# Inspect the modes stored by fit (one Series per column in cat_cols).
mode_imputer.mode_dict

{'Gender': 0    Male
 Name: Gender, dtype: object,
 'Married': 0    Yes
 Name: Married, dtype: object,
 'Dependents': 0    0
 Name: Dependents, dtype: object,
 'Education': 0    Graduate
 Name: Education, dtype: object,
 'Self_Employed': 0    No
 Name: Self_Employed, dtype: object,
 'Credit_History': 0    1.0
 Name: Credit_History, dtype: float64,
 'Property_Area': 0    Semiurban
 Name: Property_Area, dtype: object}

In [139]:
# Apply the fitted mode imputer to the categorical *training* columns.
# NOTE(review): despite its name, test_c holds imputed training data, not test data.
test_c = mode_imputer.transform(train_df[cat_cols])

In [140]:
# Count remaining missing values per categorical column; every count should be zero.
# NOTE(review): the saved output of this cell still shows missing values. It was
# produced at execution 140, before the ModeImputer class cell was last run
# (execution 141) - re-run with Restart Kernel -> Run All to refresh it.
test_c.isnull().sum()

Gender            13
Married            3
Dependents        15
Education          0
Self_Employed     32
Credit_History    50
Property_Area      0
dtype: int64

In [50]:
from sklearn.pipeline import Pipeline

# Chain the two custom imputers: numeric columns are mean-filled first, then
# the categorical columns are mode-filled.
classification_pipeline = Pipeline([
    ('MeanImputation', MeanImputer(variables=num_cols)),
    ('ModeImputation', ModeImputer(variables=cat_cols)),
])

In [47]:
def check_missing(X):
    """Print the total number of missing entries in ``X``.

    ``X`` is wrapped in a DataFrame first, so any input the DataFrame
    constructor accepts can be passed. Nothing is returned.
    """
    n_missing = pd.DataFrame(X).isnull().sum().sum()
    print(f'Number of missing values after imputation: {n_missing}')

In [51]:
# Fit both imputers on the training features and apply them in sequence.
imputed_res = classification_pipeline.fit_transform(train_df)

In [53]:
# Verify the pipeline output; the reported count should be zero.
# NOTE(review): the saved output reports 113 missing values, which equals the
# total of missing categorical entries in train_df. This cell's execution count
# (53) predates the last run of the ModeImputer class cell (141) - re-run with
# Restart Kernel -> Run All to refresh it.
check_missing(imputed_res)

Number of missing values after imputation: 113
