In [2]:
# import libraries
import numpy as np
import pandas as pd

In [11]:

raw_train = pd.read_csv('prediction_model/datasets/train.csv')

In [12]:
raw_train.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [30]:
train_df = raw_train.copy()

In [31]:

train_y = train_df['Loan_Status'].copy()
train_df.drop(columns = ['Loan_Status'],inplace=True)


In [33]:
train_df.drop(columns='Loan_ID',inplace=True)


In [32]:
train_df[train_df.duplicated()]

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area


In [18]:
# Numeric --> mean
# Categorical --> mode

num_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term']

cat_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
           'Credit_History', 'Property_Area']


In [1]:

from sklearn.base import BaseEstimator,TransformerMixin
import numpy as np
import pandas as pd

class ModeImputer(BaseEstimator,TransformerMixin):
    def __init__(self,variables=None):
        self.variables = variables
    
    def fit(self,X,y=None):
        self.mode_dict = {}
        for col in self.variables:
            self.mode_dict[col] = X[col].mode()
        return self
    
    def transform(self,X):
        X = X.copy()
        for col in self.variables:
            X[col].fillna(self.mode_dict[col],inplace=True)
        return X
    
class MeanImputer(BaseEstimator,TransformerMixin):
    def __init__(self,variables=None):
        self.variables = variables
    
    def fit(self,X,y=None):
        self.mean_dict = {}
        for col in self.variables:
            self.mean_dict[col] = X[col].mean()
        return self
    
    def transform(self,X):
        X = X.copy()
        for col in self.variables:
            X[col].fillna(self.mean_dict[col],inplace=True)
        return X
    
class DropColumns(BaseEstimator,TransformerMixin):
    def __init__(self,variables_to_drop=None):
        self.variables_to_drop = variables_to_drop
    
    def fit(self,X,y=None):
        return self

    def transform(self,X):
        X = X.copy()
        X= X.drop(columns = self.variables_to_drop)
        return X
    
class DomainProcessing(BaseEstimator,TransformerMixin):
    def __init__(self,variable_to_modify=None,variable_to_add=None):
        self.variable_to_modify = variable_to_modify
        self.variable_to_add = variable_to_add
    
    def fit(self,X,y=None):
        return self

    def transform(self,X):
        X = X.copy()
        for feature in self.variable_to_modify:
            X[feature]= X[feature]+X[self.variable_to_add]
        return X
    

class CustomLabelEncoder(BaseEstimator,TransformerMixin):
    def __init__(self,variables=None):
        self.variables = variables
    
    def fit(self,X,y=None):
        self.label_dict={}
        for var in self.variables:
            t=X[var].value_counts().sort_values(ascending=True).index
            self.label_dict[var]= {k:i for i,k in enumerate(t,0)}
        return self

    def transform(self,X):
        X = X.copy()
        for feature in self.variables:
            X[feature]=X[feature].map(self.label_dict[feature])
        return X
    

# Log transformation

class LogTransforms(BaseEstimator,TransformerMixin):
    def __init__(self,variables=None):
        self.variables = variables
    
    def fit(self,X,y=None):
        return self

    def transform(self,X):
        X = X.copy()
        for col in self.variables:
            X[col]=np.log(X[col])
        return X

In [None]:
# classification_pipeline = Pipeline(
#     [
#         ('Mean Imputation :', pp.MeanImputer(variables=config.NUM_FEATURES)),
#         ('Mode Imputation :', pp.ModeImputer(variables=config.CAT_FEATURES)),
#         ('DomainProcessing :', pp.DomainProcessing(variable_to_modify=config.FEATURE_TO_MODIFY,variable_to_add=config.FEATURE_TO_ADD)),
#         ('Drop Columns :', pp.DropColumns(variables_to_drop=config.DROP_FEATURES)),
#         ('Label Encoder',pp.CustomLabelEncoder(variables=config.FEATURES_TO_ENCODE)),
#         ('log Transformation :', pp.LogTransforms(variables=config.LOG_FEATURES)),
#         ('Min Max Scaling :', MinMaxScaler()),
#         ('LogisticClassifier :', LogisticRegression(random_state=0))
#     ]

# )

In [38]:

train_df[num_cols].head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term
0,5849,0.0,,360.0
1,4583,1508.0,128.0,360.0
2,3000,0.0,66.0,360.0
3,2583,2358.0,120.0,360.0
4,6000,0.0,141.0,360.0


In [35]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
dtypes: float64(4), int64(1), object(6)
memory usage: 52.9+ KB


In [39]:
me=MeanImputer(variables=num_cols)
me.fit(train_df[num_cols])
train_df[num_cols]=me.transform(train_df[num_cols])


In [41]:
train_df[num_cols].isna().sum()

ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
dtype: int64

In [44]:
mo=ModeImputer(cat_cols)
mo.fit(train_df[cat_cols])
train_df[cat_cols]=mo.transform(train_df[cat_cols])

In [45]:
train_df[cat_cols].isna().sum()

Gender            13
Married            3
Dependents        15
Education          0
Self_Employed     32
Credit_History    50
Property_Area      0
dtype: int64

In [9]:
os.getcwd()

'/Users/chetu/Learning/mlops/mlops-bootcamp/packaging-ml-model'