In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the loan dataset from the relative input directory into the working frame.
df = pd.read_csv('../input/loan-data/loan_data.csv')

In [None]:
# Preview the first rows as a quick sanity check of the load.
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Print the total number of missing cells, then show the per-column breakdown.
print(f'Total no of empty values: {df.isna().sum().sum()}')
df.isna().sum()

In [None]:
# Summary statistics of the frame's columns.
df.describe()

In [None]:
def get_unq(df):
    """Print ``<column> - <number of distinct values>`` for every column of ``df``.

    The count comes from ``Series.unique()``, so a missing value counts as one
    distinct value of its own.
    """
    for column_name in df.columns:
        distinct_values = df[column_name].unique()
        distinct_count = len(distinct_values)
        print(f'{column_name} - {distinct_count}')
# Print the distinct-value count of each column in the loaded frame.
get_unq(df)

In [None]:
def min_max(df):
    """Print ``<column> -> <min> to <max>`` for every numeric column of ``df``.

    Uses ``Series.min()`` / ``Series.max()`` rather than sorting the column
    twice: it is linear in the column length, ignores missing values (sorting
    a list containing NaN gives an arbitrary order) and does not raise on an
    empty column.

    Columns are selected with ``is_numeric_dtype`` so that string columns are
    skipped whatever dtype pandas uses to store them, not only ``object``.
    """
    for column_name in df.columns:
        values = df[column_name]
        if pd.api.types.is_numeric_dtype(values):
            print(f'{column_name} -> {values.min()} to {values.max()}')
# Print the value range of each non-object column in the loaded frame.
min_max(df)

In [None]:
# Label-encode the categorical 'purpose' column as integer codes.
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
# Assign the encoded array directly. Wrapping it in a DataFrame makes pandas
# re-align on index labels, which silently yields NaN for any row whose index
# label is not in 0..n-1 (e.g. after filtering or a custom index).
df['purpose'] = encoder.fit_transform(df['purpose'])

In [None]:
# Data visualisation
# 1 boxplot
# 2 violinplot
# 3 barplot
# 4 histplot

In [None]:
def boxp(df):
    """Draw one box plot per feature, grouped by the ``not.fully.paid`` label.

    Each feature gets its own 5x7 inch figure with the label on the x axis and
    the feature on the y axis. The label column itself is skipped *before* a
    figure is created, so no blank figure is left behind for it.
    """
    target = 'not.fully.paid'
    for column_name in df.columns:
        if column_name == target:
            continue
        plt.figure(figsize=(5, 7))
        sns.boxplot(data=df, x=target, y=column_name)
# Render the per-feature box plots for the loaded frame.
boxp(df)

In [None]:
def voilinp(df):
    """Draw one violin plot per feature, grouped and coloured by ``not.fully.paid``.

    Each feature gets its own 5x7 inch figure. The label column itself is
    skipped *before* a figure is created, so no blank figure is left behind
    for it.
    """
    target = 'not.fully.paid'
    for column_name in df.columns:
        if column_name == target:
            continue
        plt.figure(figsize=(5, 7))
        sns.violinplot(x=target, y=column_name, data=df, hue=target)
# Render the per-feature violin plots for the loaded frame.
voilinp(df)

In [None]:
def catp(df):
    """Draw one bar chart per feature showing its mean for each ``not.fully.paid`` class.

    ``sns.catplot`` is a figure-level function: it always builds its own
    figure. Calling ``plt.figure`` beforehand therefore only leaves an empty
    figure per column and its ``figsize`` is ignored. The intended 5x7 inch
    size is passed through ``height`` / ``aspect`` instead
    (width = height * aspect).
    """
    target = 'not.fully.paid'
    for column_name in df.columns:
        if column_name == target:
            continue
        sns.catplot(
            x=target,
            y=column_name,
            data=df,
            hue=target,
            kind='bar',
            height=7,
            aspect=5 / 7,
        )
# Render the per-feature bar charts for the loaded frame.
catp(df)

In [None]:
def histplo(df):
    """Draw one 30-bin histogram (with KDE) per feature, coloured by ``not.fully.paid``.

    Each feature gets its own 5x7 inch figure. The label column itself is
    skipped *before* a figure is created, so no blank figure is left behind
    for it.
    """
    target = 'not.fully.paid'
    for column_name in df.columns:
        if column_name == target:
            continue
        plt.figure(figsize=(5, 7))
        sns.histplot(data=df, x=column_name, bins=30, kde=True, hue=target)
# Render the per-feature histograms for the loaded frame.
histplo(df)

In [None]:
# feature engineering
# 1 heat map
# 2 univariate selection
# 3 Extra Trees classifier method
# 4 handling of imbalanced data (oversampling)
# 5 handling outliers (capping them at the IQR bounds)
# 6 feature scaling

In [None]:
# Annotated heat map of the pairwise correlations between all columns.
correlations = df.corr()
plt.figure(figsize=(20, 13))
sns.heatmap(correlations, annot=True, linewidths=0.5)

In [None]:
# Separate the target column from the predictor columns.
y = df['not.fully.paid']
x = df.drop(columns=['not.fully.paid'])

In [None]:
# Univariate selection: score every feature against the target with the
# chi-squared statistic (k covers all columns, so nothing is dropped here).
from sklearn.feature_selection import SelectKBest, chi2
best_features = SelectKBest(score_func=chi2,k = len(x.columns))
fit = best_features.fit(x,y)
# Build the table with explicit column labels: concatenating two unnamed
# single-column frames produced two columns both labelled 0, which cannot be
# selected individually by label.
result = pd.DataFrame({'feature': x.columns, 'chi2_score': fit.scores_})
result

In [None]:
# Rank features by the importances of an Extra-Trees ensemble fitted on x, y.
from sklearn.ensemble import ExtraTreesClassifier
# Fixed seed: the trees are randomised, so without it the importances (and
# the plotted ranking) change on every run of the notebook.
model = ExtraTreesClassifier(random_state=42)
model.fit(x,y)
print(model.feature_importances_)
feature_importances = pd.Series(model.feature_importances_,index = x.columns)
# Horizontal bar chart of the ten most important features.
feature_importances.nlargest(10).plot(kind='barh')
plt.show()

In [None]:
def over_samp(x, y, random_state=42):
    """Balance the classes by randomly duplicating minority-class rows.

    Prints the shape of the feature matrix before and after resampling.

    Parameters
    ----------
    x : feature matrix passed to ``RandomOverSampler.fit_resample``.
    y : class labels passed alongside ``x``.
    random_state : seed forwarded to ``RandomOverSampler`` so the same rows
        are duplicated on every run. Pass ``None`` for an unseeded resample.

    Returns
    -------
    (X, Y) : the resampled features and labels.

    NOTE(review): no train/test split is visible before this step. If the
    split happens afterwards, duplicated rows can land on both sides of it;
    confirm against the modelling cells.
    """
    print(x.shape)
    # Kept as a function-local import, as in the original cell.
    from imblearn.over_sampling import RandomOverSampler
    oversam = RandomOverSampler(sampling_strategy='minority', random_state=random_state)
    X, Y = oversam.fit_resample(x, y)
    print(X.shape)
    return X, Y

# Oversample the minority class of the feature/target split defined above.
X,Y = over_samp(x,y)

In [None]:
# Put the resampled features and labels back into one frame and preview it.
df1 = pd.concat((X, Y), axis='columns')
df1.head()

In [None]:
def handle_outliners(df, columns=None, whis=1.5):
    """Cap outliers of each numeric column at its IQR bounds.

    For every selected column the bounds are ``Q1 - whis * IQR`` and
    ``Q3 + whis * IQR``; values outside them are replaced by the nearest
    bound, all other values are left untouched.

    The previous implementation passed the ``.values`` of whole outlier rows
    (every column) as the ``to_replace`` argument of a chained, in-place
    ``Series.replace``, so it neither targeted the right values nor reliably
    wrote back to the frame. ``Series.clip`` does the capping directly.

    Parameters
    ----------
    df : input frame; it is not modified.
    columns : optional iterable of column names to cap. Defaults to every
        numeric column, so non-numeric columns no longer raise.
    whis : multiplier of the IQR used for the bounds (default 1.5).

    Returns
    -------
    A new DataFrame with the selected columns capped.

    Caveat: a column whose Q1 equals Q3 (e.g. a heavily imbalanced binary
    flag) has IQR 0, so both bounds coincide and the column collapses to a
    single value. Use ``columns`` to leave such columns out.
    """
    capped = df.copy()
    if columns is None:
        columns = capped.select_dtypes(include='number').columns

    for column_name in columns:
        q1 = capped[column_name].quantile(0.25)
        q3 = capped[column_name].quantile(0.75)
        iqr = q3 - q1

        low_ext = q1 - whis * iqr
        upr_ext = q3 + whis * iqr

        # Whole-column assignment avoids chained in-place edits on a
        # temporary Series.
        capped[column_name] = capped[column_name].clip(lower=low_ext, upper=upr_ext)

    return capped

# Apply the IQR-based outlier handling to the oversampled frame.
df2 = handle_outliners(df1)

In [None]:
def scal_er(df, scaler=None):
    """Scale the feature columns of ``df`` and return the transformed data.

    By default each column is standardised (zero mean, unit variance) with
    scikit-learn's ``StandardScaler``. The previous ``Normalizer`` rescales
    every *row* to unit norm, which is not feature scaling: columns with
    large magnitudes dominate each row and the others shrink towards zero.

    Parameters
    ----------
    df : feature matrix to transform.
    scaler : optional object exposing ``fit_transform``; lets the caller
        supply a different scaler. Defaults to a fresh ``StandardScaler``.

    Returns
    -------
    Whatever ``scaler.fit_transform(df)`` returns.

    NOTE(review): the scaler is fitted on all rows passed in. If a train/test
    split follows, fit on the training part only; confirm against the
    modelling cells.
    """
    if scaler is None:
        # Function-local import, as in the original; only needed for the default.
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
    return scaler.fit_transform(df)

# Split the outlier-handled frame into target and features, then scale the features.
y = df2['not.fully.paid']
X = scal_er(df2.drop(columns=['not.fully.paid']))