In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Lift the pandas display limits so frames are rendered with every row and column.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
%%time
#checking the running time
train_data = pd.read_csv("/kaggle/input/home-credit-default-risk/application_train.csv")
test_data = pd.read_csv("/kaggle/input/home-credit-default-risk/application_test.csv")

In [None]:
#checking the memory usage and other information about data
print(train_data.info())
print(test_data.info())

In [None]:
#checking the unique data types
print(train_data.dtypes.unique())
print(test_data.dtypes.unique())

### Let's check whether downcasting the dtypes reduces the memory usage in our case

In [None]:
#Downcast types to reduce memory usage.
def downcast_dtypes(df):
    """Return a copy of ``df`` with its numeric columns stored in narrower dtypes.

    ``float64`` columns are cast to ``float32``; ``int16``/``int32``/``int64``
    columns are shrunk to the smallest signed integer dtype that still holds
    all of their values. Every other column is left as it is, and the frame
    passed in is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose memory footprint should be reduced.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.

    Notes
    -----
    Prints the percentage of memory saved, measured with
    ``memory_usage(deep=True)`` before and after the conversion.
    """
    _start = df.memory_usage(deep = True).sum()/1024**2
    float_cols = list(df.select_dtypes(include = ["float64"]).columns)
    int_cols = list(df.select_dtypes(include = ["int16", "int32", "int64"]).columns)
    # Work on a copy so the caller's frame keeps its original dtypes.
    df = df.copy()
    for c in float_cols:
        df[c] = df[c].astype("float32")
    for c in int_cols:
        # to_numeric only narrows when every value fits the smaller dtype.
        df[c] = pd.to_numeric(df[c], downcast = "integer")
    _end = df.memory_usage(deep = True).sum()/1024**2
    # Guard the ratio in case the frame reports zero bytes.
    saved = (_start - _end)/ _start*100 if _start else 0.0
    print(f"saved {saved:.2f}%")
    return df
        

In [None]:
train = downcast_dtypes(train_data)
test = downcast_dtypes(test_data)

In [None]:
print(train.info())
print(test.info())

In [None]:
print(train_data.shape)
print(test_data.shape)

In [None]:
print("checking no. of columns in train_data with any null values: ",train_data.isnull().any().sum())
print("checking no. of columns in train_data with all null values: ",  train_data.isnull().all().sum())
print("checking no. of rows in train_data with all null values: ", train_data.isnull().all(axis = 1).sum())
print("list of columns in train_data with null values: ", train_data.columns[train_data.isna().any()].tolist())

In [None]:
print("checking no. of columns in test_data with any null values: ",test_data.isnull().any().sum())
print("checking no. of columns in test_data with all null values: ",  test_data.isnull().all().sum())
print("checking no. of rows in test_data with all null values: ", test_data.isnull().all(axis = 1).sum())
print("list of columns in test_data with null values: ", test_data.columns[test_data.isna().any()].tolist())

In [None]:
train_data.head()

In [None]:
test_data.head()

### Treating negative values

In [None]:
# Replace each DAYS_* column of train_data with its absolute value.
for days_column in ('DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION',
                    'DAYS_ID_PUBLISH', 'DAYS_LAST_PHONE_CHANGE'):
    train_data[days_column] = train_data[days_column].abs()


In [None]:
# Replace each DAYS_* column of test_data with its absolute value.
for days_column in ('DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION',
                    'DAYS_ID_PUBLISH', 'DAYS_LAST_PHONE_CHANGE'):
    test_data[days_column] = test_data[days_column].abs()

In [None]:
#checking the percentage of null values in the train_data columns having null values.
x=train_data.isnull().sum()
y=(train_data.isnull().sum()/train_data.shape[0])*100
z={'Number of missing values':x,'Percentage of missing values':y}
df=pd.DataFrame(z,columns=['Number of missing values','Percentage of missing values'])
df.sort_values(by='Percentage of missing values',ascending=False).head()

In [None]:
#checking the percentage of null values in the test_data columns having null values.
x=test_data.isnull().sum()
y=(test_data.isnull().sum()/test_data.shape[0])*100
z={'Number of missing values':x,'Percentage of missing values':y}
df=pd.DataFrame(z,columns=['Number of missing values','Percentage of missing values'])
df.sort_values(by='Percentage of missing values',ascending=False).head()

In [None]:
#assigning the per-column null counts of train_data to a variable
val= train_data.isnull().sum()
#keeping the columns whose share of null values is at least 30%; the share is
#computed from the actual number of rows instead of a hard-coded row count
drop_column = val[val / len(train_data) >= 0.30]
#number of columns having at least 30% null values
print(len(drop_column))
#checking the list of columns to be dropped
print(drop_column.index)

In [None]:
#assigning the per-column null counts of test_data to a variable
val= test_data.isnull().sum()
#keeping the columns whose share of null values is at least 30%; the share is
#computed from the actual number of rows instead of a hard-coded row count
drop_column = val[val / len(test_data) >= 0.30]
#number of columns having at least 30% null values
print(len(drop_column))
#checking the list of columns to be dropped
print(drop_column.index)

### Before dropping the columns, check whether any column with fewer than 50% null values can still impact the target. We found that column to be "OCCUPATION_TYPE".
### We can see from the table above that "OCCUPATION_TYPE" has just 31% missing data in train_data and 32% in test_data. We know how important the job type is when applying for a loan, so we must keep this column.

In [None]:
#Dropping the null values columns having more then 30% of missing values except "OCCUPATION_TYPE" in train_data.
train_data = train_data.drop(['OWN_CAR_AGE', 'EXT_SOURCE_1', 'APARTMENTS_AVG',
       'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG',
       'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG',
       'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG',
       'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG',
       'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE',
       'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE',
       'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE',
       'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE',
       'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI',
       'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI',
       'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI',
       'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI',
       'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI', 'FONDKAPREMONT_MODE',
       'HOUSETYPE_MODE', 'TOTALAREA_MODE', 'WALLSMATERIAL_MODE',
       'EMERGENCYSTATE_MODE'] , axis = 1)

In [None]:
#Dropping the null values columns having more then 30% of missing values except "OCCUPATION_TYPE" in test_data.
test_data = test_data.drop(['OWN_CAR_AGE', 'EXT_SOURCE_1', 'APARTMENTS_AVG',
       'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG',
       'COMMONAREA_AVG', 'ELEVATORS_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG',
       'FLOORSMIN_AVG', 'LANDAREA_AVG', 'LIVINGAPARTMENTS_AVG',
       'LIVINGAREA_AVG', 'NONLIVINGAPARTMENTS_AVG', 'NONLIVINGAREA_AVG',
       'APARTMENTS_MODE', 'BASEMENTAREA_MODE', 'YEARS_BEGINEXPLUATATION_MODE',
       'YEARS_BUILD_MODE', 'COMMONAREA_MODE', 'ELEVATORS_MODE',
       'ENTRANCES_MODE', 'FLOORSMAX_MODE', 'FLOORSMIN_MODE', 'LANDAREA_MODE',
       'LIVINGAPARTMENTS_MODE', 'LIVINGAREA_MODE', 'NONLIVINGAPARTMENTS_MODE',
       'NONLIVINGAREA_MODE', 'APARTMENTS_MEDI', 'BASEMENTAREA_MEDI',
       'YEARS_BEGINEXPLUATATION_MEDI', 'YEARS_BUILD_MEDI', 'COMMONAREA_MEDI',
       'ELEVATORS_MEDI', 'ENTRANCES_MEDI', 'FLOORSMAX_MEDI', 'FLOORSMIN_MEDI',
       'LANDAREA_MEDI', 'LIVINGAPARTMENTS_MEDI', 'LIVINGAREA_MEDI',
       'NONLIVINGAPARTMENTS_MEDI', 'NONLIVINGAREA_MEDI', 'FONDKAPREMONT_MODE',
       'HOUSETYPE_MODE', 'TOTALAREA_MODE', 'WALLSMATERIAL_MODE',
       'EMERGENCYSTATE_MODE'] , axis = 1)

In [None]:
print(train_data.shape)
print(test_data.shape)

In [None]:
print("checking the datatypes in our train_data: ")
print(train_data.dtypes.value_counts())
print('\n')
print("checking the datatypes in our test_data: ")
print(test_data.dtypes.value_counts())

In [None]:
print("checking the maximum value of null values in a row of train_data")
print(train_data.isnull().sum(axis=1).sort_values(ascending  = False).head(1))
print('\n')
print("checking the maximum value of null values in a row of test_data")
print(test_data.isnull().sum(axis=1).sort_values(ascending  = False).head(1))

In [None]:
print("checking the unique dtypes of null values in train_data: ")
print(train_data.dtypes[train_data.isnull().any()].unique())
print("\n")
print("checking the unique dtypes of null values in test_data: ")
print(test_data.dtypes[test_data.isnull().any()].unique())

### In the above cell we can see that the null values in our dataset belong to two data types. The object columns can easily be classified as categorical data, but the numeric columns can still contain ordinal data, which is again categorical. So we must separate the ordinal data from the numeric data.

In [None]:
print(train_data.dtypes[train_data.isnull().any()])
print(len(train_data.dtypes[train_data.isnull().any()]))

In [None]:
print(test_data.dtypes[test_data.isnull().any()])
print(len(test_data.dtypes[test_data.isnull().any()]))

### Data Parsing

In [None]:
train_categorical = train_data.select_dtypes(include = ['object'])
test_categorical = test_data.select_dtypes(include = ['object'])

In [None]:
train_categorical.head()

In [None]:
test_categorical.head()

In [None]:
null_train = train_data[train_data.columns[train_data.isnull().any()]]

In [None]:
null_test = test_data[test_data.columns[test_data.isnull().any()]]

In [None]:
def value_count():
    """Print every null_train column that has at most 100 distinct non-null values."""
    for position in range(null_train.shape[1]):
        distinct_values = null_train.iloc[:, position].value_counts()
        if len(distinct_values) > 100:
            continue
        print(null_train.columns[[position]])
value_count()

In [None]:
def value_count():
    """Print every null_test column that has at most 100 distinct non-null values."""
    for position in range(null_test.shape[1]):
        distinct_values = null_test.iloc[:, position].value_counts()
        if len(distinct_values) > 100:
            continue
        print(null_test.columns[[position]])
value_count()

### Filling numeric null values

In [None]:
print(train_data['AMT_ANNUITY'].describe())
print('\n')
print(test_data['AMT_ANNUITY'].describe())

In [None]:
print(plt.hist(train_data['AMT_ANNUITY']))
print(plt.hist(test_data['AMT_ANNUITY']))

In [None]:
# Impute both frames with the median learned from the training data only, so
# no statistic of the test set is used during preprocessing and both frames
# receive the same fill value.
amt_annuity_median = train_data['AMT_ANNUITY'].median()
train_data['AMT_ANNUITY'] = train_data['AMT_ANNUITY'].fillna(amt_annuity_median)
test_data['AMT_ANNUITY'] = test_data['AMT_ANNUITY'].fillna(amt_annuity_median)

In [None]:
train_data['AMT_GOODS_PRICE'].describe()

In [None]:
plt.hist(train_data['AMT_GOODS_PRICE'])

In [None]:
train_data['AMT_GOODS_PRICE'].value_counts().head()

In [None]:
print("median: ",train_data['AMT_GOODS_PRICE'].median())
print("mode: ",train_data['AMT_GOODS_PRICE'].mode()[0])
print("mean: ",train_data['AMT_GOODS_PRICE'].mean())

In [None]:
train_data['AMT_GOODS_PRICE'] = train_data['AMT_GOODS_PRICE'].fillna(train_data['AMT_GOODS_PRICE'].median())

In [None]:
train_data['DAYS_LAST_PHONE_CHANGE'].describe()

In [None]:
train_data['DAYS_LAST_PHONE_CHANGE'].describe()

In [None]:
train_data['DAYS_LAST_PHONE_CHANGE'].value_counts().head()

In [None]:
plt.hist(train_data['DAYS_LAST_PHONE_CHANGE'])

In [None]:
print(train_data['DAYS_LAST_PHONE_CHANGE'].mode()[0])
print(train_data['DAYS_LAST_PHONE_CHANGE'].median())

In [None]:
train_data['DAYS_LAST_PHONE_CHANGE'] = train_data['DAYS_LAST_PHONE_CHANGE'].fillna(train_data['DAYS_LAST_PHONE_CHANGE'].median())

In [None]:
train_data['EXT_SOURCE_2'].describe()

In [None]:
train_data['EXT_SOURCE_2'].value_counts().head()

In [None]:
print(train_data['EXT_SOURCE_2'].mean())
print(train_data['EXT_SOURCE_2'].median())
print(train_data['EXT_SOURCE_2'].mode()[0])
print(test_data['EXT_SOURCE_2'].mean())
print(test_data['EXT_SOURCE_2'].median())
print(test_data['EXT_SOURCE_2'].mode()[0])

In [None]:
plt.hist(train_data['EXT_SOURCE_2'])
plt.hist(test_data['EXT_SOURCE_2'])

In [None]:
# Impute both frames with the median learned from the training data only, so
# no statistic of the test set is used during preprocessing and both frames
# receive the same fill value.
ext_source_2_median = train_data['EXT_SOURCE_2'].median()
train_data['EXT_SOURCE_2'] = train_data['EXT_SOURCE_2'].fillna(ext_source_2_median)
test_data['EXT_SOURCE_2'] = test_data['EXT_SOURCE_2'].fillna(ext_source_2_median)

In [None]:
print(train_data['EXT_SOURCE_3'].describe())
print('\n')
print(test_data['EXT_SOURCE_3'].describe())

In [None]:
print(train_data['EXT_SOURCE_3'].median())
print(train_data['EXT_SOURCE_3'].mean())
print(train_data['EXT_SOURCE_3'].mode())
print(test_data['EXT_SOURCE_3'].median())
print(test_data['EXT_SOURCE_3'].mean())
print(test_data['EXT_SOURCE_3'].mode())

In [None]:
plt.hist(train_data['EXT_SOURCE_3'])
plt.hist(test_data['EXT_SOURCE_3'])

In [None]:
# Impute both frames with the median learned from the training data only, so
# no statistic of the test set is used during preprocessing and both frames
# receive the same fill value.
ext_source_3_median = train_data['EXT_SOURCE_3'].median()
train_data['EXT_SOURCE_3'] = train_data['EXT_SOURCE_3'].fillna(ext_source_3_median)
test_data['EXT_SOURCE_3'] = test_data['EXT_SOURCE_3'].fillna(ext_source_3_median)

### Filling Categorical null values using function

In [None]:
def filling_TrainDataCategoricalNullvalues():
    """Fill nulls in train_data with the column mode for low-cardinality columns.

    A column qualifies when its null_train counterpart has at most 100
    distinct non-null values; train_data is updated column by column.
    """
    for position in range(null_train.shape[1]):
        if len(null_train.iloc[:, position].value_counts()) > 100:
            continue
        column = null_train.columns[position]
        most_frequent = train_data[column].mode()[0]
        train_data[column] = train_data[column].fillna(most_frequent)
filling_TrainDataCategoricalNullvalues()

In [None]:
def filling_TestDataCategoricalNullvalues():
    """Fill nulls in test_data with the column mode for low-cardinality columns.

    A column qualifies when its null_test counterpart has at most 100
    distinct non-null values; test_data is updated column by column.
    """
    for position in range(null_test.shape[1]):
        if len(null_test.iloc[:, position].value_counts()) > 100:
            continue
        column = null_test.columns[position]
        most_frequent = test_data[column].mode()[0]
        test_data[column] = test_data[column].fillna(most_frequent)
filling_TestDataCategoricalNullvalues()

In [None]:
print(train_data.isnull().sum().sum())
print(test_data.isnull().sum().sum())

### checking of duplicate rows and columns

In [None]:
#checking for duplicate rows
train_duplicate = train_data[train_data.duplicated()]
test_duplicate = test_data[test_data.duplicated()]
train_duplicate.head()

In [None]:
test_duplicate.head()

In [None]:
#checking for duplicate columns
def getDuplicateColumns(df):
    """Return the labels of columns whose contents repeat an earlier column.

    Every pair of columns is compared by position with ``Series.equals``; for
    each equal pair the label of the later column is collected. The result is
    a list built from a set, so each label appears once and the order is not
    defined.
    """
    n_cols = df.shape[1]
    labels = df.columns.values
    # Pairwise comparison: left column against every column to its right.
    repeated = {
        labels[right]
        for left in range(n_cols)
        for right in range(left + 1, n_cols)
        if df.iloc[:, left].equals(df.iloc[:, right])
    }
    return list(repeated)
getDuplicateColumns(train_data)

In [None]:
getDuplicateColumns(test_data)

### Encoding categorical values
### For categorical data we consider two kinds of encoding: ordinal encoding and one-hot encoding. The encoding is chosen by looking at the data. If a categorical feature has values that are related and ranked, we perform ordinal encoding. If the categories show no rank order among themselves, one-hot encoding can be used. When using one-hot encoding, the dummy variable trap is taken care of.

In [None]:
def value_count():
    """Print each train_categorical column label followed by its distinct-value count."""
    for position in range(train_categorical.shape[1]):
        print(train_categorical.columns[[position]])
        n_distinct = len(train_categorical.iloc[:, position].value_counts())
        print(n_distinct)
        print('\n')
            
value_count()

In [None]:
def value_count():
    """Print each test_categorical column label followed by its distinct-value count."""
    for position in range(test_categorical.shape[1]):
        print(test_categorical.columns[[position]])
        n_distinct = len(test_categorical.iloc[:, position].value_counts())
        print(n_distinct)
        print('\n')
            
value_count()

In [None]:
print(train_data['NAME_CONTRACT_TYPE'].unique())
CT = ['Cash loans', 'Revolving loans']

In [None]:
print(train_data['CODE_GENDER'].unique())
print(test_data['CODE_GENDER'].unique())
train_CG = ['M', 'F' ,'XNA']
test_CG = ['M', 'F']

In [None]:
print(train_data['FLAG_OWN_CAR'].unique())
C = ['N', 'Y']

In [None]:
print(train_data['FLAG_OWN_REALTY'].unique())
R = ['N', 'Y']

In [None]:
train_data['NAME_TYPE_SUITE'].unique()

In [None]:
print(train_data['NAME_INCOME_TYPE'].unique())
print(test_data['NAME_INCOME_TYPE'].unique())
train_IT = ['Unemployed', 'Student', 'Pensioner','Working','Commercial associate','State servant','Businessman','Maternity leave' ]
test_IT = ['Unemployed', 'Student', 'Pensioner','Working','Commercial associate','State servant','Businessman' ]

In [None]:
print(train_data['NAME_EDUCATION_TYPE'].unique())
ET = ['Lower secondary','Secondary / secondary special','Incomplete higher','Higher education','Academic degree']

In [None]:
print(train_data['NAME_FAMILY_STATUS'].unique())
print(test_data['NAME_FAMILY_STATUS'].unique())
train_FS = ['Widow','Separated','Single / not married','Married','Civil marriage','Unknown',]
test_FS = ['Widow','Separated','Single / not married','Married','Civil marriage']

In [None]:
train_data['NAME_HOUSING_TYPE'].unique()

In [None]:
train_data['OCCUPATION_TYPE'].unique()

In [None]:
train_data['WEEKDAY_APPR_PROCESS_START'].unique()

In [None]:
train_data['ORGANIZATION_TYPE'].unique()

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
OE_train = OrdinalEncoder(categories = [CT,train_CG, C, R, train_IT, ET, train_FS ])

In [None]:
OE_train.fit(train_data[['NAME_CONTRACT_TYPE','CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY',\
                         'NAME_INCOME_TYPE','NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS']])

In [None]:
train_data[['NAME_CONTRACT_TYPE','CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY','NAME_INCOME_TYPE',\
            'NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS']] =\
OE_train.transform(train_data[['NAME_CONTRACT_TYPE','CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY',\
                              'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS']])

In [None]:
OE_test = OrdinalEncoder(categories = [CT,test_CG, C, R, test_IT, ET, test_FS ])

In [None]:
OE_test.fit(test_data[['NAME_CONTRACT_TYPE','CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY',\
                         'NAME_INCOME_TYPE','NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS']])

In [None]:
test_data[['NAME_CONTRACT_TYPE','CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY',\
                         'NAME_INCOME_TYPE','NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS']] =\
OE_test.transform(test_data[['NAME_CONTRACT_TYPE','CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY',\
                         'NAME_INCOME_TYPE','NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS']])

In [None]:
test_data[['NAME_CONTRACT_TYPE','CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY',\
                         'NAME_INCOME_TYPE','NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS']].head()

In [None]:
train_data[['NAME_CONTRACT_TYPE','CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY',\
                         'NAME_INCOME_TYPE','NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS']].head()

In [None]:
train_data.shape

In [None]:
test_data.shape

In [None]:
#Applying one hot encoding.
train_data = pd.get_dummies(train_data , drop_first = True)
test_data = pd.get_dummies(test_data , drop_first = True)
#Encoding the two frames separately can give them different dummy columns (a
#category present in only one of them) or a different column order, which
#would break the scaler and model fitted on the training columns. Give
#test_data exactly the training feature columns: dummies missing from test are
#added as 0 and dummies unknown to train are dropped.
test_data = test_data.reindex(columns = train_data.columns.drop('TARGET', errors = 'ignore'), fill_value = 0)

In [None]:
train_data.head()

In [None]:
train_data.shape

In [None]:
test_data.head()

In [None]:
test_data.shape

In [None]:
y = train_data['TARGET']

In [None]:
y.value_counts().plot.bar()

### Splitting our dataset

In [None]:

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(train_data.drop(['TARGET'], axis = 1),\
                                            train_data['TARGET'],test_size = 0.2,random_state = 42)

In [None]:
x_train.head()

### Balancing Training Data

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Seed the oversampler so the synthetic minority samples, and every score
# computed from them, are reproducible (same seed as the train/test split).
smote = SMOTE(random_state = 42)

In [None]:
xtrain_smote , ytrain_smote = smote.fit_resample(x_train, y_train)

In [None]:
from collections import Counter

In [None]:
print("Before somte: " , Counter(y_train))
print("After smote: ", Counter(ytrain_smote))

### Applying Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
sc = StandardScaler()

In [None]:
xtrain_smote1 = pd.DataFrame(sc.fit_transform(xtrain_smote))

In [None]:
xtrain_smote1.columns = xtrain_smote.columns.values
xtrain_smote1.index = xtrain_smote.index.values

In [None]:
xtrain_smote = xtrain_smote1

In [None]:
xtrain_smote.head()

In [None]:
x_test1 = pd.DataFrame(sc.transform(x_test))

In [None]:
x_test1.columns = x_test.columns.values
x_test1.index = x_test.index.values

In [None]:
x_test = x_test1

In [None]:
x_test.head()

In [None]:
test_data1 = pd.DataFrame(sc.transform(test_data))

In [None]:
test_data1.columns = test_data.columns.values
test_data1.index = test_data.index.values

In [None]:
test_data = test_data1

In [None]:
test_data1.head()

### Model Building

In [None]:
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning,
                        module="sklearn")

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lr_c = LogisticRegression()

In [None]:
lr_c.fit(xtrain_smote, ytrain_smote)

In [None]:
y_prd = lr_c.predict(x_test)

In [None]:
#evaluating results

from sklearn.metrics import confusion_matrix , accuracy_score, f1_score,\
precision_score, recall_score

In [None]:
cm = confusion_matrix(y_test, y_prd)
cm

In [None]:
print(accuracy_score(y_test, y_prd))

In [None]:
print(f1_score(y_test, y_prd))
print(precision_score(y_test, y_prd))
print(recall_score(y_test, y_prd))

In [None]:
df_cm = pd.DataFrame(cm , index = (0,1), columns = (0,1))
plt.figure(figsize=(10,7))
sns.set(font_scale = 1.4)
sns.heatmap(df_cm , annot = True , fmt = 'g')
print("test_data accuracy: %0.4f" % accuracy_score(y_test , y_prd))

### Applying K_Fold cross validation

In [None]:

from sklearn.model_selection import cross_val_score

In [None]:
accuracies = cross_val_score(estimator = lr_c , 
                             X = xtrain_smote, 
                             y = ytrain_smote,
                             cv = 3)

In [None]:
accuracies

In [None]:
accuracies.mean()

#### Feature selection
### Information gain - mutual information in classification

In [None]:
from sklearn.feature_selection import mutual_info_classif

In [None]:
#checking the memory usage and other information about data
print(xtrain_smote.info())
print(xtrain_smote.dtypes.unique())

In [None]:
%%time
mutual_info = mutual_info_classif(xtrain_smote , ytrain_smote)

In [None]:
mutual_info = pd.Series(mutual_info)
mutual_info.index = xtrain_smote.columns
mutual_info.sort_values(ascending = False).head()

In [None]:
from sklearn.feature_selection import SelectKBest

In [None]:
select_hundred_columns = SelectKBest(mutual_info_classif, k = 100)

In [None]:
%%time
select_hundred_columns.fit(xtrain_smote, ytrain_smote)

In [None]:
xtrain_smote[xtrain_smote.columns[select_hundred_columns.get_support()]].head()

In [None]:
%%time
lr_c.fit(xtrain_smote[xtrain_smote.columns[select_hundred_columns.get_support()]],ytrain_smote)

In [None]:
y_prediction2 = lr_c.predict(x_test[x_test.columns[select_hundred_columns.get_support()]])

In [None]:
accuracy_score(y_prediction2,y_test)

### Hyperparameter optimization 
### Using GridSearchCv

In [None]:
params = lr_c.get_params()
params

In [None]:
# lbfgs only supports the l2 penalty, so l1 is searched with liblinear alone;
# listing every penalty/solver pair in a single grid would schedule fits that
# cannot succeed.
param_grid = [
    {'penalty' : ['l2'],
    'C' : [1,2],
    'solver' : ['lbfgs','liblinear'],
    'max_iter' : [100,200]
    },
    {'penalty' : ['l1'],
    'C' : [1,2],
    'solver' : ['liblinear'],
    'max_iter' : [100,200]
    }
]

### Here I have considered only a few parameter values, just to reduce the very long time GridSearchCV takes to perform the tuning. I have only shown how the optimization works, focusing less on the result.

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
clf = GridSearchCV(lr_c, param_grid = param_grid, cv = 2, verbose=True, n_jobs=-1)

In [None]:
%%time
best_clf = clf.fit(xtrain_smote[xtrain_smote.columns[select_hundred_columns.get_support()]],ytrain_smote)

In [None]:
best_clf.best_estimator_

### Building our model with the best parameter values

In [None]:
# Build the final model from the hyperparameters GridSearchCV actually
# selected instead of a hand-copied value.
classification = LogisticRegression(**best_clf.best_params_)

In [None]:
classification.fit(xtrain_smote[xtrain_smote.columns[select_hundred_columns.get_support()]],ytrain_smote)

In [None]:
prd = classification.predict(x_test[x_test.columns[select_hundred_columns.get_support()]])

In [None]:
accuracy_score(prd, y_test)

### Applying our test data values on the final model.

In [None]:
testprd = classification.predict(test_data[test_data.columns[select_hundred_columns.get_support()]])

In [None]:
test_prd = pd.DataFrame(testprd)

In [None]:
test_prd.value_counts()