## Import libraries and Dataset

In [3]:
import logging

logger = logging.getLogger()

file_handler = logging.FileHandler(filename='project_log.log', mode='w')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

logger.setLevel(logging.DEBUG)
logger.info('PROJECT FILE STARTS RUNNING!')

In [5]:
# Importing libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from imblearn.over_sampling import SMOTE

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_curve # type: ignore

import pickle

import warnings
warnings.filterwarnings('ignore')

logger.info('LIBRARIES IMPORTED')

ModuleNotFoundError: No module named 'pandas'

In [11]:
# Importing Data
CC_DF = pd.read_csv('UCI_Credit_Card.csv')

In [12]:
logger.info('DATA IMPORTED AND SAVING INTO PANDAS DATAFRAME!')

## EDA

In [13]:
logger.info('EDA STARTED')

In [None]:
print('Number of Columns - ',len(CC_DF.columns))
print('Names of columns - ',CC_DF.columns)

There are 25(including ID) variables:

    ID: ID of each client
    LIMIT_BAL: Amount of given credit in NT dollars (includes individual and family/supplementary credit
    SEX: Gender (1=male, 2=female)
    EDUCATION: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)
    MARRIAGE: Marital status (1=married, 2=single, 3=others)
    AGE: Age in years
    PAY_0: Repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, … 8=payment delay for eight months, 9=payment delay for nine months and above)
    PAY_2: Repayment status in August, 2005 (scale same as above)
    PAY_3: Repayment status in July, 2005 (scale same as above)
    PAY_4: Repayment status in June, 2005 (scale same as above)
    PAY_5: Repayment status in May, 2005 (scale same as above)
    PAY_6: Repayment status in April, 2005 (scale same as above)
    BILL_AMT1: Amount of bill statement in September, 2005 (NT dollar)
    BILL_AMT2: Amount of bill statement in August, 2005 (NT dollar)
    BILL_AMT3: Amount of bill statement in July, 2005 (NT dollar)
    BILL_AMT4: Amount of bill statement in June, 2005 (NT dollar)
    BILL_AMT5: Amount of bill statement in May, 2005 (NT dollar)
    BILL_AMT6: Amount of bill statement in April, 2005 (NT dollar)
    PAY_AMT1: Amount of previous payment in September, 2005 (NT dollar)
    PAY_AMT2: Amount of previous payment in August, 2005 (NT dollar)
    PAY_AMT3: Amount of previous payment in July, 2005 (NT dollar)
    PAY_AMT4: Amount of previous payment in June, 2005 (NT dollar)
    PAY_AMT5: Amount of previous payment in May, 2005 (NT dollar)
    PAY_AMT6: Amount of previous payment in April, 2005 (NT dollar)
    default.payment.next.month: Default payment (1=yes, 0=no)


In [15]:
# ID is a uniqe and diffrent for customer to customr, So dropping it
CC_DF.drop('ID',axis=1,inplace=True)

In [None]:
# Lets see the basic information of the data
CC_DF.info()

        > All the 24 column contains numbers with data type int or float
        > There are 30000 cutomer data
        > There 0 missing values in the data 

In [None]:
sns.countplot(x=CC_DF['SEX'])
plt.xticks(ticks=[0,1],labels=['Male','Female'])
plt.show

> Compare to Male Female Creadit card users are more

In [None]:
sns.countplot(x=CC_DF['SEX'],hue=CC_DF['default.payment.next.month'])
plt.xticks(ticks=[0,1],labels=['Male','Female'])
plt.show

> Female are holding more credit cards then male but the defult is low compare to the male defaulters.
> Male are holding less number of card compare to female but the defaulters  rate is high

In [None]:
#MARRIAGE: Marital status (1=married, 2=single, 3=others)
CC_DF['MARRIAGE'].value_counts()

In [20]:
# As the marriage status will be married or not and there are 323+54 data which is coming under other, I will convert them to singal(2)
CC_DF['MARRIAGE']=CC_DF['MARRIAGE'].apply(lambda x : 2 if x >2 or x < 1 else x)

CC_DF['MARRIAGE'].value_counts()

In [None]:
sns.countplot(x=CC_DF['MARRIAGE'])

Now both are almost equl

In [None]:
CC_DF.head()

In [None]:
#EDUCATION: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)
CC_DF['EDUCATION'].value_counts()

In [24]:
# As 5,6,0 are unknown I am consedaring under others
CC_DF['EDUCATION']=CC_DF['EDUCATION'].apply(lambda x : 4 if x >4 or x < 1 else x)

In [None]:
CC_DF['EDUCATION'].value_counts()

In [None]:
sns.countplot(x=CC_DF['EDUCATION'],hue=CC_DF['SEX'])
plt.xticks(ticks=[0,1,2,3],labels=['graduate school','university','high school','others'])
plt.show

From the above we can under stand Females are more aeducated then males

## Handling Outliers

In [27]:
logger.info('HANDALING OUTLIERS STARTS!')

In [None]:
CC_DF.head()

#### Limit Balance Column

In [None]:
plt.figure(figsize=(15,5))
plt.subplot(1,2,2)

plt.subplot(1,2,1)
sns.distplot(CC_DF['LIMIT_BAL'])

plt.subplot(1,2,2)
sns.boxplot(CC_DF['LIMIT_BAL'])

plt.show

print('skewness - ',CC_DF['LIMIT_BAL'].skew())

### OBSERVATIONS :
        > Limit balance column is skewed to the left and skewness is near to 1.
        > In box plot we can see there are outliers to be handeled.
        > We will handel the outliers by using quantiles due to the skewed data.
        
        > As we apply quantiles we loss big number of data, So instant of removeing outliers I will capp them at upper_limit

In [None]:
percentile25=CC_DF['LIMIT_BAL'].quantile(0.25) # Calculating 25th Quantile
percentile75=CC_DF['LIMIT_BAL'].quantile(0.75) # Calculating 75th Quantile
iqr=percentile75-percentile25 # Finding Inter Quantile Range
upper_limit = percentile75 + 1.5 * iqr
lower_limit = percentile25 - 1.5 * iqr
print('25th percentile - ',percentile25,'75th percentile - ',percentile75)
print('Upper Limit - ',upper_limit,'Lower Limit - ',lower_limit)

In [31]:
CC_DF['LIMIT_BAL'] = np.where(CC_DF['LIMIT_BAL'] > upper_limit, upper_limit,
                 np.where(CC_DF['LIMIT_BAL'] < lower_limit, lower_limit,CC_DF['LIMIT_BAL']))

In [None]:
# Ploting box Plot After Outlier capping
sns.boxplot(CC_DF['LIMIT_BAL'])
plt.show

All Outliers are capped according to the limits

#### Age Column

In [None]:
plt.figure(figsize=(15,5))
plt.subplot(1,2,2)

plt.subplot(1,2,1)
sns.distplot(CC_DF['AGE'])

plt.subplot(1,2,2)
sns.boxplot(CC_DF['AGE'])

plt.show

print('skewness - ',CC_DF['AGE'].skew())

### OBSERVATIONS :
        > Age column is skewed to the left.
        > In box plot we can see there are outliers to be handeled.
        > We will handel the outliers by using quantiles due to the skewed data.
        
        > As we apply quantiles we loss big number of data, So instant of removeing outliers I will capp them at upper_limit

In [None]:
percentile25=CC_DF['AGE'].quantile(0.25) # Calculating 25th Quantile
percentile75=CC_DF['AGE'].quantile(0.75) # Calculating 75th Quantile
iqr=percentile75-percentile25 # Finding Inter Quantile Range
upper_limit = percentile75 + 1.5 * iqr
lower_limit = percentile25 - 1.5 * iqr
print('25th percentile - ',percentile25,'75th percentile - ',percentile75)
print('Upper Limit - ',upper_limit,'Lower Limit - ',lower_limit)

In [35]:
CC_DF['AGE'] = np.where(CC_DF['AGE'] > upper_limit, upper_limit,
                 np.where(CC_DF['AGE'] < lower_limit, lower_limit,CC_DF['AGE']))

In [None]:
# Ploting box Plot After Outlier capping
sns.boxplot(CC_DF['AGE'])
plt.show

In [None]:
CC_DF['LIMIT_BAL']

In [None]:
plt.figure(figsize=(15,5))
plt.subplot(1,2,2)

plt.subplot(1,2,1)
sns.distplot(CC_DF['LIMIT_BAL'])

plt.subplot(1,2,2)
sns.boxplot(CC_DF['LIMIT_BAL'])

plt.show

print('skewness - ',CC_DF['LIMIT_BAL'].skew())

In [39]:
logger.info('HANDALING OUTLIERS ENDS!')

In [None]:
plt.figure(figsize=(20,15))
sns.heatmap(CC_DF.corr(),annot=True)
plt.show

In [41]:
logger.info('EDA ENDS!')

## Handeling Imbalnced Data

In [42]:
logger.info('HANDALING IMBALENCED DATA STARTS!')

In [None]:
print('Non Defaulter count - ',CC_DF['default.payment.next.month'].value_counts()[0])
print('Non Defaulter Precent - ',((CC_DF['default.payment.next.month'].value_counts()[0])/len(CC_DF))*100,' %')
print('Defaulter count - ',CC_DF['default.payment.next.month'].value_counts()[1])
print('Defaulter Precent - ',((CC_DF['default.payment.next.month'].value_counts()[1])/len(CC_DF))*100,' %')

In [None]:
sns.countplot(x=CC_DF['default.payment.next.month'])

    > We can see it is an imbalance data set with 22% Defaulters and 78% Non-Defaulters
    > To handel this we have to use Under or Over Sampling

## Over Sampling

In [45]:
logger.info('HANDALING IMBALENCED DATA - OVER SAMPLEING STARTS!')

In [46]:
# Initialise SMOTE object
smote = SMOTE()

In [47]:
# Separate independent and dependent variables
X = CC_DF.drop('default.payment.next.month',axis=1)
y = CC_DF['default.payment.next.month']

In [48]:
# fit predictor and target variable
x_smote, y_smote = smote.fit_resample(X, y)

In [None]:
# Combine balanced X and y
CC_DF_final = pd.DataFrame(x_smote, columns=CC_DF.columns[:-1])
CC_DF_final['default'] = y_smote

CC_DF_final.head()

In [50]:
logger.info('HANDALING IMBALENCED DATA - OVER SAMPLEING ENDS!')

In [None]:
sns.countplot(x=CC_DF_final['default'])

Now data set is balanced

In [52]:
logger.info('HANDALING IMBALENCED DATA ENDS!')

## Splitting

In [53]:
logger.info('DATA PREPARING FOR MODELING STARTS!')

In [54]:
X = CC_DF_final.drop('default',axis=1)
y = CC_DF_final['default']

In [55]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)

In [None]:
X_train

### Scaling the data

In [57]:
scaler = StandardScaler()

In [58]:
X_train_scaled = scaler.fit_transform(X_train)

In [59]:
logger.info('DATA PREPARING FOR MODELING ENDS!')

## Model Building 

In [60]:
logger.info('MODELING STARTS!')

In [61]:
# Helper Function for printing Accuracy matrices, plotting Confusiuon matrix and ROC curve

def helper(model,X_train,y_train,X_test,y_test):
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    train_acc = accuracy_score(train_pred, y_train)
    test_acc = accuracy_score(test_pred, y_test)
    prec = precision_score(y_test, test_pred)
    recc = recall_score(y_test, test_pred)
    f1 = f1_score(y_test, test_pred)
    acc_matrices = {'Train accuracy':train_acc,'Test accuracy':test_acc,'Precision':prec,'Recall':recc,'F1 Score':f1}

    print('\nTraining Accuracy Score: ',train_acc)
    print('Testing Accuracy Score: ',test_acc)
    print('Precision on test data: ',prec)
    print('Recall on test data: ',recc)
    print('F1 score on test data: ',f1)
    print('\n========================================================')
    print('Classification Report on Train data')
    print(classification_report(train_pred, y_train))
    print('\n========================================================')
    print('Classification Report on Test data')
    print(classification_report(test_pred, y_test))

    print('\n========================================================')
    # Plotting Confusion Matrix and ROC curve
    f,ax =  plt.subplots(1,2,figsize=(14,6))
    #plt.figure(figsize=(6,4))
    ConfMatrix = confusion_matrix(test_pred, y_test)
    sns.heatmap(ConfMatrix,annot=True, cmap='YlGnBu', fmt="d", 
                xticklabels = ['Non-default', 'Default'], 
                yticklabels = ['Non-default', 'Default'],linewidths=.5,ax = ax[0])
    ax[0].set_ylabel('True label')
    ax[0].set_xlabel('Predicted label')
    ax[0].set_title('Confusion Matrix')

    global fpr,tpr,thresholds
    fpr,tpr,thresholds = roc_curve(test_pred, y_test)
    ax[1].plot(fpr,tpr,color = 'r')
    ax[1].plot(fpr,fpr,color = 'green')
    ax[1].set_ylabel('TPR')
    ax[1].set_xlabel('FPR')
    ax[1].set_title('ROC Curve')
    plt.show()
    return acc_matrices

### 1. Logistic Regression

In [62]:
logger.info('MODEL - LOGISTIC REGRESSION STARTS!')

In [63]:
LogR = LogisticRegression()

In [None]:
LogR.fit(X_train_scaled,y_train)

In [65]:
X_test_scaled = scaler.transform(X_test)

In [None]:
helper(LogR,X_train_scaled,y_train,X_test_scaled, y_test)

In [67]:
logger.info('MODEL - LOGISTIC REGRESSION ENDS!')

### 2. Decicion Tree

In [68]:
logger.info('MODEL - DECICION TREE STARTS!')

In [69]:
D_tree = DecisionTreeClassifier()

In [None]:
D_tree.fit(X_train_scaled,y_train)

In [None]:
helper(D_tree,X_train_scaled,y_train,X_test_scaled,y_test)

In [72]:
logger.info('MODEL - DECICION TREE ENDS!')

### 3. Random Forest

In [73]:
logger.info('MODEL - RANDOM FOREST STARTS!')

In [74]:
RF = RandomForestClassifier()

In [None]:
RF.fit(X_train_scaled,y_train)

In [None]:
helper(RF,X_train_scaled,y_train,X_test_scaled,y_test)

In [77]:
logger.info('MODEL - RANDOM FOREST ENDS!')

### 4. Support Vector Machine

In [78]:
logger.info('MODEL - SUPPORT VECTOR MACHINE STARTS!')

In [79]:
svm = SVC()

In [None]:
svm.fit(X_train_scaled,y_train)

In [None]:
helper(svm,X_train_scaled,y_train,X_test_scaled,y_test)

In [82]:
logger.info('MODEL - SUPPORT VECTOR MACHINE ENDS!')

In [83]:
model={'Logistic Regression':{'Train accuracy': 0.7238201221252069,
 'Test accuracy': 0.714261256634138,
 'Precision': 0.7172472387425658,
 'Recall': 0.716030534351145,
 'F1 Score': 0.7166383701188456},'Decicion Tree':{'Train accuracy': 0.9996290589510929,
 'Test accuracy': 0.7574045540147235,
 'Precision': 0.7573566504119724,
 'Recall': 0.7640373197625105,
 'F1 Score': 0.7606823171761525},'Random Forest':{'Train accuracy': 0.9995719910974148,
 'Test accuracy': 0.8413799007019346,
 'Precision': 0.8594806118818926,
 'Recall': 0.8196776929601357,
 'F1 Score': 0.8391074064426501},'SVC':{'Train accuracy': 0.7722136620441705,
 'Test accuracy': 0.7546652970381784,
 'Precision': 0.7730304669190553,
 'Recall': 0.7273960983884648,
 'F1 Score': 0.7495193148051039}}

In [84]:
model=pd.DataFrame(model)

In [None]:
model

    All model Accuracy was good but Decicion Tree and Random Forest was to good
    Decicion Tree vs Random Frorest
        > Both training accuracy was good but Random Forest test accuracy was also high
        > F1 Score is also high for Random Forest 
        > So i will go with Random Forest

In [86]:
logger.info('MODELING ENDS!')

## Feature Engineering

In [87]:
logger.info('FEATURE ENGINEERING STARTS!')

In [88]:
CC_DF_FE = CC_DF_final.copy()

In [None]:
CC_DF_FE

In [None]:
CC_DF_FE.columns

#### AVG_BILL_AMT (creating new column)

In [91]:
CC_DF_FE['AVG_BILL_AMT']=(CC_DF_FE['BILL_AMT1']+CC_DF_FE['BILL_AMT2']+CC_DF_FE['BILL_AMT3']+CC_DF_FE['BILL_AMT4']+CC_DF_FE['BILL_AMT5']+CC_DF_FE['BILL_AMT6']/6)

#### AVG_PAY_AMT (creating new column)

In [92]:
CC_DF_FE['AVG_PAY_AMT']=(CC_DF_FE['PAY_0']+CC_DF_FE['PAY_2']+CC_DF_FE['PAY_3']+CC_DF_FE['PAY_4']+CC_DF_FE['PAY_5']+CC_DF_FE['PAY_6']/6)

In [None]:
CC_DF_FE.tail()

In [94]:
X=CC_DF_FE.drop('default',axis=1)
y=CC_DF_FE['default']

In [95]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)

In [96]:
X_train_scaled=scaler.fit_transform(X_train)

In [97]:
X_test_scaled=scaler.transform(X_test)

In [None]:
RF.fit(X_train_scaled,y_train)

In [None]:
helper(RF,X_train_scaled,y_train,X_test_scaled,y_test)

In [100]:
model['Random Forest with AVG_BILL']=[0.9996290589510929,0.8415511042629686,0.857646229739253,0.8235493148367451,0.8402520065590748]
model['Random Forest with AVG_PAY']=[0.9996861268047709,0.837527820578668,0.8543620598124225,0.8178892088768422,0.8357278864462524]

In [101]:
logger.info('FEATURE ENGINEERING ENDS!')

In [None]:
model.apply(lambda x:x*100)

    Feature Enginering
        > After creating new column Average Bill Amount it healped model F1 score.
        > Average Pay Amount was not helpful it actule draged down the F1 Score to 83.5%
        > So I will continue with AVG_BILL and Drop AVG_PAY column

# Final model training with Random Forest and new column AVG_BILL

In [103]:
logger.info('FINAL MODEL BUILDING STARTS!')

In [None]:
CC_DF_final.head()

In [105]:
CC_DF_final['AVG_BILL_AMT']=(CC_DF_final['BILL_AMT1']+CC_DF_final['BILL_AMT2']+CC_DF_final['BILL_AMT3']+CC_DF_final['BILL_AMT4']+CC_DF_final['BILL_AMT5']+CC_DF_final['BILL_AMT6']/6)

In [None]:
CC_DF_final.tail()

In [107]:
X=CC_DF_final.drop('default',axis=1)
y=CC_DF_final['default']

In [108]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)

In [109]:
X_train_scaled=scaler.fit_transform(X_train)

In [110]:
X_test_scaled=scaler.transform(X_test)

In [111]:
RF_F=RandomForestClassifier()

In [None]:
RF_F.fit(X_train_scaled,y_train)

In [None]:
helper(RF_F,X_train_scaled,y_train,X_test_scaled,y_test)

In [114]:
fin_m=({'Scores':{'Train accuracy': 0.9996290589510929,
 'Test accuracy': 0.8425783256291731,
 'Precision': 0.859192348565356,
 'Recall': 0.8229007633587786,
 'F1 Score': 0.8406550558877048}})
final_model= pd.DataFrame(fin_m)

In [None]:
final_model

In [116]:
logger.info(final_model)

In [117]:
logger.info('FINAL MODEL BUILDING ENDS!')

## Saving the file

In [118]:
logger.info('SAVING THE MODEL...')

In [119]:
pickle.dump(RF_F, open('ccdp.pkl', 'wb'))

logger.info('MODEL SAVED!')

In [120]:
pickle.dump(scaler, open('scaler.pkl', 'wb'))

#### Compress the model.pkl, to easy the process of deployement

In [123]:
import bz2 as bz2

In [124]:
def compressed_pickle(title, data):
  with bz2.BZ2File(title + '.pbz2', 'w') as f:
    pickle.dump(data, f)

In [125]:
compressed_pickle('ccdp', RF_F)

In [126]:
logger.info('PROJECT RUNNING ENDS!')