### Importing Data

In [1]:
import os, sys
import joblib
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt 
import plotly.graph_objects as go 

In [9]:
sys.path.append(os.path.abspath(os.path.join('../scripts')))
from explore_data import Data_Preprocessing
from visualize_data import *
from outlier_handler import OutlierHandler 

In [3]:
df = pd.read_csv('../data/cleaned_auction_data.csv')

In [4]:
df.columns

Index(['Unnamed: 0', 'ID', 'Gender', 'Occupation Status', 'Age', 'Education',
       'Marital_Status', 'Dependants', 'State', 'Salary', 'State_Code',
       'Current Loan', 'Tenure', 'Previous Loans', 'Defaulted', 'Default Dur'],
      dtype='object')

In [5]:
df.drop(columns= ['Unnamed: 0'], inplace= True)
df

Unnamed: 0,ID,Gender,Occupation Status,Age,Education,Marital_Status,Dependants,State,Salary,State_Code,Current Loan,Tenure,Previous Loans,Defaulted,Default Dur
0,7a8a26d5997f4f41ab3f0a0ad5c0e108,Male,Employed,25,No Graduate,Not Married,0,Kogi,24261,3,False,0,0,False,0
1,1535bf7047f24618bf74e67227e01d6d,Male,Self Employed,38,Graduate,Not Married,1,Lagos,132213,5,False,0,0,False,0
2,1d45e8e64ebc4cbf8424b721e7f95ffb,Female,Unemployed,47,Graduate,Not Married,1,Delta,5455,2,False,0,0,False,0
3,21c5a288f6b5449d868a682633f6f6fb,Female,Employed,33,Graduate,Married,0,Abuja,385379,5,True,3,2,True,1
4,ece1cac269c74614b82f1a3cec7cb399,Female,Employed,29,No Graduate,Married,4,Kogi,24261,3,False,0,0,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,5728dbdfff1844a79ed1f9361f5d79f8,Male,Self Employed,28,Graduate,Married,0,Abuja,132213,5,False,0,0,False,0
99996,1ea35fd1e6714ecabba1d880e6270c52,Male,Self Employed,25,Graduate,Married,5,Abia,38252,3,False,0,0,False,0
99997,ea1591fe5b3849f886c1a9ad80ef5cea,Male,Self Employed,40,No Graduate,Not Married,0,Edo,38252,3,True,2,2,False,0
99998,ef0502bc88874d438afb8bbffd46078e,Male,Employed,38,No Graduate,Not Married,1,Abuja,385379,5,True,4,2,True,1


### Visualize Data

In [6]:
plotly_plot_hist(df, 'Gender')

In [7]:
plotly_plot_hist(df, 'Education')

In [8]:
plotly_plot_hist(df, 'Occupation Status')

In [7]:
df

Unnamed: 0.1,Unnamed: 0,ID,Gender,Occupation Status,Age,Education,Marital_Status,Dependants,State,Salary,State_Code,Current Loan,Tenure,Previous Loans,Defaulted,Default Dur,newscore,Status
0,0,7a8a26d5997f4f41ab3f0a0ad5c0e108,0,1,25,0,0,0,Kogi,24261.0,3,0,0,0,0,0,101.8962,0
1,1,1535bf7047f24618bf74e67227e01d6d,0,2,38,1,0,1,Lagos,132213.0,5,0,0,0,0,0,555.2946,1
2,2,1d45e8e64ebc4cbf8424b721e7f95ffb,1,0,47,1,0,1,Delta,5455.0,2,0,0,0,0,0,22.9110,0
3,3,21c5a288f6b5449d868a682633f6f6fb,1,1,33,1,1,0,Abuja,273154.5,5,1,3,2,1,1,1147.2489,1
4,4,ece1cac269c74614b82f1a3cec7cb399,1,1,29,0,1,4,Kogi,24261.0,3,0,0,0,0,0,101.8962,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99995,5728dbdfff1844a79ed1f9361f5d79f8,0,2,28,1,1,0,Abuja,132213.0,5,0,0,0,0,0,555.2946,1
99996,99996,1ea35fd1e6714ecabba1d880e6270c52,0,2,25,1,1,5,Abia,38252.0,3,0,0,0,0,0,160.6584,0
99997,99997,ea1591fe5b3849f886c1a9ad80ef5cea,0,2,40,0,0,0,Edo,38252.0,3,1,2,2,0,0,160.6584,0
99998,99998,ef0502bc88874d438afb8bbffd46078e,0,1,38,0,0,1,Abuja,273154.5,5,1,4,2,1,1,1147.2489,1


### Oulier Handler

### There are variables that need to be converted to categories:

Sex:
    0 - Male
    1 - Female

Occupation:
    0 - Unemplyed 
    1 - Employed
    2 - Self Employed

Marrital Status: 
    0 - Not Married
    1 - Married

Education: 
    0 - Not Graduate
    1 - Graduate

Current Loan: 
    0 - False
    1 - True

Defaulted: 
    0 - False
    1 - True


In [10]:
datapreprocessing = Data_Preprocessing(df)

In [11]:
datapreprocessing.replace_cateorical_data()

Unnamed: 0,ID,Gender,Occupation Status,Age,Education,Marital_Status,Dependants,State,Salary,State_Code,Current Loan,Tenure,Previous Loans,Defaulted,Default Dur
0,7a8a26d5997f4f41ab3f0a0ad5c0e108,0,1,25,0,0,0,Kogi,24261,3,0,0,0,0,0
1,1535bf7047f24618bf74e67227e01d6d,0,2,38,1,0,1,Lagos,132213,5,0,0,0,0,0
2,1d45e8e64ebc4cbf8424b721e7f95ffb,1,0,47,1,0,1,Delta,5455,2,0,0,0,0,0
3,21c5a288f6b5449d868a682633f6f6fb,1,1,33,1,1,0,Abuja,385379,5,1,3,2,1,1
4,ece1cac269c74614b82f1a3cec7cb399,1,1,29,0,1,4,Kogi,24261,3,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,5728dbdfff1844a79ed1f9361f5d79f8,0,2,28,1,1,0,Abuja,132213,5,0,0,0,0,0
99996,1ea35fd1e6714ecabba1d880e6270c52,0,2,25,1,1,5,Abia,38252,3,0,0,0,0,0
99997,ea1591fe5b3849f886c1a9ad80ef5cea,0,2,40,0,0,0,Edo,38252,3,1,2,2,0,0
99998,ef0502bc88874d438afb8bbffd46078e,0,1,38,0,0,1,Abuja,385379,5,1,4,2,1,1


### Creating a Credit Score

Using our features such as Occupation, Educatioon, Marital Status, Dependants, State_Code,Current_Loan, Defaulted and Defaulted_Dur we can create a credit score for each user and set a benchmark for our target variable.

In [12]:
datapreprocessing.get_score()

Unnamed: 0,ID,Gender,Occupation Status,Age,Education,Marital_Status,Dependants,State,Salary,State_Code,Current Loan,Tenure,Previous Loans,Defaulted,Default Dur,newscore
0,7a8a26d5997f4f41ab3f0a0ad5c0e108,0,1,25,0,0,0,Kogi,24261,3,0,0,0,0,0,101.8962
1,1535bf7047f24618bf74e67227e01d6d,0,2,38,1,0,1,Lagos,132213,5,0,0,0,0,0,555.2946
2,1d45e8e64ebc4cbf8424b721e7f95ffb,1,0,47,1,0,1,Delta,5455,2,0,0,0,0,0,22.9110
3,21c5a288f6b5449d868a682633f6f6fb,1,1,33,1,1,0,Abuja,385379,5,1,3,2,1,1,1618.5918
4,ece1cac269c74614b82f1a3cec7cb399,1,1,29,0,1,4,Kogi,24261,3,0,0,0,0,0,101.8962
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,5728dbdfff1844a79ed1f9361f5d79f8,0,2,28,1,1,0,Abuja,132213,5,0,0,0,0,0,555.2946
99996,1ea35fd1e6714ecabba1d880e6270c52,0,2,25,1,1,5,Abia,38252,3,0,0,0,0,0,160.6584
99997,ea1591fe5b3849f886c1a9ad80ef5cea,0,2,40,0,0,0,Edo,38252,3,1,2,2,0,0,160.6584
99998,ef0502bc88874d438afb8bbffd46078e,0,1,38,0,0,1,Abuja,385379,5,1,4,2,1,1,1618.5918


In [13]:
df['newscore'].value_counts()

555.2946     23860
1618.5918    15304
397.1982     14287
160.6584      9664
854.4354      9018
43.9068       7305
101.8962      6031
417.7656      4575
53.4660       4365
83.9958       2630
29.2278       1706
22.9110       1255
Name: newscore, dtype: int64

In [14]:
df['Status'] = df['newscore'].apply(lambda x: 1 if x > 500 else 0)
df

Unnamed: 0,ID,Gender,Occupation Status,Age,Education,Marital_Status,Dependants,State,Salary,State_Code,Current Loan,Tenure,Previous Loans,Defaulted,Default Dur,newscore,Status
0,7a8a26d5997f4f41ab3f0a0ad5c0e108,0,1,25,0,0,0,Kogi,24261,3,0,0,0,0,0,101.8962,0
1,1535bf7047f24618bf74e67227e01d6d,0,2,38,1,0,1,Lagos,132213,5,0,0,0,0,0,555.2946,1
2,1d45e8e64ebc4cbf8424b721e7f95ffb,1,0,47,1,0,1,Delta,5455,2,0,0,0,0,0,22.9110,0
3,21c5a288f6b5449d868a682633f6f6fb,1,1,33,1,1,0,Abuja,385379,5,1,3,2,1,1,1618.5918,1
4,ece1cac269c74614b82f1a3cec7cb399,1,1,29,0,1,4,Kogi,24261,3,0,0,0,0,0,101.8962,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,5728dbdfff1844a79ed1f9361f5d79f8,0,2,28,1,1,0,Abuja,132213,5,0,0,0,0,0,555.2946,1
99996,1ea35fd1e6714ecabba1d880e6270c52,0,2,25,1,1,5,Abia,38252,3,0,0,0,0,0,160.6584,0
99997,ea1591fe5b3849f886c1a9ad80ef5cea,0,2,40,0,0,0,Edo,38252,3,1,2,2,0,0,160.6584,0
99998,ef0502bc88874d438afb8bbffd46078e,0,1,38,0,0,1,Abuja,385379,5,1,4,2,1,1,1618.5918,1


### Modeling 

In [96]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
from sklearn import metrics 
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [97]:
df.drop(columns = ['Default Dur', 'ID', 'State', 'newscore'],axis = 1, inplace = True)

KeyError: "['Default Dur', 'ID', 'State', 'newscore'] not found in axis"

In [None]:
X = df.iloc[:,:-1]
# y = df.iloc[:, -1:]
y = df['Status']

In [None]:
X

Unnamed: 0,Gender,Occupation Status,Age,Education,Marital_Status,Dependants,Salary,State_Code,Current Loan,Tenure,Previous Loans,Defaulted
0,0,1,25,0,0,0,24261,3,0,0,0,0
1,0,2,38,1,0,1,132213,5,0,0,0,0
2,1,0,47,1,0,1,5455,2,0,0,0,0
3,1,1,33,1,1,0,385379,5,1,3,2,1
4,1,1,29,0,1,4,24261,3,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0,2,28,1,1,0,132213,5,0,0,0,0
99996,0,2,25,1,1,5,38252,3,0,0,0,0
99997,0,2,40,0,0,0,38252,3,1,2,2,0
99998,0,1,38,0,0,1,385379,5,1,4,2,1


In [None]:
y

0        0
1        1
2        0
3        1
4        0
        ..
99995    1
99996    0
99997    0
99998    1
99999    1
Name: Status, Length: 100000, dtype: int64

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.35)

In [None]:
print(X_train.shape)
print(X_test.shape)

(65000, 12)
(35000, 12)


In [None]:
y_test

46509    0
70865    0
11085    0
83142    1
82317    0
        ..
4368     1
37014    1
11208    0
84456    1
57035    0
Name: Status, Length: 35000, dtype: int64

In [None]:
def scale_data(X_train, X_test):
    """
    Input: Features (numpy arrays)
    Output: Scaled data
    """
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled
  
X_train_scaled, X_test_scaled = scale_data(X_train, X_test)

In [108]:

def model_score(model_name, model, X_train,
                X_test, y_train, y_test, test=False):
    """
    Input: Transformed feature and target sets
    Output: If test=True, test scores
    """

    my_model = model
    my_model.fit(X_train,y_train)
    y_pred = my_model.predict(X_test)
    if test:
        print("Calculating test score...")
        print(f'{model_name} accuracy score: {my_model.score(X_test,y_test):.4}\n')
        print(f'Classification Report \n {classification_report(y_test, y_pred)}')

    return my_model


In [109]:
knn = model_score('KNN', KNeighborsClassifier(n_neighbors=5),
                        X_train_scaled, X_test_scaled,
                        y_train, y_test,test=True)

lr = model_score('LogReg', LogisticRegression(penalty='none'),
                       X_train_scaled, X_test_scaled,
                       y_train, y_test,test=True)


Calculating test score...
KNN accuracy score: 0.9993

Classification Report 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     18303
           1       1.00      1.00      1.00     16697

    accuracy                           1.00     35000
   macro avg       1.00      1.00      1.00     35000
weighted avg       1.00      1.00      1.00     35000

Calculating test score...
LogReg accuracy score: 1.0

Classification Report 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     18303
           1       1.00      1.00      1.00     16697

    accuracy                           1.00     35000
   macro avg       1.00      1.00      1.00     35000
weighted avg       1.00      1.00      1.00     35000



In [110]:
gbm = model_score('XGBoost', xgb.XGBClassifier(),
                       X_train_scaled, X_test_scaled,
                       y_train, y_test,test=True)

nb = model_score('Gaussian', GaussianNB(), 
                        X_train_scaled, X_test_scaled, y_train, 
                        y_test, test = True)


Calculating test score...
XGBoost accuracy score: 1.0

Classification Report 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     18303
           1       1.00      1.00      1.00     16697

    accuracy                           1.00     35000
   macro avg       1.00      1.00      1.00     35000
weighted avg       1.00      1.00      1.00     35000

Calculating test score...
Gaussian accuracy score: 1.0

Classification Report 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     18303
           1       1.00      1.00      1.00     16697

    accuracy                           1.00     35000
   macro avg       1.00      1.00      1.00     35000
weighted avg       1.00      1.00      1.00     35000



In [111]:
svc = model_score('SVM', SVC(probability=True),
                       X_train_scaled, X_test_scaled,
                       y_train, y_test,test=True)

rf = model_score('RF', RandomForestClassifier(),
                       X_train_scaled, X_test_scaled,
                       y_train, y_test,test=True)

Calculating test score...
SVM accuracy score: 1.0

Classification Report 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     18303
           1       1.00      1.00      1.00     16697

    accuracy                           1.00     35000
   macro avg       1.00      1.00      1.00     35000
weighted avg       1.00      1.00      1.00     35000

Calculating test score...
RF accuracy score: 1.0

Classification Report 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     18303
           1       1.00      1.00      1.00     16697

    accuracy                           1.00     35000
   macro avg       1.00      1.00      1.00     35000
weighted avg       1.00      1.00      1.00     35000



### Saving Data

In [98]:
df.to_csv('../data/cleaned_auction_data.csv')

In [101]:
joblib.dump(nb, '../model/auction_model.pkl')

['../model/auction_model.pkl']