In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, MinMaxScaler, RobustScaler
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from pandas import DataFrame

!pip install rgf_python
from rgf.sklearn import RGFClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
!pip install catboost
from catboost import CatBoostClassifier

Collecting rgf_python
[?25l  Downloading https://files.pythonhosted.org/packages/7d/b5/9ba527ce20a1a5a6c279318be856aa4996e0d1b59035d466173198ffd468/rgf_python-3.9.0-py2.py3-none-manylinux1_x86_64.whl (757kB)
[K     |████████████████████████████████| 768kB 8.6MB/s 
Installing collected packages: rgf-python
Successfully installed rgf-python-3.9.0
Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/7e/c1/c1c4707013f9e2f8a96899dd3a87f66c9167d6d776a6dc8fe7ec8678d446/catboost-0.24.3-cp36-none-manylinux1_x86_64.whl (66.3MB)
[K     |████████████████████████████████| 66.3MB 54kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.3


In [2]:
train= pd.read_csv("/content/drive/MyDrive/Zindi Folder/Train.csv")
test= pd.read_csv("/content/drive/MyDrive/Zindi Folder/Test.csv")
sample= pd.read_csv("/content/drive/MyDrive/Zindi Folder/SampleSubmission.csv")

# Filling Missing Values

In [3]:
train['Gender']= train['Gender'].fillna('other')
test['Gender']= test['Gender'].fillna('other')

In [4]:
train['Car_Category']= train['Car_Category'].fillna('Saloon')
test['Car_Category']= test['Car_Category'].fillna('Saloon')

In [5]:
train['Subject_Car_Colour']= train['Subject_Car_Colour'].fillna('Black')
test['Subject_Car_Colour']= test['Subject_Car_Colour'].fillna('Black')

In [6]:
train['Subject_Car_Make']= train['Subject_Car_Make'].fillna('Toyota')
test['Subject_Car_Make']= test['Subject_Car_Make'].fillna('Toyota')

In [7]:
train['LGA_Name']= train['LGA_Name'].fillna('Victoria Island')
test['LGA_Name']= test['LGA_Name'].fillna('Victoria Island')

In [8]:
train['State']= train['State'].fillna('Lagos')
test['State']= test['State'].fillna('Lagos')

In [9]:
train.isnull().sum()

ID                        0
Policy Start Date         0
Policy End Date           0
Gender                    0
Age                       0
First Transaction Date    0
No_Pol                    0
Car_Category              0
Subject_Car_Colour        0
Subject_Car_Make          0
LGA_Name                  0
State                     0
ProductName               0
target                    0
dtype: int64

# Datetime Features

In [10]:
from datetime import datetime

In [11]:
train["Policy Start Date"]= pd.to_datetime(train["Policy Start Date"])
train["Policy End Date"]= pd.to_datetime(train["Policy End Date"])
train["First Transaction Date"]= pd.to_datetime(train["First Transaction Date"])

In [12]:
test["Policy Start Date"]= pd.to_datetime(test["Policy Start Date"])
test["Policy End Date"]= pd.to_datetime(test["Policy End Date"])
test["First Transaction Date"]= pd.to_datetime(test["First Transaction Date"])

In [13]:
train["PSD"]= [datetime.timestamp(i) for i in train["Policy Start Date"]]
train["PED"]= [datetime.timestamp(i) for i in train["Policy End Date"]]
train["FTD"]= [datetime.timestamp(i) for i in train["First Transaction Date"]]

In [14]:
test["PSD"]= [datetime.timestamp(i) for i in test["Policy Start Date"]]
test["PED"]= [datetime.timestamp(i) for i in test["Policy End Date"]]
test["FTD"]= [datetime.timestamp(i) for i in test["First Transaction Date"]]

# Scaling Age Column

In [15]:
data= train['Age']
rs= RobustScaler()
train['Age']= DataFrame(rs.fit_transform(data.values.reshape(-1,1)))

In [16]:
dat= test['Age']
test['Age']= DataFrame(rs.transform(dat.values.reshape(-1,1)))

# Gender Column EDA

In [17]:
#train['Gender']= train['Gender'].replace(['Entity': 'other', 'Joint Gender': 'other', 'NOT STATED': 'other', 'SEX': 'other', 'NO GENDER': 'other']) 

# Categorical Data Conversion

In [18]:
cat= ('Gender', 'Car_Category', 'Subject_Car_Colour',
       'Subject_Car_Make', 'LGA_Name', 'State', 'ProductName')
for x in cat:
  train[x]= train[x].astype(str)

In [19]:
cats= ( 'Gender','Car_Category', 'Subject_Car_Colour',
       'Subject_Car_Make', 'LGA_Name', 'State', 'ProductName')
for y in cats:
  test[y]= test[y].astype(str)

In [20]:
col= ( 'Gender','Car_Category', 'Subject_Car_Colour',
       'Subject_Car_Make', 'LGA_Name', 'State', 'ProductName')
for x in col:
  le= LabelEncoder()
  train[x]= le.fit_transform(train[x].values)

In [21]:
cols= ( 'Gender', 'Car_Category', 'Subject_Car_Colour',
       'Subject_Car_Make', 'LGA_Name', 'State', 'ProductName')
for y in cols:
  le= LabelEncoder()
  test[y]= le.fit_transform(test[y].values)

In [22]:
train.dtypes

ID                                object
Policy Start Date         datetime64[ns]
Policy End Date           datetime64[ns]
Gender                             int64
Age                              float64
First Transaction Date    datetime64[ns]
No_Pol                             int64
Car_Category                       int64
Subject_Car_Colour                 int64
Subject_Car_Make                   int64
LGA_Name                           int64
State                              int64
ProductName                        int64
target                             int64
PSD                              float64
PED                              float64
FTD                              float64
dtype: object

# Dealing With Imbalance Dataset Using Up Sample

In [23]:
train['target'].value_counts()

0    10624
1     1455
Name: target, dtype: int64

In [24]:
tr_majority = train[train.target==0]
tr_minority = train[train.target==1]

In [25]:
from sklearn.utils import resample

In [26]:
tr_minority_upsampled = resample(tr_minority, 
                                 replace=True,     
                                 n_samples=10624,  
                                 random_state=123)

tr_upsampled = pd.concat([tr_majority, tr_minority_upsampled])

tr_upsampled.target.value_counts()

1    10624
0    10624
Name: target, dtype: int64

In [27]:
tr_upsampled.head()

Unnamed: 0,ID,Policy Start Date,Policy End Date,Gender,Age,First Transaction Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,target,PSD,PED,FTD
0,0040R73,2010-05-14,2011-05-13,3,-0.733333,2010-05-14,1,8,4,67,247,73,1,0,1273795000.0,1305245000.0,1273795000.0
2,005QMC3,2010-03-21,2011-03-20,3,0.133333,2010-03-21,1,8,32,67,247,73,1,0,1269130000.0,1300579000.0,1269130000.0
3,0079OHW,2010-08-21,2011-08-20,3,-2.6,2010-08-21,1,8,4,69,247,73,5,0,1282349000.0,1313798000.0,1282349000.0
5,00D3EF6,2010-10-21,2011-10-20,3,-0.266667,2010-10-21,2,8,4,69,247,73,1,0,1287619000.0,1319069000.0,1287619000.0
6,00HHZ8Y,2010-08-02,2011-08-01,1,-0.066667,2010-08-02,3,8,4,69,115,73,0,0,1280707000.0,1312157000.0,1280707000.0


In [28]:
test.head()

Unnamed: 0,ID,Policy Start Date,Policy End Date,Gender,Age,First Transaction Date,No_Pol,Car_Category,Subject_Car_Colour,Subject_Car_Make,LGA_Name,State,ProductName,PSD,PED,FTD
0,009D84L,2010-04-24,2011-03-27,4,-1.066667,2010-04-24,1,7,0,22,185,52,0,1272067000.0,1301184000.0,1272067000.0
1,01DO2EQ,2010-01-01,2010-12-31,7,5.266667,2010-01-01,4,7,0,49,185,52,8,1262304000.0,1293754000.0,1262304000.0
2,01QM0NU,2010-10-23,2011-10-22,1,0.333333,2010-10-23,1,7,4,13,11,6,1,1287792000.0,1319242000.0,1287792000.0
3,024NJLZ,2010-10-14,2011-10-13,3,-0.6,2010-10-14,1,7,4,50,113,14,1,1287014000.0,1318464000.0,1287014000.0
4,02BYET3,2010-09-16,2010-12-31,7,5.266667,2010-09-16,4,7,4,49,185,52,8,1284595000.0,1293754000.0,1284595000.0


In [29]:
y = tr_upsampled.target
X = tr_upsampled.drop(['target', 'ID', 'Policy Start Date', 'First Transaction Date', 'Policy End Date'], axis=1)

In [30]:
test= test.drop(['ID', 'Policy Start Date', 'First Transaction Date', 'Policy End Date'], axis= 1)

In [31]:
train_x, val_x, train_y, val_y= train_test_split(X,y, test_size= 0.4, random_state= 999)

In [32]:
#rfc= RandomForestClassifier(n_estimators= 500)
rf= RandomForestClassifier(n_estimators= 300)
#rfm= RGFClassifier()
rf.fit(train_x, train_y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [34]:
threshold = 0.4

predicted_proba = rf.predict_proba(val_x)
predicted = (predicted_proba [:,1] >= threshold).astype('int')

accuracy = f1_score(val_y, predicted)
print(accuracy)

0.9149824253075571


In [35]:
pred= rf.predict(val_x)
print('Score', f1_score(val_y, pred))

Score 0.9359783588818756


In [37]:
print(classification_report(val_y, predicted))

              precision    recall  f1-score   support

           0       0.99      0.83      0.90      4307
           1       0.85      0.99      0.91      4193

    accuracy                           0.91      8500
   macro avg       0.92      0.91      0.91      8500
weighted avg       0.92      0.91      0.91      8500



In [38]:
rf.fit(X,y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [39]:
predicted_pro = rf.predict_proba(test)
predictin = (predicted_pro [:,1] >= threshold).astype('int')


In [40]:
sample['target']= predictin
sample.head()

Unnamed: 0,ID,target
0,009D84L,1
1,01DO2EQ,1
2,01QM0NU,0
3,024NJLZ,0
4,02BYET3,1


In [41]:
sample.to_csv('ikmsr.csv', index = False)

In [42]:
sample.head()

Unnamed: 0,ID,target
0,009D84L,1
1,01DO2EQ,1
2,01QM0NU,0
3,024NJLZ,0
4,02BYET3,1
