In [1]:
import pandas as pd
import plotly.express as px
import plotly.io as pio
# Make 'iframe' the default plotly renderer for figures shown in this notebook.
pio.renderers.default = 'iframe'

In [2]:
# Load the raw dataset from vehicle_data.csv and preview its first rows.
df = pd.read_csv(filepath_or_buffer='vehicle_data.csv')
df.head(n=5)

Unnamed: 0,id,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Gender,Vehicle_Damage,Vehicle_Age
0,1.0,44.0,1.0,28.0,0.0,40454.0,26.0,217.0,1.0,1,1,2
1,2.0,76.0,1.0,3.0,0.0,33536.0,26.0,183.0,0.0,1,0,1
2,3.0,47.0,1.0,28.0,0.0,38294.0,26.0,27.0,1.0,1,1,2
3,4.0,21.0,1.0,11.0,1.0,28619.0,152.0,203.0,0.0,1,0,0
4,5.0,29.0,1.0,41.0,1.0,27496.0,152.0,39.0,0.0,0,0,0


In [3]:
# Remove the 'id' column (presumably just a sequential row identifier -- confirm).
# Rebinding `df` instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: re-running it after 'id' is already gone no
# longer raises KeyError.
df = df.drop(columns=['id'], errors='ignore')
df.head()

Unnamed: 0,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Gender,Vehicle_Damage,Vehicle_Age
0,44.0,1.0,28.0,0.0,40454.0,26.0,217.0,1.0,1,1,2
1,76.0,1.0,3.0,0.0,33536.0,26.0,183.0,0.0,1,0,1
2,47.0,1.0,28.0,0.0,38294.0,26.0,27.0,1.0,1,1,2
3,21.0,1.0,11.0,1.0,28619.0,152.0,203.0,0.0,1,0,0
4,29.0,1.0,41.0,1.0,27496.0,152.0,39.0,0.0,0,0,0


In [4]:
from sklearn.preprocessing import MinMaxScaler

In [5]:
# Short list of six feature column names.
cols = [
    'Vehicle_Damage',
    'Vehicle_Age',
    'Policy_Sales_Channel',
    'Age',
    'Gender',
    'Previously_Insured',
]

In [6]:
# Separate the label from the predictors, then min-max scale every predictor.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set minima/maxima leak into the scaling -- consider fitting it
# on the training partition only.
target = df['Response']
features = df.drop(columns=['Response'])
scaler = MinMaxScaler()
ndf = pd.DataFrame(scaler.fit_transform(features), columns=features.columns)
ndf.head()

Unnamed: 0,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Gender,Vehicle_Damage,Vehicle_Age
0,0.369231,1.0,0.538462,0.0,0.070366,0.154321,0.716263,1.0,1.0,1.0
1,0.861538,1.0,0.057692,0.0,0.057496,0.154321,0.598616,1.0,0.0,0.5
2,0.415385,1.0,0.538462,0.0,0.066347,0.154321,0.058824,1.0,1.0,1.0
3,0.015385,1.0,0.211538,0.02,0.048348,0.932099,0.66782,1.0,0.0,0.0
4,0.138462,1.0,0.788462,0.02,0.046259,0.932099,0.100346,0.0,0.0,0.0


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    ndf,
    target,
    test_size=0.2,
    random_state=42,
)

In [8]:
from imblearn.over_sampling import SMOTE

# Rebalance the training partition only; the test partition is left untouched.
sm = SMOTE(random_state=2)
# Series.ravel() is deprecated since pandas 2.2 (the data is already 1-D);
# to_numpy() hands SMOTE the same 1-D label array without the warning.
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.to_numpy())

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Score every scaled feature against the target with a chi-squared test
# (k='all' keeps all features, so this is used purely for ranking).
best = SelectKBest(score_func=chi2, k='all')
fit = best.fit(ndf, target)

# Show the ten highest-scoring features.
chi2_scores = pd.DataFrame({'col': ndf.columns, 'score': fit.scores_})
chi2_scores.nlargest(10, 'score')

In [9]:
from sklearn.linear_model import LogisticRegression

# Baseline model: logistic regression with default settings, trained on the
# SMOTE-resampled training data. fit() returns the estimator itself, so the
# trailing `lr` renders the same repr as before.
lr = LogisticRegression().fit(X_train_res, y_train_res)
lr

In [12]:
from sklearn.metrics import accuracy_score
import numpy as np

# Score the logistic-regression model on the held-out split, then show how
# many test rows were assigned to each class.
preds = lr.predict(X_test)
lr_accuracy = accuracy_score(y_true=y_test, y_pred=preds)
print(lr_accuracy)
np.unique(preds, return_counts=True)

0.6415995381910734


(array([0., 1.]), array([39849, 36373], dtype=int64))

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest: its bootstrap and feature sub-sampling are random, so without
# random_state the fitted model, its accuracy and the persisted artefact change
# on every run -- unlike the split and SMOTE steps, which are already seeded.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_res, y_train_res)

In [14]:
# Accuracy of the random forest on the held-out split.
rf_preds = rf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=rf_preds)

0.833040329563643

In [17]:
import joblib

In [18]:
# Persist the fitted random forest together with the scaler used to prepare
# its inputs. The logistic-regression model is deliberately not saved.
joblib.dump(value=rf, filename='rfClassifier.pickle')
joblib.dump(value=scaler, filename='minMaxScaler.pickle')

['minMaxScaler.pickle']

In [None]:
# Draw a fixed 100-row sample. Seeding makes the sample (and any file written
# from it) identical on every Restart & Run All instead of changing each run.
sdf = df.sample(n=100, random_state=42)
sdf.head()

In [None]:
# The scaler was fitted on every predictor column (all of df except 'Response'),
# so transform() must receive exactly those columns in the same order. Passing
# only a six-column subset makes scikit-learn raise a ValueError (feature
# name/count mismatch).
sample_features = sdf.drop(columns=['Response'])
test_df = pd.DataFrame(
    scaler.transform(sample_features),
    columns=sample_features.columns,
    index=sample_features.index,  # keep rows aligned with sdf
)
test_df.head()

In [None]:
# Write the sampled rows to disk without the DataFrame index column.
sdf.to_csv(path_or_buf='testing2.csv', index=False)

In [None]:
import numpy as np

# Only a cell's last expression is rendered, so the predicted-class counts were
# previously computed and silently discarded. Keep both results and return them
# together: (predicted-class counts on X_test, class counts in y_train).
pred = lr.predict(X_test)
pred_counts = np.unique(pred, return_counts=True)
train_counts = np.unique(y_train, return_counts=True)
pred_counts, train_counts

In [None]:
# Shapes of the resampled training features and labels, as a pair.
tuple(part.shape for part in (X_train_res, y_train_res))

In [None]:
# Class labels present after resampling and how many rows each one has.
classes, counts = np.unique(y_train_res, return_counts=True)
classes, counts