In [None]:
import pandas as pd
from pandas import read_csv
from scipy import linalg
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import scale
import numpy as np
import matplotlib as mpl
import scipy
from sklearn.metrics import classification_report
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold, KFold, GridSearchCV
from sklearn.metrics import auc
from sklearn.metrics import roc_curve

from sklearn import metrics, mixture, cluster, datasets
from sklearn.mixture import GaussianMixture
from sklearn.calibration import CalibratedClassifierCV
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

In [None]:
# Load the training CSV into the `train` DataFrame (path is relative to the notebook).
train = pd.read_csv("../input/trainfinal/train_final.csv")

In [None]:
# Rewrite the text labels of three columns as integer codes, in place.
# Each label is replaced through a boolean-mask assignment, so any value that is
# not listed in the table below is left exactly as it was.
label_codes = {
    'Gender': {'Male': 1, 'Female': 0},
    'Vehicle_Age': {'> 2 Years': 2, '1-2 Year': 1, '< 1 Year': 0},
    'Vehicle_Damage': {'Yes': 1, 'No': 0},
}

for column, codes in label_codes.items():
    for label, code in codes.items():
        train.loc[train[column] == label, column] = code

In [None]:
# Remove the row identifier; it carries no predictive information.
# errors="ignore" keeps this cell idempotent: re-running it after "id" is already
# gone no longer raises a KeyError.
train = train.drop(columns="id", errors="ignore")
train.head()

## Correlation check and heatmap

In [None]:
# Pairwise correlations between the columns of `train`, drawn as an annotated heatmap.
corr = train.corr()
fig, ax = plt.subplots(figsize=(30, 15))
sns.heatmap(corr, annot=True, cmap="coolwarm", ax=ax)
plt.show()
# Observations recorded from the heatmap (relationships between 'Response' and the other factors):
# * 'Vehicle_Damage' shows the strongest relationship with 'Response' (about 0.35 in magnitude).
# * 'Previously_Insured' is also remarkable, but negative, at about -0.34.
#
# Apart from 'Response', 'Age' and 'Previously_Insured' are negatively related,
# which we read as young people tending to hold insurance already.

## Clustering

### 1. GM-business cluster

In [None]:
# Business-side features used for the first clustering experiment.
df_business = train[['Region_Code','Annual_Premium','Policy_Sales_Channel','Vintage']]
X = df_business.values

# Fit one full-covariance Gaussian mixture per candidate component count (1 to 7).
GM_n_components = np.arange(1, 8)
GM_models = []
for n in GM_n_components:
    candidate = mixture.GaussianMixture(n, covariance_type='full', random_state=0)
    GM_models.append(candidate.fit(X))

# Plot the AIC of every fitted mixture against its component count.
aic_scores = [fitted.aic(X) for fitted in GM_models]
plt.figure(num=None, figsize=(8, 6), dpi=60, facecolor='w', edgecolor='r')
plt.plot(GM_n_components, aic_scores, label='AIC')
plt.tight_layout()
plt.legend(loc='best')
plt.xlabel('n_components');

In [None]:
# Two mixture components were chosen for the business features.
GM_n_classes = 2

# Stratified 75/25 split of the business features against the Response target.
y = train['Response']
X_train, X_test, y_train, y_test = train_test_split(
    df_business, y, test_size=0.25, random_state=0, stratify=y
)

# The mixture is fitted on the full business table; fit() returns the estimator
# itself, so GMcluster_fit and GMcluster refer to the same object.
GMcluster = mixture.GaussianMixture(
    n_components=GM_n_classes, covariance_type='full', random_state=0
)
GMcluster_fit = GMcluster.fit(df_business)
GMlabels = GMcluster_fit.predict(df_business)
y_pred = GMcluster_fit.predict(X_test)

# Agreement between the cluster assignment of the held-out rows and the true Response.
print(adjusted_rand_score(y_test, y_pred))

### 2. Silhouette score - Business cluster

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette of the Gaussian-mixture labelling over the business features.
attributes, cluster_labels = df_business, GMlabels
silhouette_score(attributes, cluster_labels)

In [None]:
from sklearn.metrics import silhouette_score

# The shared `KM` model is only created in a later cell, so referencing it here
# fails with a NameError on Restart & Run All. Fit a dedicated 2-cluster KMeans
# with the same hyper-parameters and seed instead, which keeps this cell
# self-contained while producing the same kind of labelling.
km_for_silhouette = KMeans(
    n_clusters=2, init='k-means++', n_init=10, max_iter=300, tol=1e-04, random_state=0
)
KMlabels = km_for_silhouette.fit(df_business).predict(df_business)

attributes = df_business
cluster_labels = KMlabels

# Silhouette of the KMeans labelling over the business features.
silhouette_score(attributes, cluster_labels)

### 3. Kmeans-business cluster

In [None]:
from sklearn.cluster import KMeans

# Elbow method for the business cluster: collect the within-cluster sum of
# squared errors (inertia) for k = 1 .. 14.
# The models are fitted on `df_business` (not the whole `train` frame, which also
# contains the target and the client columns), and are seeded so the curve is
# reproducible across runs.
sse = []
for k in range(1, 15):
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0)
    kmeans.fit(df_business)
    sse.append(kmeans.inertia_)
print(sse)

In [None]:
# Draw SSE against the number of clusters to look for the elbow.
fig, ax = plt.subplots()
ax.plot(range(1, 15), sse, marker="o")
ax.set_title('Elbow method')
ax.set_xlabel('No of clusters')
ax.set_ylabel('SSE')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 75/25 split of the business features against Response.
X, y = df_business, train['Response']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0, stratify=y
)

# Show the shape of each of the four pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# Two-cluster KMeans over the business features.
km = KMeans(
    n_clusters=2,
    init='k-means++',
    n_init=10,
    max_iter=300,
    tol=1e-04,
    random_state=0,
)

# fit() hands back the estimator, so KM and km name the same fitted object.
KM = km.fit(df_business)

# Cluster assignment of the held-out business rows.
y_pred_business = km.predict(X_test)

### 4. GM-Client cluster

In [None]:
# Client-side features: every column that is not a business feature.
# 'Response' is excluded explicitly; leaving the target among the clustering
# inputs would leak the label into the clusters.
VariablesClient = [
    col for col in train.columns
    if col not in df_business.columns and col != 'Response'
]
df_client = train[VariablesClient]
X1 = df_client.values

# Fit a fresh 2-component mixture for the client features rather than refitting
# the business mixture object in place.
GM_n_classes = 2
GMcluster_fit = mixture.GaussianMixture(
    n_components=GM_n_classes, covariance_type='full', random_state=0
).fit(df_client)
GMlabels = GMcluster_fit.predict(df_client)

# Stratified 75/25 split of the client features (performed once).
y = train['Response']
X1_train, X1_test, y_train, y_test = train_test_split(
    df_client, y, test_size=0.25, random_state=0, stratify=y
)

# Mixture assignment of the held-out client rows.
y_pred = GMcluster_fit.predict(X1_test)

X1_train.shape, X1_test.shape, y_train.shape, y_test.shape

### 5. KMeans-client cluster

In [None]:
# Refit the existing KMeans estimator on the client features; this replaces the
# fit it previously held.
km.fit(df_client)

# Cluster assignments for the held-out client rows and for the full client matrix.
y_pred_client, KMlabels_client = km.predict(X1_test), km.predict(X1)

### 6. Silhouette score - Client cluster

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette of the Gaussian-mixture labelling over the client features.
attributes, cluster_labels = df_client, GMlabels
silhouette_score(attributes, cluster_labels)

In [None]:
from sklearn.metrics import silhouette_score

# KMeans labels for every client row, then their silhouette over the client features.
attributes = df_client
KMlabels = KM.predict(attributes)
cluster_labels = KMlabels
silhouette_score(attributes, cluster_labels)

## Get dummy variables

In [None]:
# Preview the first ten Region_Code values.
train["Region_Code"].head(10)

In [None]:
# One-hot encode "Region_Code" and swap the original column for its indicator columns.
dummy_Region_Code = pd.get_dummies(train["Region_Code"], prefix="Region_Code")
without_region_code = train.drop(columns='Region_Code')
train = pd.concat([without_region_code, dummy_Region_Code], axis="columns")

In [None]:
# Preview the first ten Policy_Sales_Channel values.
train["Policy_Sales_Channel"].head(10)

In [None]:
# One-hot encode "Policy_Sales_Channel" and swap the original column for its indicator columns.
dummy_Policy_Sales_Channel = pd.get_dummies(
    train["Policy_Sales_Channel"], prefix="Policy_Sales_Channel"
)
without_sales_channel = train.drop(columns='Policy_Sales_Channel')
train = pd.concat([without_sales_channel, dummy_Policy_Sales_Channel], axis="columns")

In [None]:
# Display the `train` frame as it stands at this point in the notebook.
train

## Descriptive analysis

In [None]:
# Bar chart of the number of rows per Response value.
sns.countplot(x = train.Response)

In [None]:
# Distribution of Age as a density histogram with a KDE overlay.
# sns.distplot is deprecated in seaborn; histplot with kde=True and
# stat="density" is the documented replacement for the same picture.
sns.histplot(data=train, x="Age", kde=True, stat="density")

In [None]:
# Box plot of the Age column, drawn with the 'Accent' palette.
sns.boxplot(y = 'Age', data = train,palette='Accent')

In [None]:
# Scatter of Annual_Premium against Age.
age_values, premium_values = train['Age'], train['Annual_Premium']
sns.scatterplot(x=age_values, y=premium_values)

In [None]:
# Bar chart of the number of rows per Gender value.
# The column is passed as `x=` (matching the Response count plot): newer seaborn
# releases treat the first positional argument as `data`, not as the x variable.
sns.countplot(x=train.Gender)

In [None]:
# Number of non-null Driving_License entries per Gender, as a two-column frame.
license_counts = train.groupby(['Gender'])['Driving_License'].count()
df = license_counts.to_frame().reset_index()

In [None]:
# Bar chart of the Driving_License column of `df` for each Gender.
sns.catplot(x="Gender", y="Driving_License",data=df, kind="bar")

## Train and Test Split

In [None]:
from sklearn.model_selection import train_test_split as split

# Separate the target from the features, then make a stratified 75/25 split.
y = train['Response']
X = train.drop(columns='Response')

X_train, X_test, y_train, y_test = split(
    X, y, test_size=0.25, stratify=y, random_state=0
)

# Show the shape of each of the four pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

## Normalization and PCA

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max on the training split only, then apply that same
# mapping to the test split. Calling fit_transform on the test data would re-fit
# the scaler on test statistics, leaking them and putting the two splits on
# different scales.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Show the shapes of the scaled training and test matrices.
X_train_scaled.shape, X_test_scaled.shape

In [None]:
# Cast both target vectors to float.
y_train, y_test = y_train.astype("float"), y_test.astype("float")

In [None]:
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
import mglearn

In [None]:
# Covariance matrix of the min-max scaled training features (features along the rows
# of the transposed input, as np.cov expects).
cov_mat = np.cov(X_train_scaled.T)
# A covariance matrix is symmetric, so use eigh: it guarantees real eigenvalues
# and eigenvectors. The general-purpose eig can return complex values with tiny
# imaginary parts, which cannot be sorted or plotted downstream.
# eigen_vals is returned in ascending order; eigen_vecs holds the matching
# eigenvectors as columns.
eigen_vals, eigen_vecs = np.linalg.eigh(cov_mat)

In [None]:
# Share of total variance carried by each eigenvalue (largest first), and the
# running total of those shares via NumPy's cumsum.
tot = sum(eigen_vals)
ordered_eigen_vals = sorted(eigen_vals, reverse=True)
var_exp = [eigen_val / tot for eigen_val in ordered_eigen_vals]
cum_var_exp = np.cumsum(var_exp)

In [None]:
# Show the cumulative explained-variance ratio of the first 40 components.
cum_var_exp[:40]
# Based on these cumulative values (plotted in the next cell), we chose 36 PCA components,
# the point at which the cumulative explained-variance ratio is reported to exceed 90%.

In [None]:
# Individual (bars) and cumulative (step line) explained-variance ratios per component.
# The x positions are derived from the data instead of a hard-coded 216, so the
# plot keeps working when the number of one-hot columns changes.
component_index = range(1, len(var_exp) + 1)

plt.bar(component_index, var_exp, alpha=0.5, align='center', label='individual explained variance')
plt.step(component_index, cum_var_exp, where='mid', label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component index')
plt.legend(loc='best')
plt.tight_layout()
plt.show()

In [None]:
# Keep 36 principal components (the count chosen from the cumulative explained variance).
# random_state pins the result if scikit-learn selects its randomized SVD solver.
pca = PCA(n_components=36, random_state=0)
# Learn the projection on the training split only and reuse it for the test split.
# Fitting a second PCA on the test data would yield different axes, so models
# trained on the training projection would be scored in a mismatched feature space.
pcaX_train_scaled = pca.fit_transform(X_train_scaled)
pcaX_test_scaled = pca.transform(X_test_scaled)

## PCA Reconstruction Error

In [None]:
# Reconstruction error of the training matrix for a few candidate component counts.
# The RMSE is taken over every row and feature (not just the first sample), and
# is computed as sqrt(MSE) because the `squared=False` argument has been removed
# from recent scikit-learn releases.
components = [30, 36, 40]
for n in components:
    pca = PCA(n_components=n, random_state=0)
    recon = pca.inverse_transform(pca.fit_transform(X_train_scaled))
    rmse = np.sqrt(mean_squared_error(X_train_scaled, recon))
    print("RMSE: {} with {} components".format(rmse, n))

## GaussianNB

In [None]:
# Gaussian Naive Bayes trained and scored on the PCA-projected features.
model1 = GaussianNB()
model1.fit(pcaX_train_scaled, y_train)
y_pred1 = model1.predict(pcaX_test_scaled)

report = classification_report(y_test, y_pred1)
confusion = metrics.confusion_matrix(y_test, y_pred1)
print("Classification Report:\n ", report)
print("\nConfusion Matrix:\n", confusion)

In [None]:
# ROC curve from model1's positive-class probabilities on the PCA-projected test set.
y_pred11 = model1.predict_proba(pcaX_test_scaled)
fpr, tpr, thresholds = roc_curve(y_test, y_pred11[:, 1])
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots()
ax.plot(fpr, tpr, lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], lw=lw, linestyle='--')  # chance-level diagonal
ax.set(xlim=(0.0, 1.0), ylim=(0.0, 1.05), xlabel='FPR', ylabel='TPR', title='ROC')
ax.legend(loc="lower right")
plt.show()

## BernoulliNB

In [None]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli Naive Bayes trained and scored on the PCA-projected features.
model17 = BernoulliNB()
model17.fit(pcaX_train_scaled, y_train)
y_pred17 = model17.predict(pcaX_test_scaled)

report = classification_report(y_test, y_pred17)
confusion = metrics.confusion_matrix(y_test, y_pred17)
print("Classification Report:\n ", report)
print("\nConfusion Matrix:\n", confusion)

In [None]:
# ROC curve from model17's positive-class probabilities on the PCA-projected test set.
y_pred171 = model17.predict_proba(pcaX_test_scaled)
fpr, tpr, thresholds = roc_curve(y_test, y_pred171[:, 1])
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots()
ax.plot(fpr, tpr, lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], lw=lw, linestyle='--')  # chance-level diagonal
ax.set(xlim=(0.0, 1.0), ylim=(0.0, 1.05), xlabel='FPR', ylabel='TPR', title='ROC')
ax.legend(loc="lower right")
plt.show()

## LDA

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Linear discriminant analysis (SVD solver) on the PCA-projected features.
# fit() returns the estimator, so model2 and lda name the same fitted object.
lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
model2 = lda.fit(pcaX_train_scaled, y_train)
y_pred2 = model2.predict(pcaX_test_scaled)

report = classification_report(y_test, y_pred2)
confusion = metrics.confusion_matrix(y_test, y_pred2)
print("Classification Report:\n ", report)
print("\nConfusion Matrix:\n", confusion)

In [None]:
# ROC curve from model2's positive-class probabilities on the PCA-projected test set.
y_pred21 = model2.predict_proba(pcaX_test_scaled)
fpr, tpr, thresholds = roc_curve(y_test, y_pred21[:, 1])
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots()
ax.plot(fpr, tpr, lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], lw=lw, linestyle='--')  # chance-level diagonal
ax.set(xlim=(0.0, 1.0), ylim=(0.0, 1.05), xlabel='FPR', ylabel='TPR', title='ROC')
ax.legend(loc="lower right")
plt.show()

## Logistic Regression

In [None]:
# LogisticRegression is imported here because this cell runs before the later
# cell that imports it; without this line a fresh kernel raises a NameError.
from sklearn.linear_model import LogisticRegression

# Exhaustive 5-fold grid search over regularisation strength, class weighting
# and solver, on the min-max scaled (pre-PCA) training features.
random_search = {'C':[0.001, 1, 100],
          'class_weight':['balanced', None],
          'solver':['liblinear','sag','lbfgs','newton-cg']}
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=random_search, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
grid_result=grid_search.fit(X_train_scaled, y_train)
print(f'Best:{grid_result.best_score_}using{grid_result.best_params_}','\n')

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression (C=0.001, sag solver) on the PCA-projected features.
model3 = LogisticRegression(C=0.001, solver="sag")
model3.fit(pcaX_train_scaled, y_train)
y_pred3 = model3.predict(pcaX_test_scaled)

report = classification_report(y_test, y_pred3)
confusion = metrics.confusion_matrix(y_test, y_pred3)
print("Classification Report:\n ", report)
print("\nConfusion Matrix:\n", confusion)

In [None]:
# ROC curve from model3's positive-class probabilities on the PCA-projected test set.
y_pred31 = model3.predict_proba(pcaX_test_scaled)
fpr, tpr, thresholds = roc_curve(y_test, y_pred31[:, 1])
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots()
ax.plot(fpr, tpr, lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], lw=lw, linestyle='--')  # chance-level diagonal
ax.set(xlim=(0.0, 1.0), ylim=(0.0, 1.05), xlabel='FPR', ylabel='TPR', title='ROC')
ax.legend(loc="lower right")
plt.show()

## SVM (Non-Linear)

In [None]:
%%time
from sklearn.svm import SVC
random_search = {'C': [0.1, 1, 10], 'gamma': [0.01, 0.1, 1]}
grid_search = GridSearchCV(estimator=SVC(), param_grid=random_search, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
grid_result=grid_search.fit(X_train_scaled, y_train)
print(f'Best:{grid_result.best_score_}using{grid_result.best_params_}','\n')

In [None]:
%%time
from sklearn.svm import SVC
model4 =SVC()
model4.fit(X_train_scaled, y_train)
y_pred4 = model4.predict(X_test_scaled)

print("Classification Report:\n ", classification_report(y_test, y_pred4))

In [None]:
# This model took us more than 3 days to run, and we still could not obtain a result.

## LinearSVM

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM with library defaults on the PCA-projected features.
model19 = LinearSVC()
model19.fit(pcaX_train_scaled, y_train)
y_pred19 = model19.predict(pcaX_test_scaled)

report = classification_report(y_test, y_pred19)
print("Classification Report:\n ", report)

## Perceptron

In [None]:
from sklearn.linear_model import Perceptron

# Perceptron with library defaults on the PCA-projected features.
model5 = Perceptron()
model5.fit(pcaX_train_scaled, y_train)
y_pred5 = model5.predict(pcaX_test_scaled)

report = classification_report(y_test, y_pred5)
confusion = metrics.confusion_matrix(y_test, y_pred5)
print("Classification Report:\n ", report)
print("\nConfusion Matrix:\n", confusion)

## Multi-layer Perceptron classifier

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with hidden layers of 400 and 100 units on the PCA-projected features.
model6 = MLPClassifier(
    hidden_layer_sizes=(400, 100),
    alpha=0.01,
    max_iter=1000,
)
model6.fit(pcaX_train_scaled, y_train)
y_pred6 = model6.predict(pcaX_test_scaled)

report = classification_report(y_test, y_pred6)
print("Classification Report:\n ", report)

In [None]:
# ROC curve from model6's positive-class probabilities on the PCA-projected test set.
y_pred61 = model6.predict_proba(pcaX_test_scaled)
fpr, tpr, thresholds = roc_curve(y_test, y_pred61[:, 1])
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots()
ax.plot(fpr, tpr, lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], lw=lw, linestyle='--')  # chance-level diagonal
ax.set(xlim=(0.0, 1.0), ylim=(0.0, 1.05), xlabel='FPR', ylabel='TPR', title='ROC')
ax.legend(loc="lower right")
plt.show()

## Random Forest

In [None]:
# RandomForestClassifier is imported here because this cell runs before the
# later cell that imports it; without this line a fresh kernel raises a NameError.
from sklearn.ensemble import RandomForestClassifier

# Exhaustive 5-fold grid search over the random-forest settings below, on the
# min-max scaled (pre-PCA) training features.
random_search = {'criterion': ['entropy', 'gini'],
               'max_depth': [2,3,4,5,6,7,10],
               'min_samples_leaf': [4, 6, 8],
               'min_samples_split': [5, 7,10],
               'n_estimators': [300]}
grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=random_search, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
grid_result=grid_search.fit(X_train_scaled, y_train)
print(f'Best:{grid_result.best_score_}using{grid_result.best_params_}','\n')

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest (300 shallow entropy trees) on the scaled (pre-PCA) features.
model7 = RandomForestClassifier(
    criterion='entropy',
    n_estimators=300,
    max_depth=2,
    min_samples_leaf=4,
    min_samples_split=5,
)
model7.fit(X_train_scaled, y_train)
y_pred7 = model7.predict(X_test_scaled)

report = classification_report(y_test, y_pred7)
print("Classification Report:\n ", report)

In [None]:
# ROC curve from model7's positive-class probabilities on the scaled test set.
y_pred71 = model7.predict_proba(X_test_scaled)
fpr, tpr, thresholds = roc_curve(y_test, y_pred71[:, 1])
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots()
ax.plot(fpr, tpr, lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], lw=lw, linestyle='--')  # chance-level diagonal
ax.set(xlim=(0.0, 1.0), ylim=(0.0, 1.05), xlabel='FPR', ylabel='TPR', title='ROC')
ax.legend(loc="lower right")
plt.show()

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seeded decision tree on the scaled (pre-PCA) features.
model18 = DecisionTreeClassifier(random_state=0)
model18.fit(X_train_scaled, y_train)
y_pred18 = model18.predict(X_test_scaled)

report = classification_report(y_test, y_pred18)
print("Classification Report:\n ", report)

In [None]:
# ROC curve from model18's positive-class probabilities on the scaled test set.
y_pred181 = model18.predict_proba(X_test_scaled)
fpr, tpr, thresholds = roc_curve(y_test, y_pred181[:, 1])
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots()
ax.plot(fpr, tpr, lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], lw=lw, linestyle='--')  # chance-level diagonal
ax.set(xlim=(0.0, 1.0), ylim=(0.0, 1.05), xlabel='FPR', ylabel='TPR', title='ROC')
ax.legend(loc="lower right")
plt.show()

## Gradient Boosting (scikit-learn GradientBoostingClassifier)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Seeded scikit-learn gradient boosting on the scaled (pre-PCA) features.
model14 = GradientBoostingClassifier(random_state=1)
model14.fit(X_train_scaled, y_train)
y_pred14 = model14.predict(X_test_scaled)

report = classification_report(y_test, y_pred14)
print("Classification Report:\n ", report)

In [None]:
# ROC curve from model14's positive-class probabilities on the scaled test set.
y_pred141 = model14.predict_proba(X_test_scaled)
fpr, tpr, thresholds = roc_curve(y_test, y_pred141[:, 1])
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots()
ax.plot(fpr, tpr, lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], lw=lw, linestyle='--')  # chance-level diagonal
ax.set(xlim=(0.0, 1.0), ylim=(0.0, 1.05), xlabel='FPR', ylabel='TPR', title='ROC')
ax.legend(loc="lower right")
plt.show()

## AdaBoost

In [None]:
import sklearn.ensemble as se
import sklearn.tree as st
from sklearn.ensemble import AdaBoostClassifier

# Seeded AdaBoost with 150 estimators on the scaled (pre-PCA) features.
model10 = AdaBoostClassifier(n_estimators=150, random_state=0)
model10.fit(X_train_scaled, y_train)
y_pred10 = model10.predict(X_test_scaled)

report = classification_report(y_test, y_pred10)
print("Classification Report:\n ", report)

In [None]:
# ROC curve from model10's positive-class probabilities on the scaled test set.
y_pred101 = model10.predict_proba(X_test_scaled)
fpr, tpr, thresholds = roc_curve(y_test, y_pred101[:, 1])
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots()
ax.plot(fpr, tpr, lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], lw=lw, linestyle='--')  # chance-level diagonal
ax.set(xlim=(0.0, 1.0), ylim=(0.0, 1.05), xlabel='FPR', ylabel='TPR', title='ROC')
ax.legend(loc="lower right")
plt.show()

## Catboost

In [None]:
%%time
from catboost import CatBoostClassifier
model16 = CatBoostClassifier()
model16.fit(X_train_scaled, y_train)           
y_pred16 = model16.predict(X_test_scaled)
print("Classification Report:\n ", classification_report(y_test, y_pred16))

In [None]:
# ROC curve from model16's positive-class probabilities on the scaled test set.
y_pred161 = model16.predict_proba(X_test_scaled)
fpr, tpr, thresholds = roc_curve(y_test, y_pred161[:, 1])
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots()
ax.plot(fpr, tpr, lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], lw=lw, linestyle='--')  # chance-level diagonal
ax.set(xlim=(0.0, 1.0), ylim=(0.0, 1.05), xlabel='FPR', ylabel='TPR', title='ROC')
ax.legend(loc="lower right")
plt.show()

## XGBoost

In [None]:
%%time
from xgboost import XGBClassifier
model13 = XGBClassifier(n_jobs=-1)              
model13.fit(X_train_scaled, y_train)           
y_pred13 = model13.predict(X_test_scaled)
print("Classification Report:\n ", classification_report(y_test, y_pred13))

In [None]:
# ROC curve from model13's positive-class probabilities on the scaled test set.
y_pred131 = model13.predict_proba(X_test_scaled)
fpr, tpr, thresholds = roc_curve(y_test, y_pred131[:, 1])
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots()
ax.plot(fpr, tpr, lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], lw=lw, linestyle='--')  # chance-level diagonal
ax.set(xlim=(0.0, 1.0), ylim=(0.0, 1.05), xlabel='FPR', ylabel='TPR', title='ROC')
ax.legend(loc="lower right")
plt.show()

## SMOTE

In [None]:
#pip install --user imbalanced-learn
#pip install delay

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the PCA-projected training split with seeded SMOTE.
over_samples = SMOTE(random_state=0)
X_train_smo, y_train_smo = over_samples.fit_resample(pcaX_train_scaled, y_train)

# Class proportions before and after resampling.
for labels in (y_train, pd.Series(y_train_smo)):
    print(labels.value_counts() / len(labels))

### LDA

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# LDA trained on the SMOTE-resampled data and scored on the untouched test projection.
# fit() returns the estimator, so model2 and lda name the same fitted object.
lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
model2 = lda.fit(X_train_smo, y_train_smo)
y_pred_smo = model2.predict(pcaX_test_scaled)

report = classification_report(y_test, y_pred_smo)
confusion = metrics.confusion_matrix(y_test, y_pred_smo)
print("Classification Report:\n ", report)
print("\nConfusion Matrix:\n", confusion)

In [None]:
# ROC curve from model2's positive-class probabilities on the PCA-projected test set.
y_pred21 = model2.predict_proba(pcaX_test_scaled)
fpr, tpr, thresholds = roc_curve(y_test, y_pred21[:, 1])
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots()
ax.plot(fpr, tpr, lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], lw=lw, linestyle='--')  # chance-level diagonal
ax.set(xlim=(0.0, 1.0), ylim=(0.0, 1.05), xlabel='FPR', ylabel='TPR', title='ROC')
ax.legend(loc="lower right")
plt.show()

### LR

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression (C=0.001, sag solver) trained on the SMOTE-resampled data.
model3 = LogisticRegression(C=0.001, solver="sag")
model3.fit(X_train_smo, y_train_smo)
y_pred3 = model3.predict(pcaX_test_scaled)

report = classification_report(y_test, y_pred3)
confusion = metrics.confusion_matrix(y_test, y_pred3)
print("Classification Report:\n ", report)
print("\nConfusion Matrix:\n", confusion)

In [None]:
# ROC curve from model3's positive-class probabilities on the PCA-projected test set.
y_pred131 = model3.predict_proba(pcaX_test_scaled)
fpr, tpr, thresholds = roc_curve(y_test, y_pred131[:, 1])
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots()
ax.plot(fpr, tpr, lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], lw=lw, linestyle='--')  # chance-level diagonal
ax.set(xlim=(0.0, 1.0), ylim=(0.0, 1.05), xlabel='FPR', ylabel='TPR', title='ROC')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Gaussian Naive Bayes trained on the SMOTE-resampled data.
model1 = GaussianNB()
model1.fit(X_train_smo, y_train_smo)
y_pred1 = model1.predict(pcaX_test_scaled)

report = classification_report(y_test, y_pred1)
confusion = metrics.confusion_matrix(y_test, y_pred1)
print("Classification Report:\n ", report)
print("\nConfusion Matrix:\n", confusion)

In [None]:
# ROC curve from model1's positive-class probabilities on the PCA-projected test set.
y_pred11 = model1.predict_proba(pcaX_test_scaled)
fpr, tpr, thresholds = roc_curve(y_test, y_pred11[:, 1])
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots()
ax.plot(fpr, tpr, lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], lw=lw, linestyle='--')  # chance-level diagonal
ax.set(xlim=(0.0, 1.0), ylim=(0.0, 1.05), xlabel='FPR', ylabel='TPR', title='ROC')
ax.legend(loc="lower right")
plt.show()

## Over Sampling

In [None]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Randomly duplicate minority-class rows until the minority/majority ratio is 0.9.
# The ratio is passed by keyword because current imbalanced-learn releases make
# the sampler's constructor arguments keyword-only; random_state makes the
# resampled set reproducible.
ros = RandomOverSampler(sampling_strategy=0.9, random_state=0)

X_train_ros, y_train_ros = ros.fit_resample(pcaX_train_scaled, y_train)

print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_ros)))

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# LDA trained on the randomly oversampled data and scored on the untouched test projection.
# fit() returns the estimator, so model2ros and lda name the same fitted object.
lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
model2ros = lda.fit(X_train_ros, y_train_ros)
y_pred_ros = model2ros.predict(pcaX_test_scaled)

report = classification_report(y_test, y_pred_ros)
confusion = metrics.confusion_matrix(y_test, y_pred_ros)
print("Classification Report:\n ", report)
print("\nConfusion Matrix:\n", confusion)

In [None]:
# ROC curve from model2ros's positive-class probabilities on the PCA-projected test set.
y_pred21 = model2ros.predict_proba(pcaX_test_scaled)
fpr, tpr, thresholds = roc_curve(y_test, y_pred21[:, 1])
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots()
ax.plot(fpr, tpr, lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], lw=lw, linestyle='--')  # chance-level diagonal
ax.set(xlim=(0.0, 1.0), ylim=(0.0, 1.05), xlabel='FPR', ylabel='TPR', title='ROC')
ax.legend(loc="lower right")
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression (C=0.001, sag solver) trained on the randomly oversampled data.
model3 = LogisticRegression(C=0.001, solver="sag")
model3.fit(X_train_ros, y_train_ros)
y_pred3 = model3.predict(pcaX_test_scaled)

report = classification_report(y_test, y_pred3)
confusion = metrics.confusion_matrix(y_test, y_pred3)
print("Classification Report:\n ", report)
print("\nConfusion Matrix:\n", confusion)

In [None]:
# ROC curve from model3's positive-class probabilities on the PCA-projected test set.
y_pred131 = model3.predict_proba(pcaX_test_scaled)
fpr, tpr, thresholds = roc_curve(y_test, y_pred131[:, 1])
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots()
ax.plot(fpr, tpr, lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], lw=lw, linestyle='--')  # chance-level diagonal
ax.set(xlim=(0.0, 1.0), ylim=(0.0, 1.05), xlabel='FPR', ylabel='TPR', title='ROC')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Gaussian Naive Bayes trained on the randomly oversampled data.
model1 = GaussianNB()
model1.fit(X_train_ros, y_train_ros)
y_pred1 = model1.predict(pcaX_test_scaled)

report = classification_report(y_test, y_pred1)
confusion = metrics.confusion_matrix(y_test, y_pred1)
print("Classification Report:\n ", report)
print("\nConfusion Matrix:\n", confusion)

In [None]:
# ROC curve from model1's positive-class probabilities on the PCA-projected test set.
y_pred11 = model1.predict_proba(pcaX_test_scaled)
fpr, tpr, thresholds = roc_curve(y_test, y_pred11[:, 1])
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots()
ax.plot(fpr, tpr, lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], lw=lw, linestyle='--')  # chance-level diagonal
ax.set(xlim=(0.0, 1.0), ylim=(0.0, 1.05), xlabel='FPR', ylabel='TPR', title='ROC')
ax.legend(loc="lower right")
plt.show()

## Under sampling

In [None]:
from collections import Counter
from imblearn.under_sampling import NearMiss

# Under-sample the majority class with NearMiss until the minority/majority ratio is 0.9.
# The ratio is passed by keyword because current imbalanced-learn releases make
# the sampler's constructor arguments keyword-only. Counter is imported here so
# the cell does not rely on an import made in a different cell.
ns = NearMiss(sampling_strategy=0.9)

X_train_ns, y_train_ns = ns.fit_resample(pcaX_train_scaled, y_train)

print("The number of classes before fit {} ".format(Counter(y_train)))
print("The number of classes after fit {} ".format(Counter(y_train_ns)))

In [None]:
from sklearn.naive_bayes import GaussianNB
model1 = GaussianNB()
model1.fit(X_train_ns, y_train_ns)
y_pred1 = model1.predict(pcaX_test_scaled)

print("Classification Report:\n ", classification_report(y_test, y_pred1))
print("\nConfusion Matrix:\n",metrics.confusion_matrix(y_test, y_pred1))

In [None]:
y_pred11 = model1.predict_proba(pcaX_test_scaled)
(fpr, tpr, thresholds) = roc_curve(y_test,y_pred11[:,1])
roc_auc = auc(fpr,tpr)

plt.figure()
lw = 2
plt.plot(fpr, tpr,lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) 
plt.plot([0, 1], [0, 1], lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC')
plt.legend(loc="lower right")
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression (C=0.001, sag solver) trained on the NearMiss under-sampled data.
model3 = LogisticRegression(C=0.001, solver="sag")
model3.fit(X_train_ns, y_train_ns)
y_pred3 = model3.predict(pcaX_test_scaled)

report = classification_report(y_test, y_pred3)
confusion = metrics.confusion_matrix(y_test, y_pred3)
print("Classification Report:\n ", report)
print("\nConfusion Matrix:\n", confusion)

In [None]:
# ROC curve from model3's positive-class probabilities on the PCA-projected test set.
y_pred131 = model3.predict_proba(pcaX_test_scaled)
fpr, tpr, thresholds = roc_curve(y_test, y_pred131[:, 1])
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots()
ax.plot(fpr, tpr, lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], lw=lw, linestyle='--')  # chance-level diagonal
ax.set(xlim=(0.0, 1.0), ylim=(0.0, 1.05), xlabel='FPR', ylabel='TPR', title='ROC')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Gaussian Naive Bayes trained on the NearMiss under-sampled data.
model1 = GaussianNB()
model1.fit(X_train_ns, y_train_ns)
y_pred1 = model1.predict(pcaX_test_scaled)

report = classification_report(y_test, y_pred1)
confusion = metrics.confusion_matrix(y_test, y_pred1)
print("Classification Report:\n ", report)
print("\nConfusion Matrix:\n", confusion)

In [None]:
# ROC curve from model1's positive-class probabilities on the PCA-projected test set.
y_pred11 = model1.predict_proba(pcaX_test_scaled)
fpr, tpr, thresholds = roc_curve(y_test, y_pred11[:, 1])
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots()
ax.plot(fpr, tpr, lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], lw=lw, linestyle='--')  # chance-level diagonal
ax.set(xlim=(0.0, 1.0), ylim=(0.0, 1.05), xlabel='FPR', ylabel='TPR', title='ROC')
ax.legend(loc="lower right")
plt.show()

## SMOTETomek

In [None]:
%%time
from imblearn.combine import SMOTETomek
kos = SMOTETomek(random_state=0)
X_train_kos, y_train_kos = kos.fit_resample(pcaX_train_scaled, y_train)

print("The number of classes before fit {} ".format(Counter(y_train)))
print("The number of classes after fit {} ".format(Counter(y_train_kos)))

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# LDA trained on the SMOTETomek-resampled data and scored on the untouched test projection.
# fit() returns the estimator, so model2kos and lda name the same fitted object.
lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
model2kos = lda.fit(X_train_kos, y_train_kos)
y_pred_kos = model2kos.predict(pcaX_test_scaled)

report = classification_report(y_test, y_pred_kos)
confusion = metrics.confusion_matrix(y_test, y_pred_kos)
print("Classification Report:\n ", report)
print("\nConfusion Matrix:\n", confusion)

In [None]:
# ROC curve from model2kos's positive-class probabilities on the PCA-projected test set.
# Note: y_pred_kos is rebound here from predicted labels to the probability matrix.
y_pred_kos = model2kos.predict_proba(pcaX_test_scaled)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_kos[:, 1])
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots()
ax.plot(fpr, tpr, lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], lw=lw, linestyle='--')  # chance-level diagonal
ax.set(xlim=(0.0, 1.0), ylim=(0.0, 1.05), xlabel='FPR', ylabel='TPR', title='ROC')
ax.legend(loc="lower right")
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression (C=0.001, sag solver) trained on the SMOTETomek-resampled data.
model3 = LogisticRegression(C=0.001, solver="sag")
model3.fit(X_train_kos, y_train_kos)
y_pred3 = model3.predict(pcaX_test_scaled)

report = classification_report(y_test, y_pred3)
confusion = metrics.confusion_matrix(y_test, y_pred3)
print("Classification Report:\n ", report)
print("\nConfusion Matrix:\n", confusion)

In [None]:
# ROC curve from model3's positive-class probabilities on the PCA-projected test set.
y_pred131 = model3.predict_proba(pcaX_test_scaled)
fpr, tpr, thresholds = roc_curve(y_test, y_pred131[:, 1])
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots()
ax.plot(fpr, tpr, lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], lw=lw, linestyle='--')  # chance-level diagonal
ax.set(xlim=(0.0, 1.0), ylim=(0.0, 1.05), xlabel='FPR', ylabel='TPR', title='ROC')
ax.legend(loc="lower right")
plt.show()

In [None]:
# Gaussian Naive Bayes trained on the SMOTETomek-resampled data.
model1 = GaussianNB()
model1.fit(X_train_kos, y_train_kos)
y_pred1 = model1.predict(pcaX_test_scaled)

report = classification_report(y_test, y_pred1)
confusion = metrics.confusion_matrix(y_test, y_pred1)
print("Classification Report:\n ", report)
print("\nConfusion Matrix:\n", confusion)

In [None]:
# ROC curve from model1's positive-class probabilities on the PCA-projected test set.
y_pred11 = model1.predict_proba(pcaX_test_scaled)
fpr, tpr, thresholds = roc_curve(y_test, y_pred11[:, 1])
roc_auc = auc(fpr, tpr)

lw = 2
fig, ax = plt.subplots()
ax.plot(fpr, tpr, lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], lw=lw, linestyle='--')  # chance-level diagonal
ax.set(xlim=(0.0, 1.0), ylim=(0.0, 1.05), xlabel='FPR', ylabel='TPR', title='ROC')
ax.legend(loc="lower right")
plt.show()