In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
from matplotlib import cbook, rc_params_from_file, rcParamsDefault
import plotly.express as px

from sklearn.metrics import roc_auc_score
from sklearn import metrics

#classifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score, mean_absolute_error

from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import *
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
train = pd.read_csv(r"/kaggle/input/janatahack-crosssell-prediction/Train.csv")
test = pd.read_csv(r"/kaggle/input/janatahack-crosssell-prediction/Test.csv")

In [None]:
print(train.shape)
train.head()

In [None]:
train.info()

In [None]:
print(test.shape)
test.head()

In [None]:
test.info()

In [None]:
# Drop the row identifier from both frames.
# Rebinding with errors="ignore" keeps this cell re-runnable: the original
# in-place drop raised KeyError on a second run because "id" was already gone.
train = train.drop(columns="id", errors="ignore")

test = test.drop(columns="id", errors="ignore")

In [None]:
df = pd.concat([train, test],ignore_index=True)
print(train.shape, test.shape, df.shape)

In [None]:
print(df.shape)
df.head()

In [None]:
df.notnull().tail()

In [None]:
df.dropna(how = 'any').shape

In [None]:
df.duplicated().sum()

In [None]:
df.loc[df.duplicated(keep = 'last'), :]

In [None]:
df.loc[df.duplicated(keep = False), :]

In [None]:
df.drop_duplicates(keep = 'first').shape

In [None]:
df.drop_duplicates(keep = 'last').shape

In [None]:
df.drop_duplicates(keep = False).shape

In [None]:
df.drop_duplicates(subset = ['Gender', 'Age', 'Driving_License', 'Region_Code', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 
                             'Annual_Premium', 'Policy_Sales_Channel', 'Vintage', 'Response']).shape

In [None]:
df.info()

In [None]:
df.columns

In [None]:
df.isnull().sum()

In [None]:
df.nunique()

In [None]:
df.apply(lambda x: x.dtype)

In [None]:
total_miss = df.isnull().sum()
perc_miss = total_miss/df.isnull().count()*100

missing_data = pd.DataFrame({'Total missing':total_miss,'% missing':perc_miss})

missing_data.sort_values(by='Total missing',ascending=False).head(3)

In [None]:
#Checking for percentage of missing values in each columns
(df.isnull().sum()/len(df))*100

In [None]:
# find the unique values from categorical features
for col in df.select_dtypes(include='object').columns:
    print(col)
    print(df[col].unique())

In [None]:
for column in df.columns:
    print(column,df[column].nunique())

In [None]:
# Object-dtype (string) columns are treated as categorical features.
# The target column of this dataset is 'Response'; it is excluded explicitly so
# it can never be picked up as a feature, whatever dtype it ends up with.
categorical_features = [
    feature for feature in df.columns
    if df[feature].dtype == 'O' and feature != 'Response'
]
categorical_features

In [None]:
for feature in categorical_features:
    print('The feature is {} and number of categories are {}'.format(feature,len(df[feature].unique())))

In [None]:
numerical_data = df.select_dtypes(include=np.number) # select_dtypes selects data with numeric features
numerical_col = numerical_data.columns 

print("Numeric Features:")
print(numerical_data.head())
print("===="*20)

In [None]:
categorical_data = df.select_dtypes(exclude=np.number) # we will exclude data with numeric features
categorical_col = categorical_data.columns                          # we will store the categorical features in a variable

print("Categorical Features:")
print(categorical_data.head())
print("===="*20)

In [None]:
### numerical 
numerical_cols = list(df.select_dtypes(exclude=['object']))
numerical_cols

In [None]:
### categorical
categorical_cols = list(df.select_dtypes(include=['object']))
categorical_cols

In [None]:
#Check target label split over categorical features and find the count
for categorical_feature in categorical_features:
    print(df.groupby(['Response',categorical_feature]).size())

In [None]:
# list of numerical variables: every non-object column except the target.
# The target in this notebook is 'Response' (it is what the models below
# predict), so that is the column that must be left out of the feature list.
numerical_features = [
    feature for feature in df.columns
    if df[feature].dtype != 'O' and feature != 'Response'
]
print('Number of numerical variables: ', len(numerical_features))

# visualise the numerical variables
df[numerical_features].head()

In [None]:
#Discrete Numerical Features
discrete_feature=[feature for feature in numerical_features if len(df[feature].unique())<25]
print("Discrete Variables Count: {}".format(len(discrete_feature)))

In [None]:
#Continuous Numerical Features
# Everything numeric that is neither discrete nor the target ('Response').
continuous_features = [
    feature for feature in numerical_features
    if feature not in discrete_feature and feature != 'Response'
]
print("Continuous feature Count: {}".format(len(continuous_features)))

In [None]:
cols_with_missing = [col for col in df.columns 
                                 if df[col].isnull().any()]
cols_with_missing

In [None]:
df.describe()

In [None]:
# Correlate numeric columns only: df still holds string columns here, and
# recent pandas versions raise instead of silently skipping them.
df.select_dtypes(include=np.number).corr()

In [None]:
df.hist(figsize=(20,20))
plt.show()

In [None]:
# Correlation heatmap of the numeric columns. Non-numeric columns are removed
# explicitly because recent pandas versions no longer drop them in corr().
matrix = df.select_dtypes(include=np.number).corr()
f, ax = plt.subplots(figsize=(25, 12))
sns.heatmap(matrix, vmax=.8, square=True, cmap="RdYlGn", annot=True, ax=ax);

In [None]:
sns.pairplot(df)

In [None]:
def bar_plot(variable):
    """Plot and print the value frequencies of one column of the global ``df``.

    Parameters
    ----------
    variable : str
        Name of the column in ``df`` to summarise.

    The function draws one bar per distinct value (height = number of rows),
    shows the figure, then prints the same counts as text. It returns nothing.
    """
    counts = df[variable].value_counts()
    categories = counts.index
    bar_colors = ['#00008b', '#00e5ee', '#cd1076', '#008080', '#cd5555', 'red', 'blue']

    plt.figure(figsize=(15, 3))
    plt.bar(categories, counts, color=bar_colors)
    # Label each tick with the category value it stands for.
    plt.xticks(categories, categories.values)
    plt.ylabel("Frequency")
    plt.title(variable)

    plt.show()
    print("{}: \n {}".format(variable, counts))

In [None]:
categorical_cols = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']
for c in categorical_cols:
    bar_plot(c)

In [None]:
categorcial_variables = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']

# Row masks and group sizes do not depend on the plotted column, so they are
# computed once. Rows whose Response is missing match neither mask.
is_resp0 = df.Response.values == 0
is_resp1 = df.Response.values == 1
n_resp0 = is_resp0.sum()
n_resp1 = is_resp1.sum()

for col in categorcial_variables:
    plt.figure(figsize=(10,4))
    # Share of each category among Response == 0 rows and among Response == 1 rows.
    share_resp0 = df.loc[is_resp0, col].value_counts() / n_resp0
    share_resp1 = df.loc[is_resp1, col].value_counts() / n_resp1

    # Difference of the two shares per category; a category missing from one
    # group counts as 0 there. Sorting makes the bar order deterministic
    # (the previous set()-based order could change between runs).
    share_gap = share_resp0.sub(share_resp1, fill_value=0).sort_index()

    # x/y are passed by keyword: recent seaborn releases reject positional vectors.
    sns.barplot(x=list(share_gap.values), y=list(share_gap.index))
    plt.title(col)
    plt.tight_layout()

In [None]:
# One horizontal count plot per categorical feature, laid out on an 11x2 grid.
plt.figure(figsize=(15,80), facecolor='white')
plotnumber = 1
for categorical_feature in categorical_features:
    ax = plt.subplot(11, 2, plotnumber)
    sns.countplot(y=categorical_feature, data=df, ax=ax)
    ax.set_xlabel(categorical_feature)
    ax.set_title(categorical_feature)
    plotnumber = plotnumber + 1
plt.show()

In [None]:
#check target label split over categorical features
#Find out the relationship between categorical variable and dependent variable
for categorical_feature in categorical_features:
    sns.catplot(x = 'Response', col = categorical_feature, kind = 'count', data = df)
plt.show()

In [None]:
# Box plot of every continuous feature, split by the value of Response.
plt.figure(figsize=(20, 60), facecolor='white')
plotnumber = 1
for feature in continuous_features:
    ax = plt.subplot(12, 3, plotnumber)
    sns.boxplot(x="Response", y=df[feature], data=df, ax=ax)
    ax.set_xlabel(feature)
    plotnumber = plotnumber + 1
plt.show()

In [None]:
# Count plots for the string-valued columns, bars ordered by frequency.
fig = plt.figure(figsize = [15,20])
cols = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']
cnt = 1
for col in cols:
    ax = plt.subplot(5, 2, cnt)
    cnt += 1
    by_frequency = df[col].value_counts().index
    sns.countplot(data=df, x=col, order=by_frequency, ax=ax)
    if col == 'Vehicle_Damage':
        plt.xticks(rotation=90)
    plot_name = "Countplot for column : {}".format(col)
    ax.set_title(plot_name, fontsize=15)
plt.tight_layout()
plt.show()

In [None]:
# Grid of count plots: the row picks the column shown on the x axis, the grid
# column picks the column used for hue.
columns = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']
fig = plt.figure(figsize = (30, 20))
fig.subplots_adjust(hspace = 0.4, wspace = 0.4)
n_side = len(columns)
for i in range(1, n_side**2 + 1):
    x_idx, hue_idx = divmod(i - 1, n_side)
    ax = fig.add_subplot(n_side, n_side, i)
    # x is passed by keyword (recent seaborn rejects positional vectors) and the
    # plot is pinned to the subplot that was just created.
    sns.countplot(x = columns[x_idx], hue = columns[hue_idx], data = df, ax = ax)
    ax.legend(loc = 'best')
plt.show()

In [None]:
def Count_categorcial_variables(df):
    """Print a text summary of every object-dtype column of ``df``.

    For each such column the value counts are printed under a banner line;
    afterwards one line per column reports how many distinct values it holds
    (as given by ``Series.unique``). Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. Note that it shadows the notebook-level ``df``.
    """
    object_columns = list(df.select_dtypes(include=['object']).columns)

    for name in object_columns:
        print("------------",name," value counts---------------------")
        print(df[name].value_counts())

    print("\n\n------------Number of categories in each columns---------------------")
    for name in object_columns:
        n_categories = len(df[name].unique())
        print("There are {} categories in {}".format(n_categories, name))
Count_categorcial_variables(df)

# Univariate Analysis

In [None]:
df['Gender'].value_counts()

In [None]:
plt.subplots(figsize=(10,7))
sns.countplot(y = df['Gender'])

In [None]:
plt.figure(figsize=(10,5))
df['Gender'].value_counts().plot.pie(autopct="%0.2f%%")

In [None]:
df['Age'].value_counts()

In [None]:
plt.subplots(figsize=(20,10))
sns.countplot(y = df['Age'])

In [None]:
df['Driving_License'].value_counts()

In [None]:
plt.subplots(figsize=(10,7))
sns.countplot(y = df['Driving_License'])

In [None]:
plt.figure(figsize=(10,5))
df['Driving_License'].value_counts().plot.pie(autopct="%0.2f%%")

In [None]:
df['Region_Code'].value_counts()

In [None]:
plt.subplots(figsize=(15,10))
sns.countplot(y = df['Region_Code'])

In [None]:
plt.figure(figsize=(15,10))
df['Region_Code'].value_counts().plot.pie(autopct="%0.2f%%")

In [None]:
df['Previously_Insured'].value_counts()

In [None]:
plt.subplots(figsize=(10,5))
sns.countplot(y = df['Previously_Insured'])

In [None]:
plt.figure(figsize=(10,7))
df['Previously_Insured'].value_counts().plot.pie(autopct="%0.2f%%")

In [None]:
df['Vehicle_Age'].value_counts()

In [None]:
plt.subplots(figsize=(10,5))
sns.countplot(y = df['Vehicle_Age'])

In [None]:
plt.figure(figsize=(10,7))
df['Vehicle_Age'].value_counts().plot.pie(autopct="%0.2f%%")

In [None]:
df['Vehicle_Damage'].value_counts()

In [None]:
plt.subplots(figsize=(10,5))
sns.countplot(y = df['Vehicle_Damage'])

In [None]:
plt.figure(figsize=(10,7))
df['Vehicle_Damage'].value_counts().plot.pie(autopct="%0.2f%%")

In [None]:
df['Annual_Premium'].value_counts()

In [None]:
df['Policy_Sales_Channel'].value_counts()

In [None]:
df['Vintage'].value_counts()

In [None]:
df['Response'].value_counts()

In [None]:
plt.subplots(figsize=(10,5))
sns.countplot(y = df['Response'])

In [None]:
plt.figure(figsize=(10,7))
df['Response'].value_counts().plot.pie(autopct="%0.2f%%")

# Dist Plot

In [None]:
plt.figure(figsize = (10, 10))
sns.distplot(df["Age"])
plt.show()

In [None]:
plt.figure(figsize = (10, 10))
sns.distplot(df["Region_Code"])
plt.show()

In [None]:
plt.figure(figsize = (10, 10))
sns.distplot(df["Annual_Premium"])
plt.show()

In [None]:
plt.figure(figsize = (10, 10))
sns.distplot(df["Policy_Sales_Channel"])
plt.show()

In [None]:
plt.figure(figsize = (10, 10))
sns.distplot(df["Vintage"])
plt.show()

# Bar Plot

In [None]:
sns.barplot(x = 'Gender', y = 'Response', data = df)

In [None]:
plt.figure(figsize = (20, 7))
sns.barplot(x = 'Age', y = 'Response', data = df)

In [None]:
sns.barplot(x = 'Driving_License', y = 'Response', data = df)

In [None]:
sns.barplot(x = 'Previously_Insured', y = 'Response', data = df)

In [None]:
sns.barplot(x = 'Vehicle_Age', y = 'Response', data = df)

In [None]:
sns.barplot(x = 'Vehicle_Damage', y = 'Response', data = df)

In [None]:
# Top 10 regions with highest number of insurers
# head(10) always takes the first ten rows by position. `value_counts()[:10]`
# is a *label* slice when the index holds floats, which does not select the
# ten most frequent codes (assumes Region_Code may be float - check df.dtypes).
top_regions = df['Region_Code'].value_counts().head(10)
labels= top_regions.keys()
values= top_regions

plt.figure(figsize = (10, 5))
graph = sns.barplot(x = labels, y = values)

# Write the count above each bar.
for p in graph.patches:
        graph.annotate('{:.0f}'.format(p.get_height()), (p.get_x()+0.4, p.get_height()),
                    ha='center', va='bottom',
                    color= 'black')

In [None]:
# Top 10 policy channels covering highest number of insurers
# head(10) always takes the first ten rows by position. `value_counts()[:10]`
# is a *label* slice when the index holds floats, which does not select the
# ten most frequent channels (assumes Policy_Sales_Channel may be float - check df.dtypes).
top_channels = df['Policy_Sales_Channel'].value_counts().head(10)
labels= top_channels.keys()
values= top_channels

plt.figure(figsize = (15, 5))
graph = sns.barplot(x = labels, y = values)

# Write the count above each bar.
for p in graph.patches:
        graph.annotate('{:.0f}'.format(p.get_height()), (p.get_x()+0.4, p.get_height()),
                    ha='center', va='bottom',
                    color= 'black')

# Box Plot

In [None]:
sns.boxplot(x = 'Gender', y = 'Age', data = df)

In [None]:
plt.figure(figsize=(15,5))
sns.boxplot(y = 'Age', x = 'Gender', hue = "Previously_Insured", data = df)

# Count Plot

In [None]:
plt.figure(figsize = (10, 5))
sns.countplot(x = 'Gender', hue = 'Response', data = df)

In [None]:
plt.figure(figsize = (10, 5))
sns.countplot(x = 'Driving_License', hue = 'Response', data = df)

In [None]:
plt.figure(figsize = (10, 5))
sns.countplot(x = 'Previously_Insured', hue = 'Response', data = df)

In [None]:
plt.figure(figsize = (10, 5))
sns.countplot(x = 'Vehicle_Age', hue = 'Response', data = df)

In [None]:
plt.figure(figsize = (10, 5))
sns.countplot(x = 'Vehicle_Damage', hue = 'Response', data = df)

In [None]:
plt.figure(figsize = (10, 5))
sns.countplot(x = 'Gender', hue = 'Previously_Insured', data = df)

In [None]:
plt.figure(figsize = (10, 5))
sns.countplot(x = 'Gender', hue = 'Vehicle_Damage', data = df)

In [None]:
plt.figure(figsize = (10, 5))
sns.countplot(x = 'Gender', hue = 'Vehicle_Age', data = df)

In [None]:
plt.figure(figsize = (10, 5))
sns.countplot(x = 'Previously_Insured', hue = 'Vehicle_Damage', data = df)

In [None]:
plt.figure(figsize=(20,7))
# x is passed by keyword: recent seaborn releases accept only `data` positionally.
sns.countplot(x = df['Previously_Insured'], hue = df['Response'], palette = ['Brown','Purple'])

In [None]:
fig, ax =plt.subplots(1,2,figsize=(20,7))
# fig, ax = plt.subplots() 
sns.countplot(data = df,x = 'Gender',hue = 'Previously_Insured',ax = ax[0])                                                                                                               
sns.countplot(data = df,x = 'Gender',hue = 'Vehicle_Damage',ax = ax[1])
fig.show()

In [None]:
fig, ax =plt.subplots(1,2,figsize=(20,7))
sns.countplot(data = df,x = 'Gender',hue = 'Vehicle_Age',ax = ax[0])
sns.countplot(data = df,x = 'Previously_Insured',hue = 'Vehicle_Damage',ax = ax[1])
fig.show()

In [None]:
fig, ax =plt.subplots(1,2,figsize=(20,7))
sns.countplot(data = df,x = 'Gender',hue = 'Vehicle_Damage',ax = ax[0])
sns.countplot(data = df,x = 'Gender',hue = 'Previously_Insured',ax = ax[1])
fig.show()

In [None]:
def without_hue(plot, feature):
    """Annotate every bar of ``plot`` with its height as a percentage, then show the figure.

    Parameters
    ----------
    plot : object exposing ``patches`` and ``annotate``
        Typically the Axes returned by ``sns.countplot``. The bars are read
        from this object; the function no longer depends on a notebook-level
        variable that happens to be called ``ax``.
    feature : sized collection
        ``len(feature)`` is the denominator of the percentages, so it should
        contain exactly the observations that were counted in ``plot``.
    """
    total = len(feature)
    for p in plot.patches:
        percentage = '{:.1f}%'.format(100 * p.get_height()/total)
        # Horizontal centre of the bar, nudged slightly left so the text looks centred.
        x = p.get_x() + p.get_width() / 2 - 0.05
        # Top edge of the bar.
        y = p.get_y() + p.get_height()
        plot.annotate(percentage, (x, y), size = 12)
    plt.show()

In [None]:
plt.figure(figsize = (5, 5))
# Only rows that actually have a Response are counted by the plot, so the same
# rows must form the percentage denominator; otherwise missing values (df also
# holds the concatenated test rows) make the shares add up to less than 100%.
labelled_response = df['Response'].dropna()
# x is passed by keyword: recent seaborn releases accept only `data` positionally.
ax = sns.countplot(x = labelled_response, palette = 'Oranges_r')
without_hue(ax, labelled_response)

In [None]:
# Counts of Categorical variables
fig=plt.figure(figsize=(15,10))
fig.subplots_adjust(hspace = .3, wspace=.2)
x = ['Gender','Driving_License','Previously_Insured','Vehicle_Age','Vehicle_Damage']
for i in range(0,len(x)):
    # Keep the Axes itself (set_title returns a Text object, not the Axes).
    ax = fig.add_subplot(2,3,i+1)
    ax.set_title(x[i])
    # x is passed by keyword (recent seaborn rejects positional vectors) and the
    # plot is pinned to the subplot that was just created.
    graph = sns.countplot(x = df[x[i]], ax = ax)
    
    # Write the count above each bar.
    for p in graph.patches:
        graph.annotate('{:.0f}'.format(p.get_height()), (p.get_x()+0.35, p.get_height()),
                    ha='center', va='bottom',
                    color= 'black')

In [None]:
# Top 10 regions with highest number of insurers
# head(10) always takes the first ten rows by position. `value_counts()[:10]`
# is a *label* slice when the index holds floats, which does not select the
# ten most frequent codes (assumes Region_Code may be float - check df.dtypes).
top_regions = df['Region_Code'].value_counts().head(10)
labels= top_regions.keys()
values= top_regions

plt.figure(figsize = (10, 5))
graph = sns.barplot(x = labels, y = values)

# Write the count above each bar.
for p in graph.patches:
        graph.annotate('{:.0f}'.format(p.get_height()), (p.get_x()+0.4, p.get_height()),
                    ha='center', va='bottom',
                    color= 'black')

In [None]:
# Top 10 policy channels covering highest number of insurers
# head(10) always takes the first ten rows by position. `value_counts()[:10]`
# is a *label* slice when the index holds floats, which does not select the
# ten most frequent channels (assumes Policy_Sales_Channel may be float - check df.dtypes).
top_channels = df['Policy_Sales_Channel'].value_counts().head(10)
labels= top_channels.keys()
values= top_channels

plt.figure(figsize = (15, 5))
graph = sns.barplot(x = labels, y = values)

# Write the count above each bar.
for p in graph.patches:
        graph.annotate('{:.0f}'.format(p.get_height()), (p.get_x()+0.4, p.get_height()),
                    ha='center', va='bottom',
                    color= 'black')

In [None]:
# Exploratory Data Analysis of Categorical Column
# Left panel: plain counts. Right panel: the same counts split by Response.
cols = ["Gender","Vehicle_Damage","Vehicle_Age"]
fig, ax = plt.subplots(len(cols),2,figsize = (15,18))

for i in np.arange(len(cols)):
    # x is passed by keyword: recent seaborn releases accept only `data` positionally.
    sns.countplot(x=df[cols[i]], ax=ax[i][0])
    sns.countplot(x=df[cols[i]], hue=df["Response"], ax=ax[i][1])
plt.show()

In [None]:
plt.figure(figsize = (13,5))
plt.subplot(1,2,1)
# x is passed by keyword: recent seaborn releases accept only `data` positionally.
sns.countplot(x = df['Gender'])
plt.title("Count of Gender")
plt.subplot(1,2,2)
sns.countplot(x = df['Gender'], hue = df['Response'],palette = "rocket_r")
plt.title("Responses in Male and Female")
plt.show()

In [None]:
plt.figure(figsize = (13,5))
plt.subplot(1,2,1)
# x is passed by keyword: recent seaborn releases accept only `data` positionally.
sns.countplot(x = df['Vehicle_Damage'])
plt.title("Count of Yes and No")
plt.subplot(1,2,2)
sns.countplot(x = df['Vehicle_Damage'], hue = df['Response'],palette="rocket_r")
plt.title("Responses in Yes and No category")
plt.show()

In [None]:
plt.figure(figsize = (13,5))
plt.subplot(1,2,1)
# x is passed by keyword: recent seaborn releases accept only `data` positionally.
sns.countplot(x = df['Vehicle_Age'])
plt.title("Count of Vehicle Age Categories")
plt.subplot(1,2,2)
sns.countplot(x = df['Vehicle_Age'], hue = df['Response'],palette="rocket_r")
plt.title("Responses by Age categories")
plt.show()

In [None]:
plt.figure(figsize = (13,5))
plt.subplot(1,2,1)
# x is passed by keyword: recent seaborn releases accept only `data` positionally.
sns.countplot(x = df['Driving_License'])
plt.title("count of Driving License categories")
plt.subplot(1,2,2)
sns.countplot(x = df['Driving_License'], hue = df['Response'],palette="rocket_r")
plt.title("Responses by Driving License categories")
plt.show()

In [None]:
plt.figure(figsize = (20,5))
plt.subplot(2,1,1)
# x is passed by keyword: recent seaborn releases accept only `data` positionally.
sns.countplot(x = df['Region_Code'])
plt.title("Region Representation in the data")

In [None]:
plt.figure(figsize = (13,5))
plt.subplot(1,2,1)
# x is passed by keyword: recent seaborn releases accept only `data` positionally.
sns.countplot(x = df['Previously_Insured'])
plt.title("Customers with and without Vehicle Insurance")
plt.subplot(1,2,2)
sns.countplot(x = df['Previously_Insured'], hue = df['Response'],palette="rocket_r")
plt.title("Response by whether customer has a Vehicle Insurance")
plt.show()

In [None]:
#Counts of Response for each categorical feature
fig=plt.figure(figsize=(20,7))
fig.subplots_adjust(hspace = .5, wspace=.3)
x = ['Gender','Driving_License','Previously_Insured','Vehicle_Age','Vehicle_Damage']
for i in range(0,len(x)):
    # Keep the Axes itself (set_title returns a Text object, not the Axes).
    ax = fig.add_subplot(2,3,i+1)
    ax.set_title(x[i])
    # x is passed by keyword (recent seaborn rejects positional vectors) and the
    # plot is pinned to the subplot that was just created.
    graph = sns.countplot(x = df[x[i]], hue = df['Response'], ax = ax)
    
    # Write the count above each bar.
    for p in graph.patches:
        graph.annotate('{:.0f}'.format(p.get_height()), (p.get_x()+0.2, p.get_height()),
                    ha='center', va='bottom',
                    color= 'black')

In [None]:
# Age brackets. pd.cut builds right-closed intervals, so without
# include_lowest=True the first bracket would be (18, 35] and an 18-year-old
# would get NaN although the label reads '18-35'.
age_bin = [18, 35,45, 60, df['Age'].max()] 
labels_age = ['18-35', '36-45','46-60', '60<'] 
df['age_bin'] = pd.cut(df['Age'], bins=age_bin, labels = labels_age, include_lowest = True)

In [None]:
(pd.crosstab(index = df['age_bin'], columns = df['Response'], normalize = True)*100).round(2)

In [None]:
# Premium brackets. The labels are spelled from the actual bin edges; the
# previous labels (50000 / 150000) described edges that were never used.
value_bin = [0, 30000, 250000, df['Annual_Premium'].max()]
label_bin = ['0 - 30000','30000 - 250000', 'more than 250000']

df['Premi_group'] = pd.cut(df['Annual_Premium'], bins = value_bin, labels = label_bin)

In [None]:
plt.figure(figsize=(10,5))
# NOTE: despite its name, `fig` holds the Axes returned by seaborn.
fig = sns.countplot(x = df['Premi_group'], hue = df['Response'], palette = 'Oranges_r')
# plt.show() takes no figure/axes argument; passing one is read as a flag.
plt.show()

In [None]:
value_bin = [0, 100, 200, df['Vintage'].max()]
label_bin = ['0 - 100 months','100 - 200 months', 'more than 200 months']

df['Vintage_group'] = pd.cut(df['Vintage'], bins = value_bin, labels = label_bin)

# Strip Plot

In [None]:
plt.figure(figsize=(12,9))
sns.stripplot(x = 'Vintage_group', y = 'Annual_Premium', data = df, palette='Oranges_r', dodge=True)
plt.show()

In [None]:
age = [17, 25, 45, 65, df['Age'].max()]
label = ['Milenial', 'Adult', 'Elderly 1', 'Eldery 2']

df['Age_group'] = pd.cut(df['Age'], bins = age, labels = label)

In [None]:
plt.figure(figsize=(12,9))
sns.stripplot(x = 'Age_group', y = 'Annual_Premium', data = df, hue='Response', palette='Oranges_r', dodge=True)
plt.show()

# Violin Plot

In [None]:
plt.figure(figsize = (10,10))
sns.violinplot(x = 'Response', y = 'Vintage', data = df)

In [None]:
plt.figure(figsize = (10,10))
sns.violinplot(x = 'Gender', y = 'Age', data = df)

In [None]:
plt.figure(figsize = (10,10))
sns.violinplot(x = 'Response', y = 'Age', data = df)

In [None]:
plt.figure(figsize = (10,10))
# x/y are passed by keyword: recent seaborn releases accept only `data` positionally.
sns.violinplot(x = 'Response', y = 'Region_Code', data = df)

In [None]:
plt.figure(figsize=(15,5))
sns.violinplot(y = 'Age', x = 'Gender', hue = "Response", data = df)

In [None]:
plt.figure(figsize=(12,8))
sns.violinplot(x = "Driving_License", y = "Age", hue = "Response", data = df, palette = "muted", split = True)
plt.show()

In [None]:
plt.figure(figsize=(12,8))
sns.violinplot(x = "Gender", y = "Age", hue = "Response", data = df, palette = "muted", split = True)
plt.show()

# Scatter Plot

In [None]:
plt.figure(figsize=(12,8))
sns.scatterplot(x = 'Age', y = 'Annual_Premium', data = df, hue = 'Response')
plt.show()

In [None]:
plt.figure(figsize=(12,8))
sns.scatterplot(y = 'Vintage',x = 'Annual_Premium',data = df,hue = 'Response')
plt.show()

# Strip Plot

In [None]:
plt.figure(figsize = (10,10))
sns.stripplot(x = "Gender", y = "Age", data = df)

In [None]:
plt.figure(figsize = (10,10))
sns.stripplot(x = "Response", y = "Age", data = df)

In [None]:
plt.figure(figsize = (10,10))
sns.stripplot(x = "Driving_License", y = "Age", data = df)

In [None]:
plt.figure(figsize = (10,10))
sns.stripplot(x = "Vehicle_Age", y = "Age", data = df)

In [None]:
plt.figure(figsize = (10,10))
sns.stripplot(x = "Vehicle_Damage", y = "Age", data = df)

In [None]:
plt.figure(figsize = (10,10))
sns.stripplot(x = "Response", y = "Vintage", data = df)

# Point Plot

In [None]:
figures, axes = plt.subplots(3, 2, figsize=(20, 15))
ax = sns.pointplot(y = 'Vintage', x = 'Vehicle_Damage', hue = 'Response', data = df, ax = axes[0,0])
ax = sns.pointplot(y = 'Age', x = 'Gender', hue = 'Response', data = df, ax = axes[0,1])
ax = sns.pointplot(y = 'Previously_Insured', x = 'Driving_License', hue = 'Response', data = df, ax = axes[1,0])
ax = sns.pointplot(y = 'Annual_Premium', x = 'Vehicle_Damage', hue = 'Response',data = df,ax = axes[1,1])
ax = sns.pointplot(y = 'Annual_Premium', x = 'Vehicle_Damage', hue = 'Previously_Insured', data = df, ax = axes[2,0])
ax = sns.pointplot(y = 'Annual_Premium', x = 'Vehicle_Age', hue = 'Response', data = df, ax = axes[2,1])

plt.show()

# Line Plot

In [None]:
plt.figure(figsize=(20,7))
# x/y are passed by keyword: recent seaborn releases accept only `data` positionally.
# NOTE(review): Vehicle_Damage still holds 'Yes'/'No' strings at this point (it is
# encoded further down) - confirm that a line plot over it shows what is intended.
sns.lineplot(x = df["Age"], y = df["Vehicle_Damage"])

In [None]:
plt.figure(figsize=(20,7))
# x/y are passed by keyword: recent seaborn releases accept only `data` positionally.
sns.lineplot(x = df["Age"], y = df["Response"])

In [None]:
plt.figure(figsize=(20,7))
# x/y are passed by keyword: recent seaborn releases accept only `data` positionally.
sns.lineplot(x = df["Policy_Sales_Channel"], y = df["Response"])

In [None]:
plt.figure(figsize=(20,7))
# x/y are passed by keyword: recent seaborn releases accept only `data` positionally.
sns.lineplot(x = df["Region_Code"], y = df["Response"])

In [None]:
fig = plt.figure(figsize=(20,7))
# Number of non-missing Response values per Age.
df_age =  df['Response'].groupby(df.Age).count()
df_age = pd.DataFrame(df_age)
# x/y are passed by keyword: recent seaborn releases accept only `data` positionally.
sns.lineplot(x = df_age.index, y = df_age.Response)

In [None]:
fig = plt.figure(figsize=(20,7))
# Number of non-missing Response values per Policy_Sales_Channel.
df_sa =  df['Response'].groupby(df.Policy_Sales_Channel).count()
df_sa = pd.DataFrame(df_sa)
# x/y are passed by keyword: recent seaborn releases accept only `data` positionally.
sns.lineplot(x = df_sa.index, y = df_sa.Response)

In [None]:
fig = plt.figure(figsize=(20,7))
# Number of non-missing Response values per Region_Code.
df_region =  df['Response'].groupby(df.Region_Code).count()
df_region = pd.DataFrame(df_region)
# x/y are passed by keyword: recent seaborn releases accept only `data` positionally.
sns.lineplot(x = df_region.index, y = df_region.Response)

In [None]:
plt.figure(figsize=(20,7))
sns.lineplot(x = df['Age'], y = df['Response'],marker = '*', linestyle = '--', color = 'red')

In [None]:
plt.figure(figsize=(20,7))
sns.lineplot(x = df['Policy_Sales_Channel'], y = df['Response'],marker = '*', linestyle = '--', color = 'red')

In [None]:
plt.figure(figsize=(20,7))
sns.lineplot(x = df['Region_Code'], y = df['Response'],marker = '*', linestyle = '--', color = 'red')

# Cross Tab

In [None]:
pd.crosstab(df['Gender'],df['Response']).style.background_gradient(cmap = 'winter')

In [None]:
pd.crosstab(df['Age'],df['Response']).style.background_gradient(cmap = 'spring')

In [None]:
pd.crosstab(df['Driving_License'],df['Response']).style.background_gradient(cmap = 'autumn')

In [None]:
pd.crosstab(df['Region_Code'],df['Response']).style.background_gradient(cmap = 'cool')

In [None]:
pd.crosstab(df['Previously_Insured'],df['Response']).style.background_gradient(cmap = 'Wistia')

In [None]:
pd.crosstab(df['Vehicle_Age'],df['Response']).style.background_gradient(cmap = 'bwr')

In [None]:
pd.crosstab(df['Vehicle_Damage'],df['Response']).style.background_gradient(cmap = 'seismic')

In [None]:
pd.crosstab(df['Policy_Sales_Channel'],df['Response']).style.background_gradient(cmap = 'PRGn')

In [None]:
pd.crosstab(df['Vintage'],df['Response']).style.background_gradient(cmap = 'PuOr')

In [None]:
((pd.crosstab(index = df['Vintage_group'], columns = df['Response'], normalize = 'columns')*100)).round(2)

# Pivot Table

In [None]:
table = pd.pivot_table(data = df,index = ['Gender'])
table

In [None]:
table = pd.pivot_table(data = df,index = ['Age'])
table

In [None]:
table = pd.pivot_table(data = df,index = ['Driving_License'])
table

In [None]:
table = pd.pivot_table(data = df,index = ['Region_Code'])
table

In [None]:
table = pd.pivot_table(data = df,index = ['Previously_Insured'])
table

In [None]:
table = pd.pivot_table(data = df,index = ['Vehicle_Age'])
table

In [None]:
table = pd.pivot_table(data = df,index = ['Vehicle_Damage'])
table

In [None]:
table = pd.pivot_table(data = df,index = ['Gender','Response'])
table

In [None]:
table = pd.pivot_table(data = df,index = ['Age','Response'])
table

In [None]:
table = pd.pivot_table(data = df,index = ['Driving_License','Response'])
table

In [None]:
table = pd.pivot_table(data = df,index = ['Region_Code','Response'])
table

In [None]:
table = pd.pivot_table(data = df,index = ['Previously_Insured','Response'])
table

In [None]:
table = pd.pivot_table(data = df,index = ['Vehicle_Age','Response'])
table

In [None]:
table = pd.pivot_table(data = df,index = ['Vehicle_Damage','Response'])
table

In [None]:
table = pd.pivot_table(data = df,index = ['Vintage','Response'])
table

# FacetGrid and KdePlot

In [None]:
g = sns.FacetGrid(df, hue='Response', height = 7)
g.map(sns.kdeplot, 'Age')
plt.legend()
plt.show()

In [None]:
g = sns.FacetGrid(df, hue='Gender', height = 7)
g.map(sns.kdeplot, 'Age')
plt.legend()
plt.show()

In [None]:
g = sns.FacetGrid(df, hue='Vehicle_Age', height = 7, aspect = 2)
g.map(sns.kdeplot, 'Age')
plt.legend()
plt.show()

In [None]:
g = sns.FacetGrid(df, hue='Response', height = 7,  aspect = 2, xlim = (0, 125000))
g.map(sns.kdeplot, 'Annual_Premium')
plt.legend()
plt.show()

In [None]:
g = sns.FacetGrid(df, hue='Response', height = 7)
g.map(sns.kdeplot, 'Vintage')
plt.legend()
plt.show()

In [None]:
# KDE of each continuous variable, one curve per Response value.
cont_var = ['Age', 'Annual_Premium', 'Vintage']
fig, ax = plt.subplots(2,2, figsize=(30,10))
panels = ax.ravel()

for i in range(len(cont_var)):
    panel = panels[i]
    sns.kdeplot(df.loc[df['Response']==0, cont_var[i]], label='0', ax=panel)
    sns.kdeplot(df.loc[df['Response']==1, cont_var[i]], label='1', ax=panel)
    
    panel.set_title(f'KDE plot of {cont_var[i]}', fontsize=20)
    # The curves are labelled but nothing draws the legend unless asked to,
    # which would leave the two curves indistinguishable.
    panel.legend(title='Response')

# The grid has more panels than variables; hide the ones left empty.
for panel in panels[len(cont_var):]:
    panel.set_visible(False)

In [None]:
plt.figure(figsize=(15,6))
sns.kdeplot(df['Annual_Premium'])
plt.xlabel('Annual Premium', fontsize = 14)
plt.title('Annual Premium distribution', fontsize = 18)
plt.show()

In [None]:
# Encode the three string-valued features as integers.
# Values that are not keys of the mapping (including already-encoded numbers and
# NaN) pass through unchanged, so re-running the cell is harmless.
# pd.to_numeric then gives the column a real numeric dtype instead of leaving
# it as `object`; it raises if an unmapped string is still present.
category_codes = {
    'Gender': {'Male': 1, 'Female': 0},
    'Vehicle_Age': {'> 2 Years': 2, '1-2 Year': 1, '< 1 Year': 0},
    'Vehicle_Damage': {'Yes': 1, 'No': 0},
}
for column, codes in category_codes.items():
    df[column] = pd.to_numeric(df[column].map(lambda value, codes=codes: codes.get(value, value)))
df.head()

In [None]:
# Keep only rows that are complete in the columns the models use (the ten
# features plus the target). A bare dropna() would also discard rows whose only
# missing value sits in one of the helper bin columns created for plotting.
model_columns = ['Gender', 'Age', 'Driving_License', 'Region_Code', 'Previously_Insured', 'Vehicle_Age',
                 'Vehicle_Damage', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage', 'Response']
df = df.dropna(subset = model_columns)
df.head()

In [None]:
# Incomplete rows were already removed in the previous cell, so no second,
# in-place drop is needed here; this only reports what is still missing.
df.isnull().sum()

In [None]:
df.columns

# Training And Testing Data

In [None]:
#Classifiers
X = df.loc[:, ['Gender', 'Age', 'Driving_License', 'Region_Code', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 
               'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']]
X.head()

In [None]:
Y = df.loc[:, ['Response']]
Y.head()

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.25, random_state=42, shuffle = True)

# Linear Regression

In [None]:
regressor = LinearRegression()  
regressor.fit(X_train, Y_train) #training the algorithm
#To retrieve the intercept:
print(regressor.intercept_)

#For retrieving the slope:
print(regressor.coef_)

In [None]:
Y_pred = regressor.predict(X_test)
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test, Y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(Y_test, Y_pred))  
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(Y_test, Y_pred)))

In [None]:
# Model initialization
regression_model = LinearRegression()
# Fit the data(train the model)
regression_model.fit(X, Y)
# Predict (on the same rows the model was fitted on)
Y_pred = regression_model.predict(X)

# model evaluation
# mean_squared_error returns the MSE; the root has to be taken explicitly
# for the value to match the "Root mean squared error" label printed below.
mse = mean_squared_error(Y, Y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(Y, Y_pred)

# printing values
print('Slope:' ,regression_model.coef_)
print('Intercept:', regression_model.intercept_)
print('Root mean squared error: ', rmse)
print('R2 score: ', r2)

In [None]:
import statsmodels.api as sm

# Synthetic OLS demo. It uses its own variable names so the feature matrix X
# and target Y built above are not overwritten, and a seeded generator so the
# printed summary is the same on every run.
rng = np.random.default_rng(42)
x_demo = rng.random(100)
y_demo = x_demo + rng.random(100)*0.1

results = sm.OLS(y_demo, sm.add_constant(x_demo)).fit()

print(results.summary())

plt.scatter(x_demo, y_demo)

X_plot = np.linspace(0,1,100)
# add_constant puts the constant column first, so params[0] is the intercept
# and params[1] the slope (the two were swapped in the plotted line before).
plt.plot(X_plot, results.params[0] + results.params[1]*X_plot)

plt.show()

In [None]:
X = df.iloc[:, 0].values.reshape(-1, 1)
Y = df.iloc[:, 1].values.reshape(-1, 1)
linear_regressor = LinearRegression()
linear_regressor.fit(X, Y)
Y_pred = linear_regressor.predict(X)

In [None]:
plt.scatter(X, Y)
plt.plot(X, Y_pred, color='red')
plt.show()

In [None]:
from sklearn import linear_model
# with sklearn
regr = linear_model.LinearRegression()
regr.fit(X, Y)

print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)

# Random Forest Classifier Model

In [None]:
#RANDOM FOREST MODEL
random_classifier = RandomForestClassifier(n_estimators=5, random_state=0)
random_classifier.fit(X_train, Y_train)
random_prediction = random_classifier.predict(X_test)

In [None]:
#Model Accuracy
print(confusion_matrix(Y_test, random_prediction))
print(classification_report(Y_test, random_prediction))
print('Model Accuracy: ',accuracy_score(Y_test, random_prediction))

In [None]:
# create and fit RandomForestClassifier model  
rfc=RandomForestClassifier()
rfc.fit(X_train, Y_train)

In [None]:
#predict
pred = rfc.predict(X_test)
pred

In [None]:
rfc_acc= accuracy_score(Y_test, pred)
print('The Accuracy Score Using The Random Forest Classifier (before resample) is :',rfc_acc)

In [None]:
print(classification_report(Y_test, pred))

# Decision Tree Classifier Model

In [None]:
# create and fit DecisionTreeClassifier model
dtc = DecisionTreeClassifier()
dtc.fit(X_train,Y_train)

In [None]:
#predict
Y_pred = dtc.predict(X_test)
Y_pred

In [None]:
cm = confusion_matrix(Y_test, Y_pred)  
cm

In [None]:
dtc_acc = accuracy_score(Y_test, Y_pred)
dtc_acc
print('The accuracy score with using the decision tree classifier is :',dtc_acc)

In [None]:
print(classification_report(Y_test, Y_pred))

# K-Nearest Neighbors Classifier Model

In [None]:
# create and fit KNeighborsClassifier model
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train,Y_train)

In [None]:
#predict
Y_pred = knn.predict(X_test)

In [None]:
cm = confusion_matrix(Y_test, Y_pred)  
cm

In [None]:
#KNN accuracy score

Knn_acc= accuracy_score(Y_test, knn.predict(X_test))
print('The accuracy socre using the KNeighborsClassifier is :',Knn_acc)

In [None]:
print(classification_report(Y_test, Y_pred))

# Evaluation
Comparing Model Accuracy

In [None]:
pd.DataFrame([rfc_acc, dtc_acc, Knn_acc]).plot.bar();
plt.xticks(np.arange(3),('RFC','DTC','KNN'))
plt.legend().remove()
plt.ylim(0,1)
plt.ylabel('Accuracy')
plt.xlabel('Models')
plt.xticks(rotation = 0)
plt.title('Comparing Model Accuracy');
# plt.savefig("abc.png")
# files.download("abc.png")

In [None]:
# ROC/AUC curve
plt.figure(figsize = (10,10))
ax = plt.gca()
ax.set_title('Receiver Operating Characteristic',size = 15)


def _draw_roc(estimator, name):
    """Draw the ROC curve of a fitted classifier on ``ax`` and return the display object.

    ``plot_roc_curve`` no longer exists in recent scikit-learn releases, where
    ``RocCurveDisplay.from_estimator`` replaces it. The newer API is used when
    available and the old function is the fallback, so the cell runs on both.
    """
    y_true = np.ravel(Y_test)  # flatten the single-column target
    display_cls = getattr(metrics, 'RocCurveDisplay', None)
    from_estimator = getattr(display_cls, 'from_estimator', None)
    if from_estimator is not None:
        return from_estimator(estimator, X_test, y_true, ax = ax, alpha = 0.8, name = name)
    return plot_roc_curve(estimator, X_test, y_true, ax = ax, alpha = 0.8, name = name)


rfc_disp = _draw_roc(rfc, 'RandomForestClassifier')
tree_disp = _draw_roc(dtc, 'DecisionTreeClassifier')
knn_disp = _draw_roc(knn, 'KNeighborsClassifier')