In [None]:
import numpy as np 
import pandas as pd
pd.options.display.max_columns = None
%config Completer.use_jedi = False

from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the raw train and test CSVs (in that order) from the Kaggle input folder.
train_data, test_data = map(pd.read_csv, (
    '../input/carinsurance/carInsurance_train.csv',
    '../input/carinsurance/carInsurance_test.csv',
))

In [None]:
# Quick look at the first five raw rows before any preprocessing.
train_data.head(n=5)

# Preprocessing

In [None]:
def onehot_encode(train_data, columns):
    """Replace categorical column(s) with one-hot indicator columns.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame containing the column(s) to encode. It is not modified.
    columns : str or list of str
        Column label(s) to encode; also used as the prefix of the new
        indicator columns (``<column>_<category>``).

    Returns
    -------
    pandas.DataFrame
        A new frame holding the untouched columns in their original order,
        followed by the indicator columns.
    """
    indicator_cols = pd.get_dummies(train_data[columns], prefix=columns)
    untouched_cols = train_data.drop(columns, axis=1)
    return pd.concat([untouched_cols, indicator_cols], axis=1)

In [None]:
def preprocess_inputs(train_data):
    """Return a cleaned and encoded copy of the raw car-insurance frame.

    Steps, in order:
      1. Recode ``DaysPassed == -1`` as NaN.
      2. Drop ``Id`` and every column whose share of missing values is
         above 25%.
      3. Integer-encode ``Communication`` and ``Education``; missing (and
         unmapped) entries are filled with the column median.
      4. Drop every row that still contains a missing value.
      5. Integer-encode ``Marital`` and ``LastContactMonth``.
      6. One-hot encode ``Job`` through ``onehot_encode``.
      7. Replace ``CallStart`` / ``CallEnd`` with ``callduration``.

    The input frame is copied first and is never modified.
    """
    frame = train_data.copy()

    # -1 in DaysPassed is recoded as missing so the column is judged by the
    # sparsity rule below (original note: more than 3000 rows carry -1).
    frame['DaysPassed'] = frame['DaysPassed'].replace(-1, np.nan)

    # Remove the identifier, then every column that is more than 25% empty.
    frame = frame.drop('Id', axis=1)
    missing_share = frame.isna().mean()
    sparse_columns = missing_share[missing_share > 0.25].index
    frame = frame.drop(sparse_columns, axis=1)

    # Map the two ordered categoricals to integers; gaps get the median code.
    communication_codes = frame.Communication.map({'telephone': 0, 'cellular': 1})
    frame['Communication'] = (
        communication_codes.fillna(communication_codes.median()).astype('int')
    )

    education_codes = frame.Education.map({'tertiary': 3, 'primary': 1, 'secondary': 2})
    frame['Education'] = (
        education_codes.fillna(education_codes.median()).astype('int')
    )

    # Discard whatever rows are still incomplete (original note: this removes
    # only the 19 rows with a missing Job).
    frame = frame.dropna()

    # Integer codes for marital status and calendar month (jan=1 ... dec=12).
    marital_codes = {'single': 1, 'married': 2, 'divorced': 3}
    month_abbreviations = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                           'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    month_codes = {
        abbreviation: number
        for number, abbreviation in enumerate(month_abbreviations, start=1)
    }
    frame['Marital'] = frame.Marital.map(marital_codes)
    frame['LastContactMonth'] = frame.LastContactMonth.map(month_codes)

    # Job has no natural order, so it is one-hot encoded instead.
    frame = onehot_encode(frame, 'Job')

    # Call duration: the ``.seconds`` component of CallEnd - CallStart
    # (any whole-day part of the difference is not included).
    call_end = pd.to_datetime(frame['CallEnd'])
    call_start = pd.to_datetime(frame['CallStart'])
    frame['callduration'] = (call_end - call_start).apply(lambda delta: delta.seconds)

    return frame.drop(['CallEnd', 'CallStart'], axis=1)


In [None]:
# Swap the raw frame for its preprocessed version. This cell is not
# re-runnable: a second run raises KeyError because preprocess_inputs has
# already dropped 'Id'. Reload the CSVs first if you need to run it again.
train_data = train_data.pipe(preprocess_inputs)

# Scaling and Splitting

In [None]:
# Separate the target column from the feature matrix.
y = train_data['CarInsurance']
X = train_data.drop(columns='CarInsurance')

In [None]:
# NOTE: these two imports repeat the ones already made in the imports cell at
# the top of the notebook; they are harmless but redundant.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Unfitted scaler instance; fitting happens later, on the training split.
sc = StandardScaler()

In [None]:
# 70% of the rows go to training, the rest to testing; the fixed seed keeps
# the partition identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=1,
)

In [None]:
# Fit the scaler on the training split only, then apply that same transform
# to both splits, so the scaling statistics come from training data alone.
sc.fit(X_train)

# Rebuild DataFrames from the scaled arrays, carrying over each split's own
# column labels AND row index. Without index=..., the new frames would get a
# fresh 0..n-1 index while y_train / y_test keep their original (shuffled) row
# labels, so any later index-aligned pandas operation would pair features with
# the wrong targets.
X_train = pd.DataFrame(sc.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(sc.transform(X_test), columns=X_test.columns, index=X_test.index)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Candidate classifiers, all left at library defaults except for a fixed seed
# on the estimators that accept random_state and can use randomness while
# fitting (LinearSVC, GradientBoostingClassifier). The seed matches the
# random_state=1 used for the train/test split, so a full re-run of the
# notebook reproduces the same scores.
models = {
    'Knn': KNeighborsClassifier(),
    'lr': LogisticRegression(),
    'LSVC': LinearSVC(random_state=1),
    'svc': SVC(),
    'GB': GradientBoostingClassifier(random_state=1),
}

# Fit every model on the scaled training split; the fitted estimators stay in
# `models` for later evaluation.
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + ' trained')

# Results

In [None]:
# Report each fitted model's accuracy on the held-out split as a percentage.
# (.score on these classifiers returns mean accuracy in [0, 1].)
for name, model in models.items():
    print('{} Accuracy: {:.2f}%'.format(name, model.score(X_test, y_test) * 100))