# Task for Today  

***

## Car Insurance Cold Call Success Prediction  

Given *data about car insurance cold calls*, let's try to predict whether a given call will be **successful** or not.

We will use a variety of classification models to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Load the car-insurance cold-call training set into a DataFrame.
data = pd.read_csv('../input/carinsurance/carInsurance_train.csv')

In [None]:
# Echo the loaded frame as the cell output for a quick visual check.
data

In [None]:
# Print the per-column summary (dtypes and non-null counts) of the raw data.
data.info()

# Preprocessing

In [None]:
def onehot_encode(df, column):
    """Return a new frame in which ``column`` is replaced by indicator columns.

    One indicator column is produced per distinct value of ``df[column]``,
    named ``"<column>_<value>"``, and appended after the remaining columns.
    The caller's frame is left untouched.
    """
    indicators = pd.get_dummies(df[column], prefix=column)
    remaining = df.drop(column, axis=1)
    return pd.concat([remaining, indicators], axis=1)

In [None]:
def preprocess_inputs(df):
    """Clean, encode, split and scale the raw call data.

    Processing steps, in order:
      1. Drop the ``Id`` and ``Outcome`` columns.
      2. Fill missing ``Job``, ``Education`` and ``Communication`` values
         with each column's mode.
      3. Derive ``CallDuration`` (seconds) from ``CallEnd - CallStart`` and
         drop the two timestamp columns.
      4. Encode ``Communication`` (binary), ``Education`` and
         ``LastContactMonth`` (ordinal) as integers.
      5. One-hot encode ``Job`` and ``Marital``.
      6. Split 70/30 into train and test sets (shuffled, ``random_state=1``).
      7. Standardise the features with a scaler fitted on the train set only.

    Args:
        df: Raw DataFrame containing the columns referenced above plus the
            ``CarInsurance`` target. The input frame is not modified.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``; the ``X`` parts are
        scaled DataFrames that keep their original index and column labels.
    """
    df = df.copy()

    # Id is an identifier; Outcome is dropped because it has too many
    # missing values.
    df = df.drop(['Id', 'Outcome'], axis=1)

    # Fill categorical missing values with column modes.
    # NOTE(review): the modes are computed on the full frame, i.e. before the
    # train/test split below.
    for column in ['Job', 'Education', 'Communication']:
        df[column] = df[column].fillna(df[column].mode()[0])

    # Extract duration feature. The vectorised ``.dt.seconds`` accessor reads
    # the same seconds component as a per-row ``Timedelta.seconds`` lookup,
    # without a Python-level call for every row.
    call_length = pd.to_datetime(df['CallEnd']) - pd.to_datetime(df['CallStart'])
    df['CallDuration'] = call_length.dt.seconds
    df = df.drop(['CallStart', 'CallEnd'], axis=1)

    # Binary and ordinal encodings, applied column by column.
    month_order = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                   'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    encodings = {
        'Communication': {'telephone': 0, 'cellular': 1},
        'Education': {'primary': 0, 'secondary': 1, 'tertiary': 2},
        'LastContactMonth': {month: position for position, month in enumerate(month_order)},
    }
    for column, mapping in encodings.items():
        df[column] = df[column].replace(mapping)

    # One-hot encoding of the nominal columns.
    for column in ['Job', 'Marital']:
        df = onehot_encode(df, column)

    # Split df into X and y.
    y = df['CarInsurance']
    X = df.drop('CarInsurance', axis=1)

    # Train-test split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)

    # Scale X: fit on the training rows only, then apply to both parts.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [None]:
# Run the full preprocessing pipeline on the loaded data to obtain the
# scaled train/test features and their targets.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [None]:
# Echo the scaled training features as the cell output.
X_train

In [None]:
# Echo the training targets as the cell output.
y_train

# Training

In [None]:
# Candidate classifiers, all with default hyperparameters. The keys are
# left-padded to a common width so the printed report lines up in a column.
models = {
    "                   Logistic Regression": LogisticRegression(),
    "                   K-Nearest Neighbors": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(),
    "Support Vector Machine (Linear Kernel)": LinearSVC(),
    "   Support Vector Machine (RBF Kernel)": SVC(),
    "                        Neural Network": MLPClassifier(),
    "                         Random Forest": RandomForestClassifier(),
    "                     Gradient Boosting": GradientBoostingClassifier(),
}

# Fit every candidate on the training split, reporting progress as we go.
for name, model in models.items():
    model.fit(X_train, y_train)
    print(f"{name} trained.")

# Results

In [None]:
# Report each fitted model's score on the held-out test split as a percentage.
for name, model in models.items():
    accuracy = model.score(X_test, y_test)
    print(f"{name}: {accuracy * 100:.2f}%")

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/m0Vl--yH578