In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# Any results you write to the current directory are saved as output.

1. **Import Libraries**

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE
from sklearn import neighbors
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample

**2. Import Data**

This is a dataset from IBM Watson Analytics. It gives you information about your customers, so that you can predict their behavior and retain them. You can analyze all relevant customer data and develop focused customer retention programs. It also helps you to understand customer demographics and buying behavior. Use predictive analytics to analyze the most profitable customers and how they interact. Take targeted actions to increase profitable customer response, retention, and growth.

In [None]:
# Load the marketing customer-value CSV from the Kaggle input directory.
data = pd.read_csv(
    "/kaggle/input/ibm-watson-marketing-customer-value-data/"
    "WA_Fn-UseC_-Marketing-Customer-Value-Analysis.csv"
)

In [None]:
# Preview the first rows of the loaded dataset.
data.head()

**3. Exploratory Data Analysis (EDA)**

In [None]:
# List the column names of the dataset.
data.columns

In [None]:
# Number of rows and columns in the dataset.
data.shape

In [None]:
# Data type of every column.
data.dtypes

In [None]:
# Summary statistics of the dataset's columns.
data.describe()

The data contains 9134 customers with information about their income, education, gender, residence and so on. Each customer owns a car, and you as the entrepreneur offer them 4 different car insurances. The target of this dataset is the Response. The response can be "Yes" - the customer accepted the offer - or "No" - the customer didn't accept the offer.

In [None]:
# Number of "Yes"/"No" responses, split by gender.
# `x` is passed by keyword: recent seaborn releases accept only `data` as a
# positional argument, so a positional "Response" collides with `data=`.
sns.countplot(data=data, x="Response", hue="Gender")

In [None]:
# Absolute number of customers per Response value.
data.Response.value_counts()

Only 1308 customers accepted the offer.

In [None]:
# Share of customers whose Response is "Yes", as a percentage with two decimals.
n_accepted = len(data[data.Response == "Yes"])
n_customers = len(data.Response)
acceptance_pct = round(n_accepted / n_customers * 100, 2)
print("Only", acceptance_pct, "%", "of our customer accept an offer made by your Sales Team.")

In [None]:
# Number of recorded responses (i.e. offers made) per sales channel.
data.groupby("Sales Channel")[["Response"]].count()

Most offers were made by agents (3477 offers), the fewest via the web.

In [None]:
# Acceptance rate per sales channel, reported in order of first appearance in the data.
channel = data["Sales Channel"].unique().tolist()
is_accepted = data["Response"] == "Yes"
for name in channel:
    in_channel = data["Sales Channel"] == name
    n_offers = len(data[in_channel])
    n_accepted = len(data[in_channel & is_accepted])
    output = n_accepted / n_offers
    print(round((output * 100),2), "% of offers via the Sales Channel", name, "were accepted.")

In [None]:
# Show the column dtypes again before inspecting the object-typed columns.
data.dtypes

In [None]:
# Names of the categorical columns whose value distribution is inspected below.
objects = [
    "State",
    "Response",
    "Coverage",
    "Education",
    "EmploymentStatus",
    "Gender",
    "Location Code",
    "Marital Status",
    "Policy Type",
    "Policy",
    "Renew Offer Type",
    "Sales Channel",
    "Vehicle Class",
    "Vehicle Size",
]

# Print the frequency of every distinct value, one column at a time.
for column in objects:
    counts = data[column].value_counts()
    print(counts)

**Results**

All categorical features are well distributed, so I will keep them and encode them as numerical data.

Some columns don't make sense or are not very important, e.g. Customer (because it's just a unique identifier), Policy (it is the same as Policy Type) and Effective To Date (also not important), so I will drop them.

The data is imbalanced regarding the outcome "Response".

**4. Data Analysis**

In [None]:
# Drop the identifier, the column duplicated by "Policy Type", and the date column.
# A list is used instead of a set literal, and errors="ignore" keeps the cell
# idempotent: re-running it after the columns are gone no longer raises KeyError.
data = data.drop(columns=["Customer", "Policy", "Effective To Date"], errors="ignore")

In [None]:
# Collect the names of all object-dtype (categorical) columns.

data_categorial = data.select_dtypes(include=["object"])
categories = data_categorial.columns.tolist()
categories

In [None]:
# Replace every categorical column with integer codes.

lb = LabelEncoder()

for column in categories:
    # Refit the shared encoder on each column, then overwrite the column with its codes.
    lb.fit(data[column])
    data[column] = lb.transform(data[column])


In [None]:
# Heatmap of the pairwise correlations between the columns.
correlations = data.corr()
sns.heatmap(correlations)

**5. Supervised Machine Learning with imbalanced data**

Let's now start with predicting the response of future customers.
For that we have to find the right model. Since the data has a target which is separated into Yes/No, we can use classification, a form of supervised machine learning.
The following models can be used:

* Logistic Regression
* K-Nearest Neighbours Classifier
* Support Vector Machine
* Decision Tree

In [None]:
# Target vector: the encoded Response column.
y = data.loc[:, "Response"]

In [None]:
# Feature matrix: every column except the target.
X = data.drop(columns="Response")

In [None]:
# Hold out 20 % of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=29,
)

In [None]:
# Baseline: logistic regression fitted on the original (imbalanced) training split.
lr = LogisticRegression().fit(X_train, y_train)

# Accuracy on the held-out test split, as a percentage.
acc = 100 * lr.score(X_test, y_test)

print("Logistic Regression Test Accuracy", round(acc, 2),"%")

In [None]:
# Baseline: k-nearest-neighbours classifier fitted on the original (imbalanced)
# training split. k is held in one variable so the model and the printed
# message cannot drift apart; the unused prediction and the commented-out
# call were removed.
n_neighbors = 2  # k
knn = KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, y_train)

# Accuracy on the held-out test split, as a percentage.
acc = knn.score(X_test, y_test)*100
print(n_neighbors, "neighbors KNN Score: ", round(acc,2), "%")

In [None]:
# Baseline: support vector classifier with default settings, fitted on the
# original (imbalanced) training split.
svm = SVC()
svm.fit(X_train, y_train)

# Accuracy on the held-out test split, as a percentage.
acc = svm.score(X_test,y_test)*100
print("SVM Algorithm Test Accuracy", round(acc, 2),"%")

In [None]:
# Baseline: decision tree fitted on the original (imbalanced) training split.
# The tree is seeded so that repeated runs of the notebook give the same
# model; DecisionTreeClassifier is already imported in the import cell.
dtc = DecisionTreeClassifier(random_state=29)
dtc.fit(X_train, y_train)

# Accuracy on the held-out test split, as a percentage.
acc = dtc.score(X_test, y_test)*100
print("Decision Tree Test Accuracy", round(acc, 2),"%")

**Results**

The models have a really high accuracy; the Support Vector Machine seems to be the best choice, with more than 99% accuracy.

BUT! This is because the data is imbalanced. The response "No" accounts for 86% of the data, so the models are not useful and don't give an accurate view of the data.

**6. Supervised Learning with balanced Data**

To get a better view of the data, I'm going to downsample the target. In my opinion this is better than oversampling, because we don't give too much weight to one particular target class.

In [None]:
# Downsampling of the majority class.
# It is applied after the train/test split and only to the training data.

# Re-attach the target to the training features so rows can be filtered by class.
X_down = pd.concat([X_train, y_train], axis="columns")

# Split the training rows by encoded Response value (0 and 1).
no_effect = X_down.loc[X_down["Response"] == 0]
effect = X_down.loc[X_down["Response"] == 1]

# Draw, without replacement, as many Response == 0 rows as there are
# Response == 1 rows; the seed makes the draw repeatable.
no_effect_downsampled = resample(
    no_effect,
    replace=False,
    n_samples=len(effect),
    random_state=27,
)

# Balanced training frame: sampled Response == 0 rows followed by all Response == 1 rows.
downsampled = pd.concat([no_effect_downsampled, effect])

# Check the class counts.
downsampled["Response"].value_counts()

In [None]:
# Number of rows and columns of the balanced training frame.
downsampled.shape

In [None]:
# Target of the balanced training data.
y_train_down = downsampled["Response"]

In [None]:
# Features of the balanced training data: every column except the target.
X_train_down = downsampled.drop(columns="Response")

6.1. **LOGISTIC REGRESSION**

In [None]:
# Logistic regression refitted on the balanced (downsampled) training data.
lr = LogisticRegression().fit(X_train_down, y_train_down)

# Predictions and accuracy (as a percentage) on the untouched test split.
y_pred = lr.predict(X_test)
acc = 100 * lr.score(X_test, y_test)

print("Prediction",y_pred[:5])
print("Logistic Regression Test Accuracy", round(acc, 2),"%")

Accuracy is very bad, let's try another model.

6.2. **K-NEAREST NEIGHBOUR** 

In [None]:
# k-nearest-neighbours classifier refitted on the balanced (downsampled) training data.
n_neighbors = 2  # k
knn = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train_down, y_train_down)

# Predictions and accuracy (as a percentage) on the untouched test split.
y_pred = knn.predict(X_test)
acc = 100 * knn.score(X_test, y_test)

print("Prediction:", y_pred[:5])
print(n_neighbors,"neighbors KNN Score: ",round(acc,2),"%")

In [None]:
# Compare accuracy on the data the model was actually fitted on with accuracy
# on the held-out test split. The model was fitted on the downsampled training
# data, so that is what the training score is computed on (X_train also
# contains rows the model never saw during fitting).
acc_train = knn.score(X_train_down, y_train_down)*100
print("The accuracy score for the training data is: ",round(acc_train,2),"%")
acc_test = knn.score(X_test,y_test)*100
print("The accuracy score for the test data is: ",round(acc_test,2),"%")



In [None]:
# 5-fold cross-validation scores of the KNN model on the balanced training data.
cv_results = cross_val_score(
    estimator=knn,
    X=X_train_down,
    y=y_train_down,
    cv=5,
)
cv_results

Accuracy is better, and the scores are also consistent between the training and test data.

6.3. **DECISION TREE**

In [None]:
# Decision tree refitted on the balanced (downsampled) training data.
# The tree is seeded so that the predictions, the confusion matrix and the
# recall/precision computed from them are the same on every run.
dtc = DecisionTreeClassifier(random_state=29)
dtc.fit(X_train_down, y_train_down)

# Predictions and accuracy (as a percentage) on the untouched test split.
y_pred_dtc = dtc.predict(X_test)

acc_dtc = dtc.score(X_test, y_test)*100

print("Prediction", y_pred_dtc[:5])
print("Decision Tree Test Accuracy", round(acc_dtc, 2),"%")

In [None]:
# Compare accuracy on the data the tree was actually fitted on with accuracy
# on the held-out test split. The tree was fitted on the downsampled training
# data, so that is what the training score is computed on (X_train also
# contains rows the tree never saw during fitting).
acc_train = dtc.score(X_train_down, y_train_down)*100
print("The accuracy score for the training data is: ",round(acc_train,2),"%")
acc_test = dtc.score(X_test,y_test)*100
print("The accuracy score for the test data is: ",round(acc_test,2),"%")

In [None]:
# 5-fold cross-validation scores of the decision tree on the balanced training data.
cv_results = cross_val_score(
    estimator=dtc,
    X=X_train_down,
    y=y_train_down,
    cv=5,
)
cv_results

In [None]:
# Confusion matrix of the decision tree's test-set predictions.
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred_dtc)
cnf_matrix

In [None]:
# Recall of the decision tree on the test split.
dtc_recall = recall_score(y_true=y_test, y_pred=y_pred_dtc)
dtc_recall

In [None]:
# Manual cross-check of the recall: TP / (TP + FN), read from the confusion
# matrix (row 1 holds the samples whose true label is 1) instead of
# hard-coding counts copied from one particular run.
# NOTE(review): assumes the former literals 271 and 4 were TP and FN - confirm.
cnf_matrix[1, 1] / cnf_matrix[1].sum()

In [None]:
# Precision of the decision tree on the test split.
dtc_precision = precision_score(y_true=y_test, y_pred=y_pred_dtc)
dtc_precision

The Decision Tree has the best accuracy, and its scores on the training data are consistent. Recall is very high - that's good: the model misses very few customers who would accept the offer, so it can predict quite well that a customer won't accept the offer. In this case, as an entrepreneur you now know which customers you shouldn't invest in. So focus on those customers who will accept an offer.