In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

import os
# Print the full path of every file found under the Kaggle input directory
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

In [None]:
# Read the labelled training set and the unlabelled test set
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/imbalanced-data-practice/aug_train.csv',
        '../input/imbalanced-data-practice/aug_test.csv',
    )
)

In [None]:
# Print the (rows, columns) of the training set, then preview its leading rows
print(train.shape)
train.head()

In [None]:
# Print the (rows, columns) of the test set, then preview its leading rows
print(test.shape)
test.head()

In [None]:
# Column dtypes and non-null counts, used to check for missing values
train.info()

### There are no null values in the data
---
### Next I examine the target variable 'Response'

In [None]:
# List the distinct values of the target column 'Response'
# (expected to be binary: 0 and 1)
train['Response'].unique()

In [None]:
# Check Response percent breakdown.
# The share of each class is the mean of a boolean mask. It is formatted with
# `.2%` instead of `round(x, 4) * 100`, because multiplying after rounding
# reintroduces float noise (e.g. 0.57 * 100 == 56.99999999999999).
negative_share = (train['Response'] == 0).mean()
positive_share = (train['Response'] == 1).mean()
print(f'Response Negative: {negative_share:.2%}')
print(f'Response Positive: {positive_share:.2%}')

### The target variable is highly imbalanced with approximately 16% of its values being positive.

---

### Next I combine the test and training data sets so that any changes performed will only have to be done once.

In [None]:
# Stack the training and test rows into one frame with a fresh 0..n-1 index,
# then discard the target column so only features (and 'id') remain
comb_df = (
    pd.concat([train, test], ignore_index=True)
      .drop(columns='Response')
)
comb_df.info()

In [None]:
# Number of distinct ids; it should equal the number of rows reported above
comb_df['id'].nunique(dropna=False)

---
## Feature Examination & Adjustment

### 1. Gender

In [None]:
# Frequency of each Gender label across the combined data
comb_df['Gender'].value_counts()


### Gender has a fairly even split between male and female customers. Next I will replace the values like so:
* Male : 0
* Female : 1

In [None]:
# Convert Male:0, Female:1
# The encoded column is assigned back rather than calling
# `replace(..., inplace=True)` on the column selection: with pandas
# Copy-on-Write that chained in-place call only updates a temporary object and
# leaves `comb_df` unchanged. Labels missing from the mapping become NaN.
gender_codes = {'Male': 0, 'Female': 1}
comb_df['Gender'] = comb_df['Gender'].map(gender_codes)

### 2. Age

In [None]:
# Summary statistics (count, mean, min/max, quartiles) for Age
comb_df['Age'].describe()

In [None]:
# Histogram of Age, drawn without grid lines
comb_df['Age'].hist(grid=False)

### The vast majority of ages are between 20 and 50, so I opt to create bins here to reduce the number of distinct age values

In [None]:
# Convert Age values into labelled ranges.
# The top edge is open-ended so it matches the '60+' label instead of silently
# turning any age above 85 into NaN.
bins = [20, 30, 40, 50, 60, np.inf]

# Create Bins
# `include_lowest=True` keeps customers aged exactly 20 in the first range;
# pd.cut's default intervals are right-closed, i.e. (20, 30], which would
# otherwise leave those rows as NaN.
comb_df['age_bins'] = pd.cut(x=comb_df['Age'], bins=bins, include_lowest=True,
                             labels=['20-30', '30-40', '40-50', '50-60', '60+'])
comb_df.drop(columns='Age', inplace=True)

In [None]:
# Number of rows that landed in each age range
comb_df['age_bins'].value_counts()

In [None]:
# One-hot encode the age ranges (the first range is dropped and acts as the
# baseline), then swap the 'age_bins' column for the indicator columns
age_dummies = pd.get_dummies(comb_df['age_bins'], prefix='age_range', drop_first=True)
comb_df = pd.concat([comb_df.drop(columns='age_bins'), age_dummies], axis=1)

print(comb_df.shape)
comb_df.head()

### 3. Driving_License

In [None]:
# Frequency of each Driving_License value
comb_df['Driving_License'].value_counts()

### Less than 1% of the samples are individuals without a license. People without a driver's license would not usually be interested in vehicle insurance, so I am going to check whether any of these individuals actually have a positive response:

In [None]:
# Count training rows with no driving licence but a positive response
no_license = train['Driving_License'] == 0
responded_yes = train['Response'] == 1
int((no_license & responded_yes).sum())

### Because only 37 of the people without licenses actually want insurance, I don't see much use for this feature without something else to go along with it, such as a feature that asks whether a person will be getting a new vehicle (or a driver's license) soon. Therefore, I opt to remove it.

In [None]:
# Remove the Driving_License feature from the combined frame
comb_df = comb_df.drop(columns='Driving_License')

### 4. Region_Code

In [None]:
# Distinct Region_Code values present in the combined data
comb_df['Region_Code'].unique()

### There are many different regions represented in this data and the values are NOT ordinal, so I am going to use one-hot encoding (i.e. dummy values) for each region. This will add considerably to the number of features, but should prevent any erroneous calculations by sklearn.

In [None]:
# One-hot encode Region_Code (first level dropped as the baseline) and swap
# the original column for the 'RC_*' indicator columns
rc_dummies = pd.get_dummies(comb_df['Region_Code'], prefix='RC', drop_first=True)
comb_df = pd.concat([comb_df.drop(columns='Region_Code'), rc_dummies], axis=1)
comb_df.shape

### 5. Previously_Insured

In [None]:
# Frequency of each Previously_Insured value
comb_df['Previously_Insured'].value_counts()

### 'Previously_Insured' has a pretty even split between negative and positive responses and  already has values of 0 and 1, so no further action is required on my end.

### 6. Vehicle_Age

In [None]:
# Frequency of each Vehicle_Age label
comb_df['Vehicle_Age'].value_counts()

### There are three metrics for Vehicle_Age and I will change the values like so:
* Between 1-2 Years : 1
* Less than 1 Year : 0
* Greater than 2 Years : 2

In [None]:
# Create Ordinal Values
# Note: this is done because XGBoost doesn't allow < and > in feature names.
# The mapped column is assigned back rather than calling
# `replace(..., inplace=True)` on the column selection: with pandas
# Copy-on-Write that chained in-place call only updates a temporary object and
# leaves `comb_df` unchanged. Labels missing from the mapping become NaN.
vehicle_age_codes = {'< 1 Year': 0,
                     '1-2 Year': 1,
                     '> 2 Years': 2}
comb_df['Vehicle_Age'] = comb_df['Vehicle_Age'].map(vehicle_age_codes)

# One-Hot-Encoding (code 0, '< 1 Year', is dropped and acts as the baseline)
vehicle_age_dummies = pd.get_dummies(comb_df['Vehicle_Age'], prefix='v_age', drop_first=True)
comb_df = pd.concat([comb_df, vehicle_age_dummies], axis=1)
comb_df.drop(columns='Vehicle_Age', inplace=True)
print(comb_df.shape)

### 7. Vehicle_Damage 

In [None]:
# Frequency of each Vehicle_Damage label
comb_df['Vehicle_Damage'].value_counts()

In [None]:
# Convert Vehicle_Damage values (No:0, Yes:1)
# The mapped column is assigned back rather than calling
# `replace(..., inplace=True)` on the column selection: with pandas
# Copy-on-Write that chained in-place call only updates a temporary object and
# leaves `comb_df` unchanged. Labels missing from the mapping become NaN.
vehicle_damage_codes = {'No': 0, 'Yes': 1}
comb_df['Vehicle_Damage'] = comb_df['Vehicle_Damage'].map(vehicle_damage_codes)

### 8. Annual_Premium

In [None]:
# Summary statistics (count, mean, min/max, quartiles) for Annual_Premium
comb_df['Annual_Premium'].describe()

In [None]:
# Show the rows whose annual premium exceeds 100,000
comb_df.loc[comb_df['Annual_Premium'] > 100_000]

In [None]:
# Box plot of Annual_Premium to visualise its spread and the high-end outliers
sns.boxplot(y=comb_df['Annual_Premium'], data=comb_df)

### There is a wide distribution of annual premium amounts, with the high end (>100k) being outliers. I could handle this in a few ways:
1. Remove the outliers (not ideal)
2. Create bins for value ranges (better)
3. Do nothing and scale the values

I am going to start by creating the different ranges for the data like so:
* less than 30k : 0
* between 30k and 100k : 1
* greater than or equal to 100k: 2

In [None]:
# Creating Bins for annual premium value ranges

# Create Ordinal Values
# The masks are built from a snapshot of the raw premiums, so a code written by
# one assignment can never be re-matched by a later condition.
raw_premium = comb_df['Annual_Premium'].copy()
comb_df.loc[raw_premium < 30_000, 'Annual_Premium'] = 0
comb_df.loc[(raw_premium >= 30_000) & (raw_premium < 100_000), 'Annual_Premium'] = 1
# `>=` rather than `>`: a premium of exactly 100,000 belongs to the top tier.
# With the strict comparison it matched no rule, kept its raw value and ended
# up with a one-hot column of its own.
comb_df.loc[raw_premium >= 100_000, 'Annual_Premium'] = 2

# One-Hot-Encoding (tier 0 is dropped and acts as the baseline)
yr_prem_dummies = pd.get_dummies(comb_df['Annual_Premium'], prefix='yr_prem', drop_first=True)
comb_df = pd.concat([comb_df, yr_prem_dummies], axis=1)
comb_df.drop(columns='Annual_Premium', inplace=True)
print(comb_df.shape)

### 9. Policy_Sales_Channel

In [None]:
# Distinct Policy_Sales_Channel values present in the combined data
comb_df['Policy_Sales_Channel'].unique()

### Similar to 'Region_Code', the 'Policy_Sales_Channel' feature is numerical but the values are NOT ordered. Therefore I will need to convert this by using one hot encoding. This will create a large number of new features.

In [None]:
# One-Hot-Encoding
# The indicators get their own name (`psc_dummies`) instead of overwriting
# `rc_dummies`, which holds the Region_Code indicators.
psc_dummies = pd.get_dummies(comb_df['Policy_Sales_Channel'], prefix='PSC', drop_first=True)
comb_df = pd.concat([comb_df, psc_dummies], axis=1)
comb_df.drop(columns='Policy_Sales_Channel', inplace=True)
comb_df.shape

### 10. Vintage
This metric represents the number of days a customer has been insured up until now and is therefore ordinal in nature.

In [None]:
# Histogram of Vintage values
comb_df['Vintage'].hist()

### Because these values are ordinal, I could just leave this feature as-is. However, because the values are so evenly distributed, I'm going to create bins for this as well, using 50-day increments.

In [None]:
# Create Bins (50-day increments coded 1-6)
# Floor-dividing by 50 gives the bin index in a single vectorised step:
#   <50 -> 1, 50-99 -> 2, 100-149 -> 3, 150-199 -> 4, 200-249 -> 5, 250+ -> 6
# `clip` closes both ends. Negative values still go to bin 1, and a Vintage of
# 300 or more now goes to bin 6 instead of keeping its raw value and getting a
# one-hot column of its own.
comb_df['Vintage'] = (comb_df['Vintage'] // 50 + 1).clip(lower=1, upper=6)

In [None]:
# Number of rows per Vintage value after binning
comb_df['Vintage'].value_counts()

In [None]:
# One-hot encode the Vintage bins (first bin dropped as the baseline) and swap
# the original column for the 'vintage_*' indicator columns
vintage_dummies = pd.get_dummies(comb_df['Vintage'], prefix='vintage', drop_first=True)
comb_df = pd.concat([comb_df.drop(columns='Vintage'), vintage_dummies], axis=1)
print(comb_df.shape)
comb_df.head()

print(comb_df.shape)

---
## Modeling 
At this point the data should be good to go for training and testing purposes. However, I must first re-separate the training and testing datasets now that I am done with all the editing.

In [None]:
# Undo the earlier concatenation: the first `train.shape[0]` rows of comb_df
# are the training rows and the rest are the test rows. 'id' is not a feature,
# so it is removed from both feature matrices and kept aside for the test set.
n_train_rows = train.shape[0]
feature_df = comb_df.drop(columns='id')

X_train = feature_df.iloc[:n_train_rows]
X_test = feature_df.iloc[n_train_rows:]

y = train['Response']
X_test_ids = test['id']

In [None]:
def run_model(x, y, name, t, test_size=0.3, random_state=1, scale_pos_weight=19.59):
    """Train one of the supported classifiers and either report on it or return it.

    The data is split into a training part and a hold-out part. The chosen
    model is cross-validated on the training part in two ways and then fitted
    on the training part.

    Parameters
    ----------
    x : feature matrix passed to ``train_test_split``.
    y : target values aligned with ``x``.
    name : {'logistic', 'logistic_bal', 'xgb'}
        Which classifier to build.
    t : str
        ``'train'`` prints the hold-out accuracy and classification report and
        shows a confusion-matrix heat map. Any other value returns the fitted
        model instead.
    test_size : float, default 0.3
        Fraction of the data held out by ``train_test_split``.
    random_state : int, default 1
        Seed for the hold-out split and the shuffled stratified folds.
    scale_pos_weight : float, default 19.59
        Positive-class weight passed to ``XGBClassifier`` (used for 'xgb' only).
        NOTE(review): the XGBoost documentation suggests
        sum(negative) / sum(positive) as a typical value; with the ~16%
        positive rate reported earlier in this notebook that would be roughly
        5 - confirm where 19.59 comes from.

    Returns
    -------
    The fitted model when ``t`` is not ``'train'``; otherwise ``None``.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported model names.
    """
    # Factories are lazy, so only the requested estimator is constructed
    model_factories = {
        'logistic_bal': lambda: LogisticRegression(max_iter=1000,
                                                   solver='liblinear',
                                                   class_weight='balanced'),
        'logistic': lambda: LogisticRegression(max_iter=1000,
                                               solver='liblinear'),
        'xgb': lambda: XGBClassifier(scale_pos_weight=scale_pos_weight),
    }

    # Fail fast with a clear message: previously an unknown name only printed
    # a note and then crashed later because `model` was never assigned.
    if name not in model_factories:
        raise ValueError(
            f'Error, Incorrect Model: {name!r} '
            f'(expected one of {sorted(model_factories)})'
        )
    model = model_factories[name]()

    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state)
    print(X_train.shape)
    print(X_test.shape)

    # Cross-Validation method 1:  cross_val_predict()
    # Percentages use `:.2%`; `np.round(v, 4) * 100` reintroduces float noise.
    cv_pred = cross_val_predict(model, X_train, y_train, cv=5)
    print(f'Training Data CV Score Method 1: {metrics.accuracy_score(y_train, cv_pred):.2%}')

    # Cross-Validation method 2:  cross_val_score() on shuffled stratified folds
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    cv_result = cross_val_score(model, X_train, y_train, cv=kfold, scoring="accuracy")
    print(f'Training Data CV Score Method 2: {cv_result.mean():.2%}')

    # Fit the Model (on the training part of the split only)
    model.fit(X_train, y_train)

    # Anything other than 'train' just hands back the fitted model
    if t != 'train':
        return model

    # Classification Report on the hold-out part
    y_pred = model.predict(X_test)
    print(f'Testing Data Accuracy Score: {metrics.accuracy_score(y_test, y_pred):.2%}')
    print(f'\n{name} Test Prediction Classification Report:')
    print('------------------------------------------------------------')
    print(metrics.classification_report(y_test, y_pred))

    # Confusion Matrix Heat Map
    sns.heatmap(metrics.confusion_matrix(y_test, y_pred), annot=True, fmt=".0f")
    plt.title(f'{name} confustion matrix')
    plt.xlabel('Predicted Values')
    plt.ylabel('Actual Values')
    plt.show()

---
# Testing Different Models
Note: in my actual analysis I tested many different models. However, for this notebook I only kept logistic regression and XGBoost as they had the best results. 


## Unbalanced Logistic Regression

In [None]:
# Evaluate plain (unweighted) logistic regression; 'train' mode prints the
# cross-validation scores, hold-out report and confusion matrix
run_model(X_train, y, 'logistic', 'train')

### Basic unbalanced Logistic Regression performs ok on paper with a score of 84%, however it is misclassifying the positive responses and only gets roughly 36% correct which is unacceptable.

---
## Balanced Logistic Regression

In [None]:
# Evaluate logistic regression with class_weight='balanced'; 'train' mode
# prints the cross-validation scores, hold-out report and confusion matrix
run_model(X_train, y, 'logistic_bal', 'train')

### Balanced Logistic Regression drastically improves the prediction success of positive responses, with a 92% success rate. However, the negative responses are now classified incorrectly more often. Still, the success rate of the negative classifications is 74%.

In [None]:
# Evaluate XGBoost with a positive-class weight; 'train' mode prints the
# cross-validation scores, hold-out report and confusion matrix
run_model(X_train, y, 'xgb', 'train')

### XGBoost further improved the positive responses to a 98% success rate; however, the negative misclassifications increased even more than with the balanced logistic regression

---
## Conclusions

### The goal of this task was to predict whether a customer would be interested in Vehicle Insurance. In this analysis I used some basic models with very little tweaking. The best results came from using Balanced Logistic Regression where I was able to:
* Achieve an approximate 92% success rate on predicting customers who  WILL purchase insurance


* Achieve an approximate 74% success rate on predicting customers who WILL NOT purchase insurance


### From a business and advertising-cost perspective, if the revenue from new customers is greater than the costs from the increased number of misclassified customers, then the Balanced Logistic Regression model would work.

---

## Running the model on the testing data

In [None]:
# Selecting and training the ideal model (Balanced Logistic Regression).
# Any mode other than 'train' makes run_model return the fitted estimator.
# NOTE(review): run_model fits on the training part of its internal
# train/hold-out split, not on all labelled rows - confirm this is intended.
best_model = run_model(X_train, y, 'logistic_bal', 'test')

In [None]:
# Get predictions for the test data's feature matrix and display them
test_predictions = best_model.predict(X_test)
test_predictions

In [None]:
# Pair every test id with its predicted response in a two-column frame
final_df = pd.DataFrame(
    {
        'id': X_test_ids.to_numpy(),
        'response': test_predictions,
    },
    columns=['id', 'response'],
)

final_df.head()

In [None]:
# Number of test rows predicted for each response value
final_df['response'].value_counts()

In [None]:
# Write the predictions to 'submission.csv' without the DataFrame index column
final_df.to_csv('submission.csv', index=False)