In [None]:
# Preparing My Work Environment 
# 1. Import Data Analysis & Visualization Tools 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline 
# 2. Import Machine Learning Models 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, classification_report
# 3. Import Other Assisting Libraries 
import math        #For mathematical operations 
import time        #For time calculations 
import joblib      #For Saving & Loading My final model 
import warnings    #For Dealing with system warnings 
import os          #For Working with operating system  
# 4. Load the working dataset
train_csv_path = '../input/imbalanced-data-practice/aug_train.csv'
df_train = pd.read_csv(train_csv_path)
# Preview the first rows as the cell output
df_train.head()

In [None]:
# Get more information about the data
df_train.info()

## The Analysis & Prediction Strategy 
1. Clean the data :      
    a. Ensure that there are no null values (Checked)    
    b. Check for duplicate values    
    c. Turn all categorical data into numerical 
2. Choose the important features for prediction and eliminate all useless ones. 
3. Apply Machine Learning models to the training data and choose the best estimator. 
4. Evaluate and improve the best estimator's predictions (tuning hyperparameters). 
5. Save the model.
6. Apply all of the above to the test set to make predictions. 
7. Send to "kaggle.com". 

In [None]:
# 1.b Check for duplicate values
# Every row is expected to carry its own 'id'; a repeated id means a duplicated record.
ids_are_unique = df_train['id'].is_unique
duplicate_message = (
    'There are no duplicated values .. You may proceed'
    if ids_are_unique
    else "There's Duplication!! Please Check!!"
)
print(duplicate_message)

In [None]:
# 1.c Turn all categorical values into numerical
# Work on a copy so the raw training frame stays untouched.
df = df_train.copy()
string_columns = [
    label for label, content in df.items()
    if pd.api.types.is_string_dtype(content)
]
for label in string_columns:
    # Replace each distinct string by its integer category code.
    df[label] = df[label].astype('category').cat.codes
df.head()

In [None]:
# 2. Choosing the important features
# The clearest way to view the importance of features is through a visualization of their correlations
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(20, 10))
ax = sns.heatmap(
    corr_matrix,
    cmap='YlGnBu',
    annot=True,      # write the coefficient inside every cell
    fmt='.2f',
    linewidths=1,
    ax=ax,
)
fig.savefig('Correlation Heatmap.png')

In [None]:
# Another way than the heatmap to look at feature importance is a bar graph of
# each column's correlation with the target.
y = df.corr()['Response']
# Take the labels from the correlation result itself, so every bar is paired
# with the column it was computed for even if corr() returns fewer columns
# than the frame holds.
x = y.index
fig, ax = plt.subplots(figsize=(20,10))
# Draw on the axes created above and keep `ax` pointing at them (plt.barh
# returns the bar container, not the axes).
ax.barh(x, y)
ax.set(title='Correlation with Response',
       xlabel='Correlation coefficient',
       ylabel='Feature')
fig.savefig('Correlation (Bars).png')

In [None]:
# Columns used as model inputs.
# NOTE(review): 'id' looks like a row identifier rather than a real signal -
# confirm it is intentionally kept as a feature.
imp_feats = [
    'id',
    'Gender',
    'Age',
    'Previously_Insured',
    'Vehicle_Age',
    'Vehicle_Damage',
    'Annual_Premium',
    'Policy_Sales_Channel',
]

In [None]:
# Apply machine learning algorithms & choose the best estimator
SEED = 30
# Seed the global NumPy RNG as well: estimators created without an explicit
# random_state fall back to it.
np.random.seed(SEED)
x, y = df[imp_feats], df['Response']
# stratify=y keeps the class ratio of 'Response' identical in both splits; the
# dataset path suggests an imbalanced target, where a plain random 10% split
# can leave the validation set with a distorted share of positives.
# random_state pins the split so re-running the cell gives the same partition.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.1, stratify=y, random_state=SEED
)
models = {
    'Log_Regression': LogisticRegression(),
    'KNN': KNeighborsClassifier(),
    # max_features='log2' makes the tree sample features at random, so its
    # seed is fixed too and does not depend on the order models are fitted in.
    'DecisionTree': DecisionTreeClassifier(
        criterion='gini',
        max_depth=8,
        max_features='log2',
        min_samples_leaf=2,
        min_samples_split=4,
        random_state=SEED,
    ),
}
def fit_and_score(x_train, x_val, y_train, y_val, models):
    """Fit each model on the training split and report its validation results.

    For every entry of ``models`` the estimator is fitted, the elapsed time
    and the validation ROC AUC are printed, followed by the classification
    report of the hard predictions.

    Parameters
    ----------
    x_train, y_train : features and labels the models are fitted on.
    x_val, y_val : features and labels the models are evaluated on.
    models : dict mapping a display label to an unfitted estimator exposing
        ``fit`` and ``predict`` (and optionally ``predict_proba`` or
        ``decision_function``).

    Returns
    -------
    dict
        Maps each label to the validation ROC AUC of its model.
    """
    auc_by_model = {}
    for label, model in models.items():
        start = time.time()
        print(f'{label} Model has started training .....')
        model.fit(x_train, y_train)
        y_preds = model.predict(x_val)
        # ROC AUC measures how well the model *ranks* samples, so it needs a
        # continuous score. Feeding it hard 0/1 labels reduces the curve to a
        # single threshold and understates models that rank well.
        if hasattr(model, 'predict_proba'):
            # Column 1 holds the probability of the second (positive) class.
            y_scores = model.predict_proba(x_val)[:, 1]
        elif hasattr(model, 'decision_function'):
            y_scores = model.decision_function(x_val)
        else:
            # No score available: fall back to the hard labels.
            y_scores = y_preds
        elapsed = time.time() - start
        auc = roc_auc_score(y_val, y_scores)
        auc_by_model[label] = auc
        print(f'{label} Model has finalized training in {elapsed:.2f} seconds with roc auc score of {auc:.2f}')
        print('Also Here is the classification Report !!')
        print(classification_report(y_val, y_preds))
        print('<----------------------------------------------->')
    return auc_by_model
# Train and evaluate every candidate model on the train/validation split.
fit_and_score(
    x_train=x_train,
    x_val=x_val,
    y_train=y_train,
    y_val=y_val,
    models=models,
)

My main choice is the decision tree, for the following reason: 
- It gives better recall on the positive class, meaning fewer actual customers are wrongly predicted as not being customers (which also gives a better roc_auc_score).

In [None]:
# Tuning HyperParameters
# The exhaustive search below is expensive (2 * 9 * 8 * 8 * 2 = 2304 candidates,
# each fitted on 10 folds), so it is opt-in: set RUN_GRID_SEARCH = True to
# run it. Keeping it as real, guarded code (instead of a string literal) means
# it stays syntax-checked and runnable.
RUN_GRID_SEARCH = False

if RUN_GRID_SEARCH:
    np.random.seed(30)
    start = time.time()
    gs_params = {'criterion' : ['gini','entropy'],
                'max_depth' : np.arange(6,15),
                'min_samples_split' : np.arange(2,10),
                'min_samples_leaf' : np.arange(2,10),
                # 'auto' is not listed: for DecisionTreeClassifier it was an
                # alias of 'sqrt' and newer scikit-learn releases reject it.
                'max_features' : ['sqrt', 'log2']}
    dtc = DecisionTreeClassifier()
    # Recall is the selection metric: the search optimises for finding positives.
    gs_dtc = GridSearchCV(dtc, param_grid=gs_params, cv=10, scoring = 'recall', verbose= 10 )
    gs_dtc.fit (x, y)
    print (f'Grid search finished in {(time.time()-start):.2f} seconds')
    # Persist only the refitted best estimator, not the whole search object.
    filename = 'finalized_tree_model.pkl'
    joblib.dump(gs_dtc.best_estimator_, filename)
    print ('Tree Model has been Saved to my directory...')