### Insurance Data Exploration
***
.

In [None]:
# IMPORT 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
#! head -n 3 ../input/exercise_02_train.csv

In [None]:
# Read the raw training data into a DataFrame.
insu = pd.read_csv(filepath_or_buffer='../input/exercise_02_train.csv')

In [None]:
# First two records, transposed so the many columns read top-to-bottom.
insu.iloc[:2].transpose()

In [None]:
# Bar chart of how often each value of the label column `y` occurs.
insu['y'].value_counts().plot.bar()

In [None]:
# Per-column dtype and non-null summary; max_cols is raised so that the
# full per-column listing is printed instead of the truncated summary.
insu.info(
    max_cols=101,
    verbose=True,
)

In [None]:
# A single record, transposed for readability.
insu.iloc[:1].T

### Handling Columns of type object
***
- Cleaning
- Standardizing Names
- Value count bar-plots

In [None]:
# Categorical candidates: show only the columns whose dtype is `object`.
insu.loc[:, insu.dtypes.eq(object)].head()

#### Object-type Columns are
- x34: Brand
- x35: Day
- x41: Money amount
- x45: Some percentage
- x68: Month
- x93: Country
***
We need to clean some columns.

In [None]:
# Remove the $ symbol and convert the money amounts to float.
# regex=False forces a literal match: '$' is a regex end-of-string anchor, and
# the default of `regex` has changed between pandas versions, so being explicit
# guarantees the character itself is stripped everywhere.
insu['x41'] = insu['x41'].str.replace('$', '', regex=False).astype(float)

In [None]:
# Remove the % symbol and convert the values to float.
# regex=False states the literal-replacement intent explicitly and avoids the
# single-character-pattern FutureWarning emitted by pandas 1.x.
insu['x45'] = insu['x45'].str.replace('%', '', regex=False).astype(float)

In [None]:
# Re-check which columns are still of dtype `object` after the numeric cleanup.
insu[insu.columns[insu.dtypes == object]].head()

In [None]:
# Horizontal bar chart of the distinct x34 labels and their frequencies.
insu.x34.value_counts().plot.barh()

In [None]:
# Fold x34 to lowercase so differently-cased spellings become one label.
insu['x34'] = insu.x34.str.lower()

In [None]:
# Frequencies of the x35 labels before standardization.
insu['x35'].value_counts().plot.barh()

In [None]:
# Keep a handle on the x35 column; the next cell builds the cleaned column from it.
s1 = insu.x35

In [None]:
# Map long and variant day spellings onto three-letter codes; values that are
# not keys of the mapping are left as they are.
insu['x35'] = s1.replace(
    to_replace={
        'monday': 'mon',
        'tuesday': 'tue',
        'wednesday': 'wed',
        'thurday': 'thu',
        'thur': 'thu',
        'friday': 'fri',
    }
)

In [None]:
# Frequencies of the x35 labels after standardization.
insu.x35.value_counts().plot(kind='barh')

In [None]:
# Object-dtype columns, first five rows.
insu.loc[:, insu.dtypes == object].head(5)

In [None]:
# x68 label frequencies, sorted ascending so the largest bar is drawn last.
insu.x68.value_counts().sort_values().plot.barh()

In [None]:
# Fold x68 to lowercase before mapping the variant spellings.
insu['x68'] = insu.x68.str.lower()

In [None]:
# Map variant month spellings onto three-letter codes; values that are not
# keys of the mapping are left as they are.
insu['x68'] = insu['x68'].replace(
    to_replace={
        'january': 'jan',
        'dev': 'dec',
        'sept.': 'sep',
        'july': 'jul',
    }
)

In [None]:
# x93 label frequencies, sorted ascending.
insu.x93.value_counts().sort_values().plot.barh()

In [None]:
# Final look at the object-dtype columns after all text cleanup.
insu.loc[:, insu.dtypes == object].iloc[:5]

### Missing data

In [None]:
# Shape of the sub-frame holding every row with at least one missing value.
insu.loc[insu.isna().any(axis='columns')].shape

In [None]:
# Discard, in place, every row that contains at least one missing value.
insu.dropna(axis='index', how='any', inplace=True)

In [None]:
# Same check as before the drop; zero rows are expected now.
insu.loc[insu.isnull().any(axis=1), :].shape

In [None]:
# Distribution of the x0 feature.
insu['x0'].plot.hist()

In [None]:

# Column index of the frame, kept under `cols`.
cols = insu.columns
# Side-by-side box plots of the first three features.
insu.boxplot(column=[f'x{i}' for i in range(3)])

### Handle Outliers
*** 
We will return to this in the next class.

### Encode categorical columns using get_dummies

In [None]:
# Separate the label column from the features.
# Guarded so the cell can be re-run safely: once 'y' has been dropped, a second
# execution is a no-op instead of raising AttributeError, and `target` keeps
# the value captured on the first run.
if 'y' in insu.columns:
    target = insu['y']
    insu.drop('y', axis=1, inplace=True)

In [None]:
# One-hot encode the four remaining text columns; other columns pass through.
insu2 = pd.get_dummies(
    data=insu,
    columns=['x34', 'x35', 'x68', 'x93'],
)

In [None]:
# First five rows of the encoded frame.
insu2.head(n=5)

#### Prepare data for modeling

In [None]:
# Feature matrix and label vector as NumPy arrays.
# The explicit float cast matters: newer pandas versions emit boolean dummy
# columns from get_dummies, and `.values` on a mixed float/bool frame yields an
# object-dtype array that numeric routines such as np.corrcoef cannot handle
# properly. Casting here gives a homogeneous float matrix on every version.
X = insu2.values.astype(float)
y = target.values

In [None]:
# Pearson correlation matrix with one row per feature plus a final row for the
# label; stacking y under X.T is what np.corrcoef(X.T, y) does internally.
corr = np.corrcoef(np.vstack([X.T, y]))

In [None]:
# Correlation coefficients lie in [-1, 1]. Clamping the colour scale at 0 would
# paint every negative correlation the same as "no correlation", so use the
# full range with a diverging colormap whose midpoint falls at 0.
sns.heatmap(data=corr, vmin=-1, vmax=1, cmap='coolwarm')

In [None]:
# Hold out 20% of the rows as a dev/test set; the seed is fixed so the split
# is the same on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.2, random_state=10
)

In [None]:
# Rescale the features with a min-max scaler. The scaler is fitted on the
# training split only and then applied unchanged to the test split.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
Xn_train = scaler.fit_transform(X_train)
Xn_test = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegressionCV

#### Logistic Regression

In [None]:
# Cross-validated logistic regression fitted on the scaled training data.
lrm = LogisticRegressionCV(tol=1e-4)
lrm.fit(X=Xn_train, y=Y_train)

In [None]:
# Accuracy on the held-out split, expressed as a percentage.
print("Test Accuracy: ", lrm.score(Xn_test, Y_test) * 100)

In [None]:
# Per-class precision / recall / F1 of the logistic model on the test split.
from sklearn.metrics import classification_report
print("CLASSIFICATION REPORT:\n")
print(classification_report(y_true=Y_test, y_pred=lrm.predict(Xn_test)))

#### AdaBoost: an ensemble of Logistic Regression classifiers

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Boost an ensemble of cross-validated logistic-regression learners.
# The base learner is passed positionally on purpose: its keyword was renamed
# from `base_estimator` to `estimator` in newer scikit-learn releases and the
# old name was later removed, while the first positional slot is that
# parameter under both names.
abc = AdaBoostClassifier(LogisticRegressionCV())
abc.fit(Xn_train, Y_train)

# Report held-out accuracy (as a percentage) and the per-class metrics.
print("Test Accuracy: \n", 100*abc.score(Xn_test, Y_test))
print( "CLASSIFICATION REPORT:\n")
print (classification_report(Y_test,abc.predict(Xn_test)))

**Work in progress....**