# Statefarm Classification MDL Test

## import

In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

from sklearn.impute import SimpleImputer

from sklearn import preprocessing
from sklearn.preprocessing import scale

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

import pickle

import re
import math

%matplotlib inline

## Define functions

In [2]:
"""
  Module name: negative_converter 
  Purpose: convert Excel negative value to Pandas negative value
  - ref: https://stackoverflow.com/questions/53989077/how-to-convert-excel-negative-value-to-pandas-negative-value
  Parameter
   x: feature
"""

def negative_converter(x):
    # a somewhat naive implementation
    x = x.replace('$', '').replace(',', '')
    if '(' in x:
        x = '-' + x.strip('()')
    return float(x)


In [26]:
"""
 -  Remove % from X7 and convert datatype from object to floating 
 -  x3: Consolidate days of the week
     - Rename full name to abbreviation, e.g.; 'Monday'-> 'Mon'
"""
def data_clean(df):       
    df['x7'] = list(map(lambda x: float(re.sub('\%','',x))/100, df['x7'] ))
    
    sorted(df.x3.unique())
    d_day= {
    'Monday'   :'Mon',
    'Tuesday'  :'Tue',   
    'Wednesday':'Wed',
    'Friday'   :'Fri', 
    'Thursday' :'Thu', 
    'Saturday' :'Sat', 
    'Sunday'   :'Sun'}  
    for key, value in d_day.items():
        df.x3.replace(key, value, inplace = True)

In [4]:
"""
- The df_train will use target encoding to convert categorical features to numeric features 
- The df_test doesn't have target variable and can't use target encoding for the conversion 
- Will use the function 'tar_enc_test_data' to simulate the  target encoding for the df_test
  - Build a dictionary related to target encoding from df_train 
  - Apply the values from the dictionary to df_test

"""
def tar_enc_test_data(df_train, df_test, feature):
    encoding = df_train.groupby([feature])['y'].mean().to_dict()
    df_test[feature]  = df_test[feature].map(encoding)

In [5]:
""" 
Impute missing value for numeric features
"""
def impute_num(df, feature):    
    imputer = SimpleImputer(strategy='mean')
    df[feature] = imputer.fit_transform(df[feature].values.reshape(-1,1))[:,0]


# Main process starts here

## Read data

In [35]:
"""
Apply 'negative_converte' to convert Excel negative value to Pandas negative value
"""
df_train = pd.read_csv('exercise_40_train.csv', converters={'x19': negative_converter})
df_train.x19.head()
df_test = pd.read_csv('exercise_40_test.csv', converters={'x19': negative_converter})
df_test.x19.head()

0     120.22
1    -267.56
2    -311.29
3    2229.15
4    -469.05
Name: x19, dtype: float64

### Drop features 

In [36]:
var_drop_high_missing_value = ['x30', 'x44', 'x52', 'x55', 'x57']
var_drop_cat =  ['x24', 'x39']
var_drop_num =  ['x58', 'x59', 'x67', 'x71', 'x79', 'x84', 'x98']
var_drop = var_drop_high_missing_value  + var_drop_cat + var_drop_num
df_test = df_test.drop(var_drop, axis = 1)
df_test.shape

(10000, 86)

### Data clean

In [37]:
data_clean(df_train)  # for x3 and x7
data_clean(df_test)  # for x3 and x7

### Encoding for categhorical features

In [38]:
num_fea = df_test.select_dtypes([np.number]).columns
cat_fea = df_test.select_dtypes(exclude=[np.number]).columns

#Impute categorical features for df_train
for feature in cat_fea:
    df_train[feature] =  df_train[feature].fillna('missing') 
    
#Impute categorical features for df_train
for feature in cat_fea:
    df_test[feature] =  df_test[feature].fillna('missing')     

# Simulate target encoding for df_test
for feature in cat_fea:
    tar_enc_test_data(df_train, df_test, feature)

df_test.select_dtypes(exclude=[np.number]).columns

Index([], dtype='object')

### Impute numeric features

In [39]:
nr = lambda df, feature : df[feature].isnull().sum()/len(df) # Null value ratio
for feature in num_fea:
    if nr(df_test, feature) > 0:  
       impute_num(df_test, feature)

feature x5
feature x11
feature x14
feature x16
feature x22
feature x26
feature x38
feature x41
feature x42
feature x45
feature x49
feature x54
feature x61
feature x63
feature x64
feature x68
feature x74
feature x75
feature x76
feature x78
feature x80
feature x83
feature x85
feature x86
feature x88
feature x89
feature x91
feature x92
feature x94
feature x95
feature x96


### PCA - Principal Components Analysis
- Applied PCA to reduce the features from 86 to 60 and transform the df_train

In [42]:
X =  scale(df_test, with_mean=True, with_std=True)
#X =  scale(df_test)

In [43]:
X_test_pca = PCA(n_components= 60).fit_transform(X)

### Load LogisticRegression model
    - pkl as clf
    - Get probablity of x_test_pca
    

In [48]:
with open('clf.pkl', 'rb') as f:
     clf = pickle.load(f)

# Predict X_test_pca
pred_clf = clf.predict_proba(X_test_pca)
np.savetxt('glmresults.csv', pred_clf[:, 1], delimiter=',')

### Load XGBClassifier model
- pkl as xgb
- Get probablity of x_test_pca

In [60]:
with open('xgb.pkl', 'rb') as f:
     xgb = pickle.load(f)

# Predict X_test_pca
pred_xgb = xgb.predict_proba(X_test_pca)  # numpy.ndarray
np.savetxt('nonlmresults.csv', pred_xgb[:, 1], delimiter=',')

## wip

In [51]:
pred_clf[0:10]

array([[9.99999675e-01, 3.25214201e-07],
       [9.99998204e-01, 1.79598703e-06],
       [9.99993024e-01, 6.97646506e-06],
       [9.88545446e-01, 1.14545537e-02],
       [9.99999867e-01, 1.33323479e-07],
       [9.99741115e-01, 2.58885486e-04],
       [1.79487532e-03, 9.98205125e-01],
       [5.68093408e-01, 4.31906592e-01],
       [1.00000000e+00, 3.92005889e-11],
       [9.99891305e-01, 1.08694832e-04]])

In [61]:
pred_xgb[0:10]

array([[9.9995285e-01, 4.7128728e-05],
       [9.9996591e-01, 3.4095690e-05],
       [9.9999821e-01, 1.7789699e-06],
       [9.9998206e-01, 1.7944254e-05],
       [9.9999619e-01, 3.7934526e-06],
       [9.9999863e-01, 1.3700972e-06],
       [5.9861887e-01, 4.0138116e-01],
       [2.1081132e-01, 7.8918868e-01],
       [9.9999428e-01, 5.7169291e-06],
       [9.9999857e-01, 1.4098258e-06]], dtype=float32)

In [54]:
#pred_clf[:, 1].to_dataframe.to_csv('glmresults.csv')
pred_clf[:, 1].savetxt('glmresults.csv')

AttributeError: 'numpy.ndarray' object has no attribute 'savetxt'

In [56]:
np.savetxt('nonlmresults.csv', pred_xgb[:, 1])

NameError: name 'pred_xgb' is not defined

In [55]:
np.savetxt('glmresults.csv', pred_clf[:, 1], delimiter=',')

In [None]:
var = df_test.columns
var[0]

In [None]:
type(df_test.columns) # pandas.core.indexes.base.Index

In [None]:
df_test.columns.shape

In [None]:
a = 5

In [None]:
>> a

In [None]:
cat_fea

In [None]:
df_train.columns

In [15]:
cat_fea

Index(['x3', 'x7', 'x31', 'x33', 'x60', 'x65', 'x77', 'x93', 'x99'], dtype='object')

In [16]:
df_train.x3[0:10]

0    Wed
1    Fri
2    Thu
3    Tue
4    Sun
5    Sat
6    Thu
7    Sat
8    Wed
9    Tue
Name: x3, dtype: object

In [40]:
df_train.x99[0:10]

0        yes
1        yes
2        yes
3        yes
4        yes
5    missing
6        yes
7        yes
8        yes
9    missing
Name: x99, dtype: object

In [23]:
df_test.x99[0:10]

0         NaN
1    0.146407
2    0.146407
3         NaN
4    0.146407
5    0.146407
6    0.146407
7    0.146407
8    0.146407
9         NaN
Name: x99, dtype: float64

In [21]:
num_fea


Index(['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11',
       'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19', 'x20', 'x21',
       'x22', 'x23', 'x25', 'x26', 'x27', 'x28', 'x29', 'x31', 'x32', 'x33',
       'x34', 'x35', 'x36', 'x37', 'x38', 'x40', 'x41', 'x42', 'x43', 'x45',
       'x46', 'x47', 'x48', 'x49', 'x50', 'x51', 'x53', 'x54', 'x56', 'x60',
       'x61', 'x62', 'x63', 'x64', 'x65', 'x66', 'x68', 'x69', 'x70', 'x72',
       'x73', 'x74', 'x75', 'x76', 'x77', 'x78', 'x80', 'x81', 'x82', 'x83',
       'x85', 'x86', 'x87', 'x88', 'x89', 'x90', 'x91', 'x92', 'x93', 'x94',
       'x95', 'x96', 'x97', 'x99', 'x100'],
      dtype='object')