In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from IPython.display import display # Allows the use of display() for DataFrames
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files live in the "../input/" directory.
# Running this cell (click run or press Shift+Enter) prints the names found there.

import os
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

In [2]:
# Read the training and test CSV files from the input directory
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/train.csv", "../input/test.csv")
)

In [3]:
# Report (rows, columns) for both data sets
for caption, frame in (("training set shape: ", train), ("testing set shape: ", test)):
    print(caption, frame.shape)

**Analysis**

* **Data exploration:**

In [4]:
# Summary statistics for every column of the training set
overall_stats = train.describe()
overall_stats

In [5]:
# With so many features (i.e. 370) a full summary is hard to read,
# so the exploration concentrates on a small sample of columns.
sample_cols = ['var3', 'var15', 'num_var46', 'saldo_var27', 'var38','TARGET']
train[sample_cols].describe()

In [6]:
# NOTE(review): disabled cell, kept for reference.
# find_outliers computes, for every column, the fences q1 - 1.5*IQR and q3 + 1.5*IQR
# and stores how many values fall outside them; count_outliers does the counting
# with one Python-level comparison per value.
# # finding outliers in features
# def find_outliers(df):
#     outliers = {}
#     for feature in df.columns.values:
#         q1 = df[feature].quantile(0.25)
#         q3 = df[feature].quantile(0.75)
#         iqr = q3-q1 #Interquartile range
#         fence_low  = q1-1.5*iqr
#         fence_high = q3+1.5*iqr
#         count = count_outliers(df[feature], fence_low, fence_high)
#         outliers.update({feature: count})
#     return outliers
# def count_outliers(df, low, high):
#     count = 0
#     for v in df.values:
#         if v > high or v < low:
#             count +=1
#     return count

# result = find_outliers(train)
# print('done')

In [7]:
# NOTE(review): disabled cell, kept for reference.
# sorted(result) orders the feature names alphabetically while sorted(result.values())
# orders the counts numerically; zipping the two pairs them by position, so a printed
# count generally does not belong to the feature printed next to it.
# Sorting the pairs together, e.g. sorted(result.items(), key=lambda kv: kv[1]),
# would keep each count attached to its feature.
# # Sorting all features based on amount of outliers.
# outliers_result = {}
# features = sorted(result)
# values = sorted(result.values())

# for f,v in zip(features, values):
#     outliers_result.update({f:v})

# for k,v in outliers_result.items():
#     print(k, " : ", v)

For the sample of features above, the outlier check found that:
*  feature (var3) has 15281 outliers
*  feature (var15) has 14196 outliers
*  feature (num_var46) has 3221 outliers
*  feature (saldo_var27) has 11254 outliers
*  feature (var38) has 15983 outliers
*  feature (TARGET) has 0 outliers

Removing every row that contains an outlier would shrink the data set dramatically, so outliers are instead handled by using algorithms that are robust to them.


**Exploratory Visualization**

In [8]:
# Count the rows belonging to each class of the TARGET variable.
# NOTE(review): the original note here read "0: un-satisfied, 1: satisfied". If this is
# the Santander Customer Satisfaction data, the competition description defines
# TARGET == 1 as an unsatisfied customer and 0 as a satisfied one — confirm.
target_counts = train['TARGET'].value_counts()
target_counts

In [9]:
%matplotlib inline
# a plot visualizes the imbalance in TARGET variable. 
train['TARGET'].plot(kind='hist')
plt.show()

In [10]:
%matplotlib inline
# a plot visualizes the distribution of features 'var3', 'var15', 'num_var46', 'saldo_var27', 'var38','TARGET'.
train[['var3', 'var15', 'num_var46', 'saldo_var27', 'var38','TARGET']].hist(figsize=(12, 6),  bins=100)
plt.show()

In [11]:
%matplotlib inline
# box plot to visualize the outliers of 'var3', 'var15', 'num_var46', 'saldo_var27', 'var38','TARGET'.
train[['var3', 'var15', 'num_var46', 'saldo_var27', 'var38','TARGET']].plot(kind='box', subplots=True, figsize=(12, 6))
plt.show()

**Data setup**

- Separate the labels.
- Drop TARGET, ID variables from training set.
- Drop ID variables from testing set.

In [12]:
# Set the labels and the test IDs aside, then remove the non-feature columns
# (TARGET and ID from the training set, ID from the test set).
labels = train['TARGET']
ids = test['ID']
train = train.drop(['TARGET', 'ID'], axis = 1)
test = test.drop(['ID'], axis = 1)
# Prints the two shapes side by side; the '=' is only a separator, nothing is compared.
print(train.shape, '=', test.shape)

**Data Preprocessing**

- Remove constant columns, handle missing values and apply PCA on training and testing sets

In [13]:
# 1st step: remove constant columns.
# A column is treated as constant when its standard deviation in the training data is 0.
# The same columns are removed from the test set as well, so that both frames keep an
# identical feature layout for every later transformation.
remove = [col for col in train.columns if train[col].std() == 0]
print(" >> Constant columes: ",remove)
train.drop(remove, axis=1, inplace=True)
# errors='ignore' keeps this safe if a listed column is absent from the test set.
test.drop(remove, axis=1, inplace=True, errors='ignore')
print('>> train shape after removing constant columns: ', train.shape)
print('>> test shape after removing constant columns: ', test.shape)

In [15]:
# 2nd step: handle missing values in the training set.
# Count, column by column, how often the placeholder value -999999 occurs.
sentinel_counts = (train == -999999).sum()
print(sentinel_counts)

In [16]:
# var3 has 116 missing values (encoded as -999999) according to the count above.
# They are replaced with the mode of var3.
#
# Series.replace returns a new Series, so the result has to be assigned back; the
# mode is taken as a scalar (first entry of the Series returned by .mode()) and is
# computed without the placeholder rows so the placeholder can never be picked.
var3_mode = train.loc[train['var3'] != -999999, 'var3'].mode()[0]
train['var3'] = train['var3'].replace(-999999, var3_mode)
# Apply the value learned from the training data to the test set as well.
test['var3'] = test['var3'].replace(-999999, var3_mode)

# Number of placeholder values left in the training column (expected: 0)
(train['var3'] == -999999).sum()

In [17]:
# 3rd step: standardise the training data.
# The scaler learns its per-column statistics from `train` and is then applied to it.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(train)
scaled_train = scaler.transform(train)

In [18]:
# Scaling the test data.
# The scaler fitted on the training data is reused: fitting a second scaler on the test
# set would standardise the two sets with different means and variances, so a model
# trained on `scaled_train` would see shifted features at prediction time.
# Selecting `train.columns` gives the test frame the same columns, in the same order,
# that the scaler was fitted on.
scaled_test = scaler.transform(test[train.columns])

**Pipeline**

Build a pipeline that trains a model, tests it and plots its ROC curve.

In [20]:
# Creating a Training and Predicting Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import time
def train_predict(learner, X_train, y_train, X_test, y_test):
    '''
    Fit a learner, score it on the test split and plot its ROC curve.

    inputs:
       - learner: the learning algorithm to fit; it must provide fit and predict_proba
       - X_train: features training set
       - y_train: labels training set
       - X_test: features testing set
       - y_test: labels testing set

    returns:
       - dict with 'train_time' and 'pred_time' (elapsed time.time() differences)
         and 'roc_auc_score' (computed from the second predict_proba column)

    side effects:
       - draws and shows the ROC figure, and saves it through plt.savefig('Log_ROC');
         the name is fixed, so every call writes to the same target
       - prints a confirmation line with the learner's class name
    '''
    # Time the fit; the object returned by fit() is the one used from here on.
    fit_started = time.time()
    model = learner.fit(X_train, y_train)
    fit_elapsed = time.time() - fit_started

    # Time the prediction of the second predict_proba column.
    predict_started = time.time()
    positive_scores = model.predict_proba(X_test)[:, 1]
    predict_elapsed = time.time() - predict_started

    auc_value = roc_auc_score(y_test, positive_scores)
    model_name = model.__class__.__name__

    # ROC curve of the model plus the dashed diagonal as a reference line
    fpr, tpr, _ = roc_curve(y_test, positive_scores)
    plt.figure()
    plt.plot(fpr, tpr, label='%s (area = %0.5f)' % (model_name, auc_value))
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig('Log_ROC')
    plt.show()

    print("{} is trained.".format(model_name))

    return {
        'train_time': fit_elapsed,
        'pred_time': predict_elapsed,
        'roc_auc_score': auc_value,
    }

**Benchmark**

The benchmark model is *LogisticRegression* trained on the data using all the features.

In [22]:
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split is
# provided by sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Rows from index 60817 onward are reserved for the final test, so only the first
# 60817 rows are divided into training and testing parts here.
n_dev_rows = 60817

# Split the unscaled training data into training and testing sets
bX_train, bX_test, by_train, by_test = train_test_split(train[:n_dev_rows], labels[:n_dev_rows], test_size = 0.2, random_state = 0)

# Split the scaled training data into training and testing sets
# (same test_size and random_state as the unscaled split above)
bsX_train, bsX_test, bsy_train, bsy_test = train_test_split(scaled_train[:n_dev_rows], labels[:n_dev_rows], test_size = 0.2, random_state = 0)

In [23]:
# Benchmark: logistic regression fitted and scored on the unscaled split
from sklearn.linear_model import LogisticRegression
lg = LogisticRegression()
Logistic_results = train_predict(
    learner=lg,
    X_train=bX_train,
    y_train=by_train,
    X_test=bX_test,
    y_test=by_test,
)

print('Benchmark results using unscaled data : ', Logistic_results)

In [24]:
# Benchmark again, this time on the scaled split
lg2 = LogisticRegression()
Logistic_results2 = train_predict(
    learner=lg2,
    X_train=bsX_train,
    y_train=bsy_train,
    X_test=bsX_test,
    y_test=bsy_test,
)

print('Benchmark results using scaled data : ',Logistic_results2)

**Implementation**

- Apply PCA on training and testing sets.
- Test RandomForest using the scaled training data and the data reduced with PCA.
- Test XGBoost using the scaled training data and the data reduced with PCA.

In [25]:
# Apply PCA for dimensionality reduction, to speed up the ML algorithms by keeping
# the 90 leading components of the scaled training data.
# random_state is fixed because, depending on the scikit-learn version, the default
# solver choice can be the randomized SVD, whose output would otherwise change from
# run to run.
from sklearn.decomposition import PCA
pca = PCA(n_components=90, random_state=0)
pca.fit(scaled_train)

In [26]:
# Number of components kept by the fitted PCA
n_kept_components = pca.n_components_
print('components: ', n_kept_components)

In [27]:
# Sum of the per-component explained-variance ratios
explained_total = sum(pca.explained_variance_ratio_)
print('explained variance', explained_total)

In [28]:
# Project the scaled training data onto the fitted principal components
t_train = pca.transform(scaled_train)

reduced_train_shape = t_train.shape
print(' training set after transformation: ',reduced_train_shape)

In [29]:
# Prepare PCA for the testing set.
# A PCA fitted separately on the test set produces its own component axes, so data
# projected with it is not comparable with `t_train`, and a model trained on `t_train`
# cannot meaningfully score it. The test data is therefore
#   1. restricted to the columns of `train`,
#   2. standardised with statistics learned from `train`, and
#   3. projected with `pca`, the PCA fitted on `scaled_train`.
from sklearn.preprocessing import StandardScaler

# Re-derived from `train` here so that this cell does not depend on how `scaled_test`
# was produced earlier in the notebook.
scaled_test = StandardScaler().fit(train).transform(test[train.columns])

# The name `pca2` is kept for the next cell, but it now refers to the training PCA.
pca2 = pca

In [30]:
# Project the scaled test data with pca2, then report what the projection kept
t_test = pca2.transform(scaled_test)
for caption, value in (
    ('components: ', pca2.n_components_),
    ('explained variance', sum(pca2.explained_variance_ratio_)),
    ('testing set after transformation: ', t_test.shape),
):
    print(caption, value)

In [31]:
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split is
# provided by sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Rows from index 60817 onward are reserved for the final test, so only the first
# 60817 rows are divided into training and testing parts here.
n_dev_rows = 60817

# Split the scaled training data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(scaled_train[:n_dev_rows], labels[:n_dev_rows], test_size = 0.2, random_state = 0)

# Split the PCA-reduced training data into training and testing sets
# (same test_size and random_state as the scaled split above)
tX_train, tX_test, ty_train, ty_test = train_test_split(t_train[:n_dev_rows], labels[:n_dev_rows], test_size = 0.2, random_state = 0)

**1st model: Random Forest**

In [32]:
# 1st model : Random Forest, evaluated on the scaled data in this cell.
# random_state is fixed so that the bootstrap samples and feature draws, and with them
# the reported scores, are the same on every run.
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)
forest_results_scaled =  train_predict(clf, X_train, y_train, X_test, y_test)

print('Random Forest result with scaled data: ', forest_results_scaled)

In [33]:
# Same classifier object, fitted again and scored on the PCA-reduced split
forest_results_reduced = train_predict(
    learner=clf,
    X_train=tX_train,
    y_train=ty_train,
    X_test=tX_test,
    y_test=ty_test,
)
print('Random Forest result with reduced data: ',forest_results_reduced)

**2nd model: Xgboost**

In [34]:
from xgboost import XGBClassifier

# Hyper-parameters used for the XGBoost runs
xgb_params = dict(
    n_estimators=100,
    max_depth=5,
    min_child_weight=10,
    gamma=0.5,
    subsample=0.8,
    colsample_bytree=0.8,
)
xgc = XGBClassifier(**xgb_params)
Xgboost_results_scaled = train_predict(
    learner=xgc,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
print('Xgboost results with scaled: ', Xgboost_results_scaled)

In [35]:
# train_predict calls fit again, so after this cell `xgc` has last been fitted
# on the PCA-reduced training split.
Xgboost_results_reduced = train_predict(
    learner=xgc,
    X_train=tX_train,
    y_train=ty_train,
    X_test=tX_test,
    y_test=ty_test,
)
print('Xgboost results with reduced: ', Xgboost_results_reduced)

**The XGBoost model gets the higher score on both the scaled and the reduced data, so it will be tuned to get better performance**

**Xgboost tuning**

As XGBoost already gets the higher score without tuning, it is the model chosen for tuning. An exhaustive *GridSearchCV* would take a lot of time, so *RandomizedSearchCV* is used instead: each random draw misses the best 5% of parameter combinations with probability 0.95, so 60 draws find at least one of them with probability $1 - 0.95^{60} \approx 95\%$ (with 50 draws the figure is about 92%). Also, compared to other methods, it doesn't bog down in local optima.

In [39]:
# NOTE(review): disabled cell, kept for reference.
# - No random_state is passed to RandomizedSearchCV, so the sampled parameter
#   combinations differ from run to run.
# - n_iter=50 is lower than the 60 iterations quoted in the notebook's narrative.
# from sklearn.model_selection import StratifiedKFold
# from sklearn.model_selection import RandomizedSearchCV

# xgc2 = XGBClassifier()
# # A parameter grid for XGBoost
# params = {
#         'n_estimators' : [100, 200, 300],
#         'min_child_weight': [1, 5, 10],
#         'gamma': [0.5, 1, 1.5, 2, 5],
#         'subsample': [0.6, 0.8, 1.0],
#         'colsample_bytree': [0.6, 0.8, 1.0],
#         'max_depth': [3, 4, 5]
#         }
# kfold = StratifiedKFold(n_splits=3)
# random_search = RandomizedSearchCV(xgc2, param_distributions=params, scoring='roc_auc', n_jobs=4, cv=kfold, verbose=5, n_iter=50)

# start_time = time.time()
# random_search.fit(tX_train, ty_train)
# print('>> Tuning time: ',time.time()-start_time)



In [40]:
# print('\nBest estimator:')
# print(random_search.best_estimator_)
# print("\nscore :")
# print(random_search.best_score_ )
# print('\n Best hyperparameters:')
# print(random_search.best_params_)

In [41]:
# NOTE(review): disabled cell, kept for reference.
# The print below shows Xgboost_results_reduced (the scores of the untuned model)
# instead of tunned_model_results, which is what the first line computes.
# tunned_model_results = train_predict(random_search.best_estimator_, tX_train, ty_train, tX_test, ty_test)
# print('tunned Xgboost results with reduced: ', Xgboost_results_reduced)

In [42]:
# untunned Xgboost parameters
# xgc

**Final Prediction**

- Test the tuned model on the last 15204 rows of the *reduced training* set, used as a testing set.
- Compare the result with the Benchmark model's prediction on the last 15204 rows of the *scaled training set*.

In [37]:
# def get_final_score(learner, X_test, y_test):
#     start = time.time() # Get start time
#     predictions_test = learner.predict_proba(X_test)[:,1]
#     end = time.time() # Get end time 
#     print('%s gets score of %0.5f , prediction time = %0.2f' % (learner.__class__.__name__, roc_auc_score(y_test, predictions_test),  end - start) ) 

In [43]:
# get_final_score(random_search.best_estimator_, t_train[60817:], labels[60817:])
# get_final_score(lg2, scaled_train[60817:], labels[60817:])

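**Using the final tuned model on the original Kaggle competition test set** (while the tuning cells above stay commented out, the cell below predicts with the untuned `xgc`)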

In [44]:
# Second predict_proba column for every row of the PCA-reduced competition test set
class_probabilities = xgc.predict_proba(t_test)
results = class_probabilities[:, 1]

# One row per test ID with its predicted TARGET score, written without the index
submission = pd.DataFrame({'ID': ids, 'TARGET':results})
submission.to_csv('../working/submission16.csv', index=False)
print('done')