In [1]:
# suppress pandas warnings
# NOTE: no module filter is given, so these two filters silence every
# RuntimeWarning and FutureWarning raised anywhere in the process, not only
# the ones that originate in pandas.
import warnings
warnings.simplefilter(action = "ignore", category = RuntimeWarning)
warnings.simplefilter(action = "ignore", category = FutureWarning)

# imports
from collections import defaultdict
import sys
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import pandas as pd
import numpy as np
from sklearn import preprocessing
#from sklearn.metrics import roc_curve, auc
from numpy.random import seed
#from scipy.special import cbrt
#import matplotlib.pyplot as plt
from sklearn.cross_validation import KFold
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix
#from scipy.stats import rankdata
import cPickle
%matplotlib inline

# reproduce results
# `seed` is numpy.random.seed, so this fixes NumPy's global RNG only; the
# xgboost models trained later are seeded through their own "seed" parameter.
seed(584)

In [2]:
# Read the labelled training file and the test file to be scored.
train_path = 'data/train_FBFog7d.csv'
test_path = 'data/Test_L4P23N3.csv'
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
train.shape, test.shape

((10357, 18), (3387, 17))

In [3]:
# Supplementary table that is joined onto train and test by 'ID' in the next cell.
alcohol_path = 'data/NewVariable_Alcohol.csv'
alcohol = pd.read_csv(alcohol_path)
alcohol.shape

(13744, 2)

In [4]:
# Attach the columns of `alcohol` to both frames, matching rows on 'ID'.
# NOTE(review): merge() defaults to an inner join, so an ID that is absent from
# `alcohol` would silently drop that row -- compare the shapes shown here with
# the pre-merge shapes to confirm no rows were lost.
train, test = (frame.merge(alcohol, on='ID') for frame in (train, test))
train.shape, test.shape

((10357, 19), (3387, 18))

In [5]:
# Columns treated as categorical (kept for reference; the cells visible in this
# notebook do not read this list again).
categorical_vars = [
    'Var1',
    'WorkStatus',
    'Divorce',
    'Widowed',
    'Residence_Region',
    'income',
    'Engagement_Religion',
    'babies',
    'preteen',
    'teens',
    'Var2',
    'Gender',
    'Unemployed10',
    'Alcohol_Consumption',
]

In [6]:
# Numeric columns; the outlier-removal cells iterate over this list.
numeric_vars = [
    'Education',
    'TVhours',
    'Score',
]

In [7]:
#merging rare levels
# Child-count columns are capped so that every count at or above the cap falls
# into one top bucket. The same caps are applied to train and test.
#
# Fix: the original capped test['preteen'] using a mask built from
# test['babies'] (already capped at 3, so the mask was always False and test
# 'preteen' was never capped). Each column is now masked on itself.
# `.loc` replaces the deprecated `.ix` indexer.
level_caps = [('babies', 3), ('preteen', 4), ('teens', 3)]
for frame in (train, test):
    for count_col, cap in level_caps:
        frame.loc[frame[count_col] >= cap, count_col] = cap

In [8]:
#removing outliers
# A value is treated as an outlier when it lies more than `outlier_cutoff`
# standard deviations from its column mean. Outliers are blanked to NaN (the
# row itself is kept).
outlier_cutoff = 7
for feature in numeric_vars:
    # Hold the absolute z-scores in a local Series rather than adding and then
    # dropping a temporary '<feature>_std' column on `train`.
    z_score = np.abs( (train[feature] - train[feature].mean()) / train[feature].std() )
    is_outlier = z_score > outlier_cutoff
    if is_outlier.any():
        print('removing outliers in ', feature, ':\n', train.loc[ is_outlier, feature ])
        # `.loc` replaces the deprecated `.ix` indexer.
        train.loc[ is_outlier, feature ] = np.nan

('removing outliers in ', 'TVhours', ':\n', 668     24
1884    22
1949    22
2261    24
3382    20
4033    20
4207    20
4734    20
5521    24
5556    20
6046    24
6409    21
7251    24
7732    20
Name: TVhours, dtype: float64)


In [9]:
#removing outliers
# Same rule as for the training frame: values more than `outlier_cutoff`
# standard deviations from the column mean are blanked to NaN.
# NOTE(review): the mean and std used here are computed from `test` itself,
# so the cut-off is not the same threshold that was applied to `train` --
# confirm this is intended.
outlier_cutoff = 7
for feature in numeric_vars:
    # Hold the absolute z-scores in a local Series rather than adding and then
    # dropping a temporary '<feature>_std' column on `test`.
    z_score = np.abs( (test[feature] - test[feature].mean()) / test[feature].std() )
    is_outlier = z_score > outlier_cutoff
    if is_outlier.any():
        print('removing outliers in ', feature, ':\n', test.loc[ is_outlier, feature ])
        # `.loc` replaces the deprecated `.ix` indexer.
        test.loc[ is_outlier, feature ] = np.nan

('removing outliers in ', 'TVhours', ':\n', 1484    21
1963    20
2019    20
2762    20
Name: TVhours, dtype: float64)


In [10]:
# Integer-encode selected columns into '<column>_encoded'.
#
# Fix: the original called fit_transform separately on train and on test, so
# the encoder was re-fit for each frame and the same category could receive a
# different integer in train than in test. The encoder is now fit once per
# column on the union of train and test values and then applied to both, so a
# given code means the same category in both frames.
number = preprocessing.LabelEncoder()
for var in ['WorkStatus', 'Residence_Region', 'income', 'Engagement_Religion', 'Alcohol_Consumption','TVhours', 'Score']:
    # Values are compared as strings, so NaN becomes the literal level 'nan'
    # and numeric columns are ordered lexicographically, as before.
    train_values = train[var].astype('str')
    test_values = test[var].astype('str')
    number.fit(pd.concat([train_values, test_values]))
    train[var+'_encoded'] = number.transform(train_values)
    test[var+'_encoded'] = number.transform(test_values)

In [11]:
# Class balance of the target column; the per-class row weights defined
# further down up-weight one class relative to the other two.
train['Happy'].value_counts()

Pretty Happy    5916
Very Happy      3146
Not Happy       1295
dtype: int64

In [12]:
# Replace every remaining NaN with a sentinel value. The training DMatrix
# built later declares the same value as its `missing` marker.
missing_sentinel = -999
train = train.fillna(missing_sentinel)
test = test.fillna(missing_sentinel)

In [13]:
# Split the training frame into the target vector `y` and the one-hot encoded
# feature matrix `X`.
dropCols = ['ID', 'Happy']

# Target classes as integer codes: 0 = Not Happy, 1 = Pretty Happy, 2 = Very Happy.
happy_codes = {'Not Happy': 0, 'Pretty Happy': 1, 'Very Happy': 2}
label = train['Happy'].map(happy_codes)

# drop() returns a new frame, so `train` itself is left untouched.
data = train.drop(dropCols, axis=1)

y = label
X = pd.get_dummies(data)

In [14]:
# One stratified shuffle split (n_iter=1) that holds out 25% of the rows;
# random_state=0 fixes the split.
holdout_fold = StratifiedShuffleSplit(y, n_iter=1, test_size=0.25, random_state=0)
holdout_fold

StratifiedShuffleSplit(labels=[1 1 0 ..., 1 0 0], n_iter=1, test_size=0.25, random_state=0)

In [15]:
# Materialise the single train/holdout split.
# The splitter yields positional row indices, so they are applied with `.iloc`.
# The original used `.ix` / `y[...]`, which look rows up by index label and
# were only correct because the index happens to be 0..n-1; `.ix` is also
# deprecated.
train_index, holdout_index = next(iter(holdout_fold))
X_train, X_test = X.iloc[train_index], X.iloc[holdout_index]
y_train, y_test = y.iloc[train_index], y.iloc[holdout_index]
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7767, 80), (2590, 80), (7767,), (2590,))

In [16]:
# Per-row training weights: class 0 ('Not Happy') rows get weight 2, every
# other class gets 0.5.
# The loop variable is deliberately not named `data`: under Python 2 a
# list-comprehension variable leaks into the notebook namespace, so the
# original overwrote the `data` DataFrame with the last class code.
weights = [2 if class_code == 0 else 0.5 for class_code in y_train]

In [17]:
# One column per xgboost model. The frame starts empty and shares the holdout
# rows' index; the training loop fills in one column per model.
columns = ['xgb1','xgb2','xgb3','xgb4','xgb5']
predictions = pd.DataFrame(index=X_test.index, columns=columns)

In [18]:
# One (boosting rounds, eta, seed) triple per model; the training loops zip the
# three lists together position by position.
# NOTE(review): the smallest eta is paired with the fewest rounds -- confirm
# this pairing is intended.
model_grid = [
    (100,  0.003, 584),
    (300,  0.01,  585),
    (500,  0.03,  586),
    (800,  0.1,   587),
    (1200, 0.5,   588),
]
rounds, etas, seeds = map(list, zip(*model_grid))

In [19]:
# Hold-out evaluation: train one xgboost model per (rounds, eta, seed) triple
# on the training split and store its predicted class for every holdout row.

# Both matrices are the same for every model, so they are built once.
# Fix: the holdout matrix now declares the same missing-value sentinel as the
# training matrix. Previously only the training matrix had missing=-999, so at
# prediction time -999 was treated as an ordinary feature value.
xgtrain = xgb.DMatrix(X_train, label=y_train, weight = weights, missing = -999)
xgtest = xgb.DMatrix(X_test, missing = -999)

for (num_rounds, eta, model_seed, col) in zip(rounds, etas, seeds, columns):
    params = {
        "objective": "multi:softprob",  # per-class probabilities for each row
        "num_class": 3,
        # Control model complexity directly.
        "min_child_weight": 6,  # the larger, the more conservative
        "max_depth": 5,
        # Learning rate: smaller values shrink each tree's contribution more
        # and therefore usually need more boosting rounds.
        "eta": eta,
        # Add randomness to make training more robust to noise.
        "subsample": 0.7,
        "colsample_bytree": 0.7,
        "seed": model_seed,
        "silent": 1,
        "nthread": 16,
    }
    plst = list(params.items())

    model = xgb.train(plst, xgtrain, num_rounds)
    pred_ytest = model.predict(xgtest)
    # One row of 3 class probabilities per holdout row; keep the most likely class.
    predictions[col] = np.argmax(pred_ytest.reshape( y_test.shape[0], 3 ), axis=1)

In [20]:
# Add the true class codes next to the per-model predictions and write the
# table to disk (the row index is written as the first CSV column).
predictions['y_test'] = y_test
predictions.to_csv('data/xgb_hold.csv')

In [21]:
#For test
# Build the test design matrix with exactly the same columns, in the same
# order, as the training matrix X.
testdropcols = list(set(dropCols)-set(['Happy']))  # presumably test has no 'Happy' column
test2 = test.drop(testdropcols, axis=1)  # drop() returns a new frame; `test` is untouched

for var in test2.columns:
    # Values that never occur in the training column are blanked so they
    # cannot create dummy columns X does not have; they are filled with -999
    # at the end of this cell.
    # NOTE(review): this is applied to every column, including numeric ones,
    # so any numeric test value not present in train is also blanked.
    new = list(set(test2[var]) - set(train[var]))
    # `.loc` replaces the deprecated `.ix` indexer.
    test2.loc[test2[var].isin(new), var] = np.nan

final_test = pd.get_dummies(test2)

# Dummy columns present in X but not produced from the test data are added as
# all-zero columns.
missingCols = list(set(X.columns)-set(final_test.columns))
for col in missingCols:
    final_test[col] = 0

# Select X's columns in X's order (this also discards any extra test-only column).
final_test = final_test[X.columns]
assert X.columns.equals(final_test.columns)
final_test = final_test.fillna(-999)

In [22]:
# Per-row weights for the full training set `y` (despite the name, these are
# not weights for the test rows): class 0 ('Not Happy') gets 2, every other
# class gets 0.5.
# The loop variable is deliberately not named `data`: under Python 2 a
# list-comprehension variable leaks into the notebook namespace, so the
# original overwrote the `data` variable with the last class code.
weight_test = [2 if class_code == 0 else 0.5 for class_code in y]

In [23]:
# Final models: retrain each (rounds, eta, seed) configuration on the full
# training set and store its predicted class for every test row.
predictions_final = pd.DataFrame(index=final_test.index, columns=columns)

# Both matrices are the same for every model, so they are built once.
# Fix: the test matrix now declares the same missing-value sentinel as the
# training matrix. Previously only the training matrix had missing=-999, so at
# prediction time -999 was treated as an ordinary feature value.
xgtrain = xgb.DMatrix(X, label=y, weight = weight_test, missing = -999)
xgtest = xgb.DMatrix(final_test, missing = -999)

for (num_rounds, eta, model_seed, col) in zip(rounds, etas, seeds, columns):
    params = {
        "objective": "multi:softprob",  # per-class probabilities for each row
        "num_class": 3,
        # Control model complexity directly.
        "min_child_weight": 6,  # the larger, the more conservative
        "max_depth": 5,
        # Learning rate: smaller values shrink each tree's contribution more
        # and therefore usually need more boosting rounds.
        "eta": eta,
        # Add randomness to make training more robust to noise.
        "subsample": 0.7,
        "colsample_bytree": 0.7,
        "seed": model_seed,
        "silent": 1,
        "nthread": 4,
    }
    plst = list(params.items())

    model_full = xgb.train(plst, xgtrain, num_rounds)
    pred_finaltest = model_full.predict(xgtest)
    # One row of 3 class probabilities per test row; keep the most likely class.
    predictions_final[col] = np.argmax(pred_finaltest.reshape( final_test.shape[0], 3 ), axis=1)

In [24]:
# Attach the test IDs (aligned on the shared row index) and write the
# per-model predictions to disk. The predictions are the integer class codes
# 0/1/2, not the original 'Happy' label strings.
predictions_final['ID'] = test['ID']
predictions_final.to_csv('data/xgb.csv')