# INF582 AXA Challenge

## Initialisation

In [31]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
# Load main librairies
%matplotlib inline

import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import seaborn as sb
import datetime as dt
import itertools
import random

from sklearn.ensemble import GradientBoostingRegressor
from sklearn import __version__
from sklearn.grid_search import RandomizedSearchCV
from sklearn.cross_validation import train_test_split

pd.set_option('display.max_columns', 500)

## Load data

In [33]:
# Load data
submission = pd.read_csv("data/submission.txt", sep='\t')
training_data = pd.read_csv("data/train_2011_2012.csv", sep=';', nrows=500000,
                            #na_values=['A Définir', 'A DEFINIR', '9999-12-31 00:00:00.000'],
                            usecols=['DATE','WEEK_END','DAY_WE_DS','TPER_TEAM','ASS_ASSIGNMENT','CSPL_RECEIVED_CALLS'])
training_data.head()

Unnamed: 0,DATE,WEEK_END,DAY_WE_DS,TPER_TEAM,ASS_ASSIGNMENT,CSPL_RECEIVED_CALLS
0,2011-04-24 01:30:00.000,1,Dimanche,Nuit,Téléphonie,0
1,2011-04-24 01:30:00.000,1,Dimanche,Nuit,Finances PCX,0
2,2011-04-24 01:30:00.000,1,Dimanche,Nuit,Finances PCX,0
3,2011-04-24 01:30:00.000,1,Dimanche,Nuit,Téléphonie,0
4,2011-04-24 01:30:00.000,1,Dimanche,Nuit,Téléphonie,0


## Clean data

In [34]:
# Remove non usefull rows
training_data = training_data[training_data.ASS_ASSIGNMENT.isin(submission.ASS_ASSIGNMENT.unique())]

In [35]:
# Aggregate the calls
training_data = training_data.groupby([col for col in training_data.columns if not col == 'CSPL_RECEIVED_CALLS']).sum().reset_index()

In [36]:
#Extract the time slot from date
def get_time(date):
    time = dt.datetime.strptime(date,'%Y-%m-%d %H:%M:%S.000').time()
    return time.hour * 3600 + time.minute * 60 + time.second

#Assign a number to the day of the week
day_to_num_dict = {j:i for i,j in enumerate(['Lundi','Mardi','Mercredi','Jeudi','Vendredi','Samedi','Dimanche'])}

training_data['TIME'] = training_data.DATE.map(get_time)
training_data['WEEK_DAY'] = training_data.DAY_WE_DS.map(day_to_num_dict)
training_data['NIGHT'] = (training_data.TPER_TEAM == "Nuit") * 1

# Remove obsolete columns
training_data = training_data[[col for col in training_data.columns if not col in ['DATE','DAY_WE_DS','TPER_TEAM']]]

In [37]:
#Convert the different ASS_ASSIGNMENTs to booleans
for value in submission.ASS_ASSIGNMENT.unique():
    training_data["ASS_ASSIGNMENT_"+value] = (training_data.ASS_ASSIGNMENT == value) * 1
    
# Remove obsolete column
training_data = training_data[[col for col in training_data.columns if not col == 'ASS_ASSIGNMENT']]

In [38]:
training_data.head()

Unnamed: 0,WEEK_END,CSPL_RECEIVED_CALLS,TIME,WEEK_DAY,NIGHT,ASS_ASSIGNMENT_CAT,ASS_ASSIGNMENT_Téléphonie,ASS_ASSIGNMENT_Tech. Inter,ASS_ASSIGNMENT_Tech. Axa,ASS_ASSIGNMENT_Services,ASS_ASSIGNMENT_Regulation Medicale,ASS_ASSIGNMENT_RENAULT,ASS_ASSIGNMENT_Nuit,ASS_ASSIGNMENT_SAP,ASS_ASSIGNMENT_Japon,ASS_ASSIGNMENT_Gestion Renault,ASS_ASSIGNMENT_Gestion Amex,ASS_ASSIGNMENT_Gestion - Accueil Telephonique,ASS_ASSIGNMENT_Gestion,ASS_ASSIGNMENT_Domicile,ASS_ASSIGNMENT_Crises,ASS_ASSIGNMENT_Médical,ASS_ASSIGNMENT_Tech. Total,ASS_ASSIGNMENT_Mécanicien,ASS_ASSIGNMENT_Gestion Relation Clienteles,ASS_ASSIGNMENT_Manager,ASS_ASSIGNMENT_Gestion Clients,ASS_ASSIGNMENT_Gestion DZ,ASS_ASSIGNMENT_RTC,ASS_ASSIGNMENT_CMS,ASS_ASSIGNMENT_Prestataires,ASS_ASSIGNMENT_Gestion Assurances
0,1,0,0,5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,5,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,5,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## View main statistics

In [39]:
training_data.describe()

Unnamed: 0,WEEK_END,CSPL_RECEIVED_CALLS,TIME,WEEK_DAY,NIGHT,ASS_ASSIGNMENT_CAT,ASS_ASSIGNMENT_Téléphonie,ASS_ASSIGNMENT_Tech. Inter,ASS_ASSIGNMENT_Tech. Axa,ASS_ASSIGNMENT_Services,ASS_ASSIGNMENT_Regulation Medicale,ASS_ASSIGNMENT_RENAULT,ASS_ASSIGNMENT_Nuit,ASS_ASSIGNMENT_SAP,ASS_ASSIGNMENT_Japon,ASS_ASSIGNMENT_Gestion Renault,ASS_ASSIGNMENT_Gestion Amex,ASS_ASSIGNMENT_Gestion - Accueil Telephonique,ASS_ASSIGNMENT_Gestion,ASS_ASSIGNMENT_Domicile,ASS_ASSIGNMENT_Crises,ASS_ASSIGNMENT_Médical,ASS_ASSIGNMENT_Tech. Total,ASS_ASSIGNMENT_Mécanicien,ASS_ASSIGNMENT_Gestion Relation Clienteles,ASS_ASSIGNMENT_Manager,ASS_ASSIGNMENT_Gestion Clients,ASS_ASSIGNMENT_Gestion DZ,ASS_ASSIGNMENT_RTC,ASS_ASSIGNMENT_CMS,ASS_ASSIGNMENT_Prestataires,ASS_ASSIGNMENT_Gestion Assurances
count,59405.0,59405.0,59405.0,59405.0,59405.0,59405.0,59405.0,59405.0,59405.0,59405.0,59405.0,59405.0,59405.0,59405.0,59405.0,59405.0,59405.0,59405.0,59405.0,59405.0,59405.0,59405.0,59405.0,59405.0,59405.0,59405.0,59405.0,59405.0,59405.0,59405.0,59405.0,59405.0
mean,0.266139,4.542715,21802.922313,2.925983,0.576601,0.013602,0.047757,0.047757,0.044946,0.047757,0.043633,0.047757,0.047723,0.047488,0.047723,0.046208,0.038987,0.047269,0.047723,0.047757,0.040468,0.047757,0.037286,0.022742,0.02318,0.030098,0.029055,0.023601,0.017995,0.012777,0.011043,0.039912
std,0.441942,16.319355,12556.46889,1.973153,0.494102,0.115831,0.213253,0.213253,0.207187,0.213253,0.204278,0.213253,0.213182,0.212681,0.213182,0.209938,0.193565,0.212215,0.213182,0.213253,0.197056,0.213253,0.189464,0.149082,0.150476,0.17086,0.167962,0.151803,0.132935,0.112311,0.104504,0.195755
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,10800.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,23400.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,32400.0,5.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,344.0,46800.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


##  A simple predictor

Lets try building a tree-based boosting predictor with very few attributes just to see how it goes.
This predictor will only predict the number of calls received during a given time stamp (e.g. in a 30 minutes slot) and day of week.

Note that the date is not relevant for regression, but we can extract some relevant information from it: day of the week, time slot, and if it is a week-end or not.

Also, for some reason the data for a given ASS_ASSIGNMENT and DATE is sometimes split, so we have to aggregate it.

In [40]:
output_cols = ['CSPL_RECEIVED_CALLS']
input_cols = [col for col in training_data.columns if not col in output_cols]

In [46]:
#Now create the gradient boosting regressor

X_train, X_test, y_train, y_test = train_test_split(training_data[input_cols],  training_data[output_cols].values.ravel())

est = GradientBoostingRegressor()

tuned_parameters = {'loss' : ['ls', 'lad', 'huber', 'quantile'],'n_estimators':[30],'learning_rate': [0.3,0.5,0.7], 'subsample': [1.0],
                     'min_samples_split':[1,3],'min_samples_leaf':[1,2],
                     'max_depth':[3,5],'max_features':['auto']
                    }
                     

clf = RandomizedSearchCV(est, tuned_parameters, cv=5,n_jobs=-1,n_iter=18,verbose=1)

clf.fit(X_train, y_train)

est_temoin=GradientBoostingRegressor()
est_temoin.fit(X_train,y_train)

print(clf.best_params_)
best_est=clf.best_estimator_

#Plot CV error (this is squared loss, which will be used to evaluate our performance in the leaderboard)

#Sur 10.000 lignes best_estimator est souvent moins bon que le temoin.. Par contre la différence est nette
# quand on utilise toutes les données.
#The higher the score the better.
print("Best estimator : %.4f" %best_est.score(X_test,y_test))
print("Temoin : %.4f" %est_temoin.score(X_test,y_test))



Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  2.0min finished


{'min_samples_split': 3, 'loss': 'ls', 'max_features': 'auto', 'max_depth': 5, 'learning_rate': 0.5, 'min_samples_leaf': 1, 'subsample': 1.0, 'n_estimators': 30}
Best estimator : 0.9261
Temoin : 0.8930


# Prediction and submission

In [42]:
test_data = submission.copy()

def get_weekday(date):
    return dt.datetime.strptime(date,'%Y-%m-%d %H:%M:%S.000').weekday()

test_data['TIME'] = test_data.DATE.map(get_time)
test_data['WEEK_DAY'] = test_data.DATE.map(get_weekday)
test_data['NIGHT'] = (np.logical_or(test_data.TIME >= (23*3600 + 30*60),
                                    test_data.TIME <  (7*3600  + 30*60))) * 1
test_data['WEEK_END'] = test_data.WEEK_DAY.isin([5, 6]) * 1

# Convert the different ASS_ASSIGNMENTs to booleans
for value in submission.ASS_ASSIGNMENT.unique():
    test_data["ASS_ASSIGNMENT_"+value] = (test_data.ASS_ASSIGNMENT == value) * 1

test_data = test_data[input_cols]

In [43]:
submission.prediction = best_est.predict(test_data)
submission.head()

Unnamed: 0,DATE,ASS_ASSIGNMENT,prediction
0,2012-01-03 00:00:00.000,CAT,0.167185
1,2012-01-03 00:00:00.000,Téléphonie,0.757762
2,2012-01-03 00:00:00.000,Tech. Inter,0.009142
3,2012-01-03 00:00:00.000,Tech. Axa,-0.003266
4,2012-01-03 00:00:00.000,Services,0.328637


In [44]:
# Write prediction to csv
submission.to_csv("data/output.txt", sep='\t', index=False)