# INF582 AXA Challenge

## Initialisation

In [53]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [54]:
# Load main librairies
%matplotlib inline

import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import seaborn as sb
import datetime as dt
import itertools
import random

from sklearn.ensemble import GradientBoostingRegressor
from sklearn import __version__
from sklearn.grid_search import RandomizedSearchCV
from sklearn.cross_validation import train_test_split

pd.set_option('display.max_columns', 500)

## Load data

In [55]:
# Load data
submission = pd.read_csv("data/submission.txt", sep='\t')
training_data = pd.read_csv("data/train_2011_2012.csv", sep=';', nrows=10000,
                            na_values=['A Définir', 'A DEFINIR', '9999-12-31 00:00:00.000'],
                            usecols=['DATE','WEEK_END','DAY_WE_DS','TPER_TEAM','ASS_ASSIGNMENT','CSPL_RECEIVED_CALLS']
                            )
training_data.head()

Unnamed: 0,DATE,WEEK_END,DAY_WE_DS,TPER_TEAM,ASS_ASSIGNMENT,CSPL_RECEIVED_CALLS
0,2011-04-24 01:30:00.000,1,Dimanche,Nuit,Téléphonie,0
1,2011-04-24 01:30:00.000,1,Dimanche,Nuit,Finances PCX,0
2,2011-04-24 01:30:00.000,1,Dimanche,Nuit,Finances PCX,0
3,2011-04-24 01:30:00.000,1,Dimanche,Nuit,Téléphonie,0
4,2011-04-24 01:30:00.000,1,Dimanche,Nuit,Téléphonie,0


In [56]:
#list of departements with more than 1 million inhabitants
big_dept = ['59','75''13','92','93','33','62','78','77','94','44','31','76','91','38','95','67','34','06','83','57','35']


iter_csv = pd.read_csv("data/meteo_2012.csv", iterator=True, chunksize=1000,
                           #nrows=50000,
                           header=None,
                           usecols=[0,1,3,6]
                          )
meteo2012 = pd.concat([chunk[chunk[1].isin(big_dept)] for chunk in iter_csv])


meteo2012.rename(columns={0: 'DATE', 1: 'DEPT',3:'TEMP',6:'RAIN'}, inplace=True)

In [57]:
iter_csv = pd.read_csv("data/meteo_2011.csv", iterator=True, chunksize=1000,
                           #nrows=50000,
                           header=None,
                           usecols=[0,1,3,6]
                          )
meteo2011 = pd.concat([chunk[chunk[1].isin(big_dept)] for chunk in iter_csv])


meteo2011.rename(columns={0: 'DATE', 1: 'DEPT',3:'TEMP',6:'RAIN'}, inplace=True)


In [73]:


frames=[meteo2012,meteo2011]

meteo=pd.concat(frames)

def format_date(date):
    return dt.datetime.strptime(date, '%Y-%m-%d %H:%M').strftime('%Y-%m-%d %H:%M:%S.000')

meteo['DATE'] = meteo.DATE.map(format_date)

#Format meteo
for dept in sorted(meteo.DEPT.unique()):
    index = (meteo.DEPT == dept)
    meteo["{}_TEMP".format(dept)] = meteo.TEMP * index
    meteo["{}_RAIN".format(dept)] = meteo.RAIN * index

meteo.drop(["DEPT", "RAIN", "TEMP"], axis=1, inplace=True)
meteo = meteo.groupby("DATE").max().reset_index()

meteo.head()


Unnamed: 0,DATE,06_TEMP,06_RAIN,31_TEMP,31_RAIN,33_TEMP,33_RAIN,34_TEMP,34_RAIN,35_TEMP,35_RAIN,38_TEMP,38_RAIN,44_TEMP,44_RAIN,57_TEMP,57_RAIN,59_TEMP,59_RAIN,62_TEMP,62_RAIN,67_TEMP,67_RAIN,76_TEMP,76_RAIN,77_TEMP,77_RAIN,78_TEMP,78_RAIN,83_TEMP,83_RAIN,91_TEMP,91_RAIN,93_TEMP,93_RAIN,94_TEMP,94_RAIN,95_TEMP,95_RAIN
0,2011-01-01 00:00:00.000,8.2,0,8.2,0,7.1,0,10.0,0,1.6,0,2.4,0,2.1,0,0,0,5.1,0.0,4.0,0,0,0,1.2,0,0,0,0,0,11.4,0.0,0,0,0,0,0,0,0,0
1,2011-01-01 01:00:00.000,7.7,0,7.9,0,6.6,0,9.5,0,1.3,0,1.7,0,1.6,0,0,0,4.9,0.0,3.8,0,0,0,0.6,0,0,0,0,0,11.3,0.0,0,0,0,0,0,0,0,0
2,2011-01-01 02:00:00.000,7.5,0,7.1,0,7.0,0,9.0,0,1.2,0,1.4,0,1.4,0,0,0,4.8,0.0,3.8,0,0,0,0.3,0,0,0,0,0,11.1,0.2,0,0,0,0,0,0,0,0
3,2011-01-01 03:00:00.000,6.9,0,6.2,0,6.9,0,8.4,0,0.9,0,1.8,0,1.2,0,0,0,4.7,0.2,3.8,0,0,0,0.4,0,0,0,0,0,10.7,0.0,0,0,0,0,0,0,0,0
4,2011-01-01 04:00:00.000,6.8,0,6.3,0,6.5,0,8.4,0,0.8,0,2.3,0,1.2,0,0,0,5.0,0.0,3.8,0,0,0,0.5,0,0,0,0,0,10.5,0.0,0,0,0,0,0,0,0,0


In [74]:
meteo.describe()

Unnamed: 0,06_TEMP,06_RAIN,31_TEMP,31_RAIN,33_TEMP,33_RAIN,34_TEMP,34_RAIN,35_TEMP,35_RAIN,38_TEMP,38_RAIN,44_TEMP,44_RAIN,57_TEMP,57_RAIN,59_TEMP,59_RAIN,62_TEMP,62_RAIN,67_TEMP,67_RAIN,76_TEMP,76_RAIN,77_TEMP,77_RAIN,78_TEMP,78_RAIN,83_TEMP,83_RAIN,91_TEMP,91_RAIN,93_TEMP,93_RAIN,94_TEMP,94_RAIN,95_TEMP,95_RAIN
count,17008.0,16147.0,17008.0,16147.0,17008.0,16147.0,17008.0,16147.0,17008.0,16147.0,17008.0,16147.0,17008.0,16147.0,17008.0,16147.0,17008.0,16147.0,17008.0,16147.0,17008.0,16147.0,17008.0,16147.0,17008.0,16147.0,17008.0,16147.0,17008.0,16147.0,17008.0,16147.0,17008.0,16147.0,17008.0,16147.0,17008.0,16147.0
mean,17.084131,0.138162,15.195343,0.081786,15.741433,0.145631,17.39338,0.098148,13.040481,0.124983,14.106797,0.135877,14.153587,7.408423,12.309695,0.122654,12.667409,0.138763,12.688935,0.1952,11.91477,0.065226,12.731274,0.208447,11.496061,0.063349,12.513623,0.095318,18.41932,0.194408,11.935954,0.034006,12.940663,0.093138,11.896155,0.063374,12.052734,0.105778
std,7.228599,0.956197,7.651069,0.465517,6.857544,0.605958,7.170944,0.970825,5.827736,0.582436,8.104003,0.842872,5.739288,85.563586,7.370989,0.759631,6.113239,0.627405,5.914849,0.963048,7.950478,0.438676,5.698703,0.805386,7.174788,0.499858,6.594215,0.538255,7.058132,1.32083,7.034539,0.347805,6.705329,0.572116,7.151744,0.440901,7.059844,0.543905
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,11.8,0.0,9.8,0.0,11.4,0.0,12.3,0.0,9.2,0.0,7.7,0.0,10.5,0.0,6.6,0.0,8.3,0.0,8.6,0.0,5.7,0.0,8.9,0.0,6.4,0.0,7.8,0.0,13.0,0.0,6.8,0.0,8.2,0.0,7.0,0.0,7.3,0.0
50%,17.2,0.0,15.0,0.0,15.5,0.0,17.1,0.0,12.7,0.0,14.1,0.0,13.9,0.0,11.9,0.0,12.4,0.0,12.6,0.0,11.6,0.0,12.6,0.0,11.4,0.0,12.2,0.0,17.7,0.0,11.8,0.0,12.7,0.0,11.9,0.0,12.0,0.0
75%,22.7,0.0,20.5,0.0,20.1,0.0,22.6,0.0,16.8,0.0,19.9,0.0,17.925,0.0,17.7,0.0,16.9,0.0,16.5,0.0,17.9,0.0,16.5,0.0,16.6,0.0,17.0,0.0,23.3,0.0,16.8,0.0,17.6,0.0,17.0,0.0,17.1,0.0
max,36.5,40.1,39.1,16.0,40.2,17.0,37.7,64.0,34.5,21.0,42.0,31.6,36.0,1024.8,38.3,29.0,60.0,25.8,34.6,50.0,35.2,15.8,73.3,25.7,37.9,25.0,37.6,16.0,91.2,38.0,38.7,13.2,37.8,25.5,38.2,15.1,37.0,16.4


## Clean data

In [60]:
# Remove non usefull rows
training_data = training_data[training_data.ASS_ASSIGNMENT.isin(submission.ASS_ASSIGNMENT.unique())]

In [61]:
# Aggregate the calls
training_data = training_data.groupby([col for col in training_data.columns if not col == 'CSPL_RECEIVED_CALLS']).sum().reset_index()

In [62]:
#Extract the time slot from date
def get_time(date):
    time = dt.datetime.strptime(date,'%Y-%m-%d %H:%M:%S.000').time()
    return time.hour * 3600 + time.minute * 60 + time.second

def get_month(date):
    month = dt.datetime.strptime(date,'%Y-%m-%d %H:%M:%S.000').month
    return month

#Join weather
training_data = pd.merge(training_data, meteo, how='left', on=['DATE'])

#Deal with NAs. Would be better to replace with average
#training_data.fillna(0,inplace = True)

#Ca a l'air de fonctionner mais mieux vaut ne pas regarder trop près ce qui se passe...
training_data.set_index('DATE')
training_data.interpolate(method='index',inplace=True)

#Assign a number to the day of the week
day_to_num_dict = {j:i for i,j in enumerate(['Lundi','Mardi','Mercredi','Jeudi','Vendredi','Samedi','Dimanche'])}

training_data['TIME'] = training_data.DATE.map(get_time)
training_data['MONTH'] = training_data.DATE.map(get_month)
training_data['WEEK_DAY'] = training_data.DAY_WE_DS.map(day_to_num_dict)
training_data['NIGHT'] = (training_data.TPER_TEAM == "Nuit") * 1

# Remove obsolete columns
training_data = training_data[[col for col in training_data.columns if not col in ['DATE','DAY_WE_DS','TPER_TEAM']]]

In [63]:
#Convert the different ASS_ASSIGNMENTs to booleans
for value in submission.ASS_ASSIGNMENT.unique():
    training_data["ASS_ASSIGNMENT_"+value] = (training_data.ASS_ASSIGNMENT == value) * 1
    
# Remove obsolete column
training_data = training_data[[col for col in training_data.columns if not col == 'ASS_ASSIGNMENT']]

In [64]:
training_data.head()

Unnamed: 0,WEEK_END,CSPL_RECEIVED_CALLS,06_TEMP,06_RAIN,31_TEMP,31_RAIN,33_TEMP,33_RAIN,34_TEMP,34_RAIN,35_TEMP,35_RAIN,38_TEMP,38_RAIN,44_TEMP,44_RAIN,57_TEMP,57_RAIN,59_TEMP,59_RAIN,62_TEMP,62_RAIN,67_TEMP,67_RAIN,76_TEMP,76_RAIN,77_TEMP,77_RAIN,78_TEMP,78_RAIN,83_TEMP,83_RAIN,91_TEMP,91_RAIN,93_TEMP,93_RAIN,94_TEMP,94_RAIN,95_TEMP,95_RAIN,TIME,MONTH,WEEK_DAY,NIGHT,ASS_ASSIGNMENT_CAT,ASS_ASSIGNMENT_Téléphonie,ASS_ASSIGNMENT_Tech. Inter,ASS_ASSIGNMENT_Tech. Axa,ASS_ASSIGNMENT_Services,ASS_ASSIGNMENT_Regulation Medicale,ASS_ASSIGNMENT_RENAULT,ASS_ASSIGNMENT_Nuit,ASS_ASSIGNMENT_SAP,ASS_ASSIGNMENT_Japon,ASS_ASSIGNMENT_Gestion Renault,ASS_ASSIGNMENT_Gestion Amex,ASS_ASSIGNMENT_Gestion - Accueil Telephonique,ASS_ASSIGNMENT_Gestion,ASS_ASSIGNMENT_Domicile,ASS_ASSIGNMENT_Crises,ASS_ASSIGNMENT_Médical,ASS_ASSIGNMENT_Tech. Total,ASS_ASSIGNMENT_Mécanicien,ASS_ASSIGNMENT_Gestion Relation Clienteles,ASS_ASSIGNMENT_Manager,ASS_ASSIGNMENT_Gestion Clients,ASS_ASSIGNMENT_Gestion DZ,ASS_ASSIGNMENT_RTC,ASS_ASSIGNMENT_CMS,ASS_ASSIGNMENT_Prestataires,ASS_ASSIGNMENT_Gestion Assurances
0,1,0,8.2,0,8.2,0,7.1,0,10,0,1.6,0,2.4,0,2.1,0,0,0,5.1,0,4,0,0,0,1.2,0,0,0,0,0,11.4,0,0,0,0,0,0,0,0,0,0,1,5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,1,0,8.2,0,8.2,0,7.1,0,10,0,1.6,0,2.4,0,2.1,0,0,0,5.1,0,4,0,0,0,1.2,0,0,0,0,0,11.4,0,0,0,0,0,0,0,0,0,0,1,5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,8.2,0,8.2,0,7.1,0,10,0,1.6,0,2.4,0,2.1,0,0,0,5.1,0,4,0,0,0,1.2,0,0,0,0,0,11.4,0,0,0,0,0,0,0,0,0,0,1,5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,8.2,0,8.2,0,7.1,0,10,0,1.6,0,2.4,0,2.1,0,0,0,5.1,0,4,0,0,0,1.2,0,0,0,0,0,11.4,0,0,0,0,0,0,0,0,0,0,1,5,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0,8.2,0,8.2,0,7.1,0,10,0,1.6,0,2.4,0,2.1,0,0,0,5.1,0,4,0,0,0,1.2,0,0,0,0,0,11.4,0,0,0,0,0,0,0,0,0,0,1,5,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## View main statistics

In [65]:
training_data.describe()

Unnamed: 0,WEEK_END,CSPL_RECEIVED_CALLS,06_TEMP,06_RAIN,31_TEMP,31_RAIN,33_TEMP,33_RAIN,34_TEMP,34_RAIN,35_TEMP,35_RAIN,38_TEMP,38_RAIN,44_TEMP,44_RAIN,57_TEMP,57_RAIN,59_TEMP,59_RAIN,62_TEMP,62_RAIN,67_TEMP,67_RAIN,76_TEMP,76_RAIN,77_TEMP,77_RAIN,78_TEMP,78_RAIN,83_TEMP,83_RAIN,91_TEMP,91_RAIN,93_TEMP,93_RAIN,94_TEMP,94_RAIN,95_TEMP,95_RAIN,TIME,MONTH,WEEK_DAY,NIGHT,ASS_ASSIGNMENT_CAT,ASS_ASSIGNMENT_Téléphonie,ASS_ASSIGNMENT_Tech. Inter,ASS_ASSIGNMENT_Tech. Axa,ASS_ASSIGNMENT_Services,ASS_ASSIGNMENT_Regulation Medicale,ASS_ASSIGNMENT_RENAULT,ASS_ASSIGNMENT_Nuit,ASS_ASSIGNMENT_SAP,ASS_ASSIGNMENT_Japon,ASS_ASSIGNMENT_Gestion Renault,ASS_ASSIGNMENT_Gestion Amex,ASS_ASSIGNMENT_Gestion - Accueil Telephonique,ASS_ASSIGNMENT_Gestion,ASS_ASSIGNMENT_Domicile,ASS_ASSIGNMENT_Crises,ASS_ASSIGNMENT_Médical,ASS_ASSIGNMENT_Tech. Total,ASS_ASSIGNMENT_Mécanicien,ASS_ASSIGNMENT_Gestion Relation Clienteles,ASS_ASSIGNMENT_Manager,ASS_ASSIGNMENT_Gestion Clients,ASS_ASSIGNMENT_Gestion DZ,ASS_ASSIGNMENT_RTC,ASS_ASSIGNMENT_CMS,ASS_ASSIGNMENT_Prestataires,ASS_ASSIGNMENT_Gestion Assurances
count,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0,1584.0
mean,0.278409,0.756313,6.861616,0.099558,6.067551,0.049495,7.314015,0.06553,8.316604,0.013131,6.116856,0.044697,4.841982,0.031629,7.158207,0.043687,4.376831,0.010227,5.861174,0.051641,7.086995,0.032071,2.099116,0.009722,5.937311,0.06048,2.321402,0.020202,5.05404,0.012121,9.743813,0.081818,5.444634,0,5.73649,0.006313,2.61774,0.015025,2.641035,0.010732,371.590909,1.924874,2.940025,1,0.001263,0.051768,0.051136,0.046086,0.051768,0.048611,0.051136,0.051768,0.051136,0.051136,0.049242,0.041667,0.051136,0.051136,0.051136,0.04798,0.051136,0.036616,0.01452,0.016414,0.023359,0.02904,0.018939,0.011995,0.005051,0.003788,0.041035
std,0.448357,3.055182,3.032454,0.38698,3.742443,0.191632,3.769514,0.325499,3.707343,0.095531,3.861868,0.15426,3.91171,0.121051,3.328973,0.261896,3.593161,0.053873,3.537396,0.288389,3.661067,0.097746,3.581976,0.043024,3.30219,0.20782,3.72712,0.139835,3.456108,0.120032,1.845376,0.581104,3.524893,0,3.612798,0.045079,3.942557,0.09405,3.924318,0.053302,1367.366617,0.933703,1.992294,0,0.035522,0.221628,0.220345,0.209737,0.221628,0.215122,0.220345,0.221628,0.220345,0.220345,0.216442,0.199889,0.220345,0.220345,0.220345,0.213791,0.220345,0.187877,0.11966,0.127102,0.151087,0.167973,0.136354,0.108897,0.07091,0.061448,0.198435
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.7,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,4.7,0.0,3.2,0.0,4.6,0.0,5.6,0.0,2.6,0.0,1.6,0.0,4.6,0.0,1.2,0.0,2.8,0.0,4.361842,0.0,0.0,0.0,3.7,0.0,0.0,0.0,2.268421,0.0,8.7,0.0,2.6,0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,7.6,0.0,6.1,0.0,7.7,0.0,8.7,0.0,6.3,0.0,4.5,0.0,7.9,0.0,3.9,0.0,6.0,0.0,7.3,0.0,0.0,0.0,6.1,0.0,0.0,0.0,4.8,0.0,9.8,0.0,4.9,0,5.2,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,8.7,0.0,9.2,0.0,10.3,0.0,11.415789,0.0,9.4,0.0,7.0,0.0,9.4,0.0,7.9,0.0,8.7,0.0,9.6,0.0,3.75,0.0,8.5,0.0,4.975,0.0,8.1,0.0,11.1,0.0,8.7,0,8.5,0.0,5.8,0.0,7.3,0.0,0.0,3.0,5.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,33.0,15.0,2.6,13.1,1.2,16.7,2.6,14.8,0.8,12.8,1.2,17.3,0.6,13.8,2.4,12.0,0.4,12.8,2.4,15.9,0.6,11.5,0.2,12.2,1.2,11.7,1.2,12.5,1.2,14.5,5.6,13.0,0,12.8,0.4,12.0,0.8,12.3,0.4,5400.0,4.0,6.0,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


##  A simple predictor

Lets try building a tree-based boosting predictor with very few attributes just to see how it goes.
This predictor will only predict the number of calls received during a given time stamp (e.g. in a 30 minutes slot) and day of week.

Note that the date is not relevant for regression, but we can extract some relevant information from it: day of the week, time slot, and if it is a week-end or not.

Also, for some reason the data for a given ASS_ASSIGNMENT and DATE is sometimes split, so we have to aggregate it.

In [66]:
output_cols = ['CSPL_RECEIVED_CALLS']
input_cols = [col for col in training_data.columns if not col in output_cols]

In [67]:
#Now create the gradient boosting regressor

X_train, X_test, y_train, y_test = train_test_split(training_data[input_cols],  training_data[output_cols].values.ravel())

est = GradientBoostingRegressor()

tuned_parameters = {'loss' : ['ls'],'n_estimators':[50,80],'learning_rate': [0.5], 'subsample': [1.0],
                  'min_samples_split':[1,3],'min_samples_leaf':[1,3],
                    'max_depth':[5,8,15,20,25],'max_features':['auto']
                 }
                     

clf = RandomizedSearchCV(est, tuned_parameters, cv=5,n_jobs=-1,n_iter=20,verbose=1)

clf.fit(X_train, y_train)

est_temoin=GradientBoostingRegressor(n_estimators=100,max_depth=25)
est_temoin.fit(X_train,y_train)

print(clf.best_params_)
best_est=clf.best_estimator_

#Plot CV error (this is squared loss, which will be used to evaluate our performance in the leaderboard)

#Sur 10.000 lignes best_estimator est souvent moins bon que le temoin.. Par contre la différence est nette
# quand on utilise toutes les données.
#The higher the score the better.
print("Best estimator : %.4f" %best_est.score(X_test,y_test))
print("Temoin : %.4f" %est_temoin.score(X_test,y_test))



Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   20.8s finished


{'n_estimators': 50, 'min_samples_split': 3, 'subsample': 1.0, 'loss': 'ls', 'learning_rate': 0.5, 'min_samples_leaf': 3, 'max_features': 'auto', 'max_depth': 15}
Best estimator : 0.7573
Temoin : 0.6897


# Prediction and submission

In [68]:
test_data = submission.copy()

#Join weather
test_data = pd.merge(test_data, meteo, how='left', on=['DATE'])

#Deal with NAs
test_data.set_index('DATE')
test_data.interpolate(method='index',inplace=True)


def get_weekday(date):
    return dt.datetime.strptime(date,'%Y-%m-%d %H:%M:%S.000').weekday()

test_data['TIME'] = test_data.DATE.map(get_time)
test_data['MONTH'] = test_data.DATE.map(get_month)
test_data['WEEK_DAY'] = test_data.DATE.map(get_weekday)
test_data['NIGHT'] = (np.logical_or(test_data.TIME >= (23*3600 + 30*60),
                                    test_data.TIME <  (7*3600  + 30*60))) * 1
test_data['WEEK_END'] = test_data.WEEK_DAY.isin([5, 6]) * 1

# Convert the different ASS_ASSIGNMENTs to booleans
for value in submission.ASS_ASSIGNMENT.unique():
    test_data["ASS_ASSIGNMENT_"+value] = (test_data.ASS_ASSIGNMENT == value) * 1

test_data = test_data[input_cols]

In [69]:
submission.prediction = est_temoin.predict(test_data)
submission.head()

Unnamed: 0,DATE,ASS_ASSIGNMENT,prediction
0,2012-01-03 00:00:00.000,CAT,8.3e-05
1,2012-01-03 00:00:00.000,Téléphonie,1.757567
2,2012-01-03 00:00:00.000,Tech. Inter,8.3e-05
3,2012-01-03 00:00:00.000,Tech. Axa,7.8e-05
4,2012-01-03 00:00:00.000,Services,1.99904


In [70]:
# Write prediction to csv
submission.to_csv("data/output.txt", sep='\t', index=False)