# INF582 AXA Challenge

## Initialisation

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# Load main libraries
%matplotlib inline

import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import seaborn as sb
import datetime as dt
import itertools
import random

from sklearn.ensemble import GradientBoostingRegressor
from sklearn import __version__
from sklearn.grid_search import RandomizedSearchCV
from sklearn.cross_validation import train_test_split

pd.set_option('display.max_columns', 500)

## Load data

In [None]:
# Load data
submission = pd.read_csv("data/submission.txt", sep='\t')

# Only the first 10000 rows of the training file are read, restricted to the
# columns listed below; the listed placeholder strings are parsed as NaN.
na_markers = ['A Définir', 'A DEFINIR', '9999-12-31 00:00:00.000']
wanted_columns = ['DATE', 'WEEK_END', 'DAY_WE_DS', 'TPER_TEAM',
                  'ASS_ASSIGNMENT', 'CSPL_RECEIVED_CALLS']
training_data = pd.read_csv(
    "data/train_2011_2012.csv",
    sep=';',
    nrows=10000,
    na_values=na_markers,
    usecols=wanted_columns,
)
training_data.head()

In [None]:
# List of departments with more than 1 million inhabitants, as string codes.
# Every code must be its own element: adjacent string literals without a comma
# are silently concatenated by Python (e.g. '75' '13' would become '7513').
big_dept = ['59', '75', '13', '92', '93', '33', '62', '78', '77', '94', '44',
            '31', '76', '91', '38', '95', '67', '34', '06', '83', '57', '35']


# Read the 2012 weather file in 1000-row chunks (no header row, columns
# 0, 1, 3 and 6 only) and keep the rows whose column 1 is listed in big_dept.
iter_csv = pd.read_csv(
    "data/meteo_2012.csv",
    header=None,
    usecols=[0, 1, 3, 6],
    chunksize=1000,
    iterator=True,
)

kept_chunks = []
for chunk in iter_csv:
    in_big_dept = chunk[1].isin(big_dept)
    kept_chunks.append(chunk[in_big_dept])

# Give the positional columns readable names.
meteo2012 = pd.concat(kept_chunks).rename(
    columns={0: 'DATE', 1: 'DEPT', 3: 'TEMP', 6: 'RAIN'})

In [None]:
# Read the 2011 weather file in 1000-row chunks (no header row, columns
# 0, 1, 3 and 6 only) and keep the rows whose column 1 is listed in big_dept.
iter_csv = pd.read_csv(
    "data/meteo_2011.csv",
    header=None,
    usecols=[0, 1, 3, 6],
    chunksize=1000,
    iterator=True,
)

kept_chunks = []
for chunk in iter_csv:
    in_big_dept = chunk[1].isin(big_dept)
    kept_chunks.append(chunk[in_big_dept])

# Give the positional columns readable names.
meteo2011 = pd.concat(kept_chunks).rename(
    columns={0: 'DATE', 1: 'DEPT', 3: 'TEMP', 6: 'RAIN'})


In [None]:


# Stack the two yearly weather frames into one, 2012 rows first.
frames = [meteo2012, meteo2011]
meteo = pd.concat(objs=frames)

def format_date(date):
    """Convert a 'YYYY-MM-DD HH:MM' string to 'YYYY-MM-DD HH:MM:SS.000'.

    The seconds are always rendered as '00' (the input carries none) and the
    '.000' suffix is a literal part of the output layout.
    """
    parsed = dt.datetime.strptime(date, '%Y-%m-%d %H:%M')
    return parsed.strftime('%Y-%m-%d %H:%M:%S.000')

# Rewrite the timestamps in the '%Y-%m-%d %H:%M:%S.000' layout that the
# date helpers and the DATE joins further down work with.
meteo['DATE'] = meteo.DATE.map(format_date)

#Format meteo: one <dept>_TEMP / <dept>_RAIN column pair per department.
for dept in sorted(meteo.DEPT.unique()):
    index = (meteo.DEPT == dept)
    # Rows belonging to other departments are masked to NaN (not multiplied
    # to 0) so that the max() below ignores them. A 0 placeholder would win
    # against every sub-zero temperature and silently clamp it to 0.
    meteo["{}_TEMP".format(dept)] = meteo.TEMP.where(index)
    meteo["{}_RAIN".format(dept)] = meteo.RAIN.where(index)

meteo.drop(["DEPT", "RAIN", "TEMP"], axis=1, inplace=True)

# Collapse to a single row per timestamp. A department with no reading at a
# given timestamp gets 0, so the resulting frame holds no missing values.
meteo = meteo.groupby("DATE").max().fillna(0).reset_index()

meteo.head()


In [None]:
# Display summary statistics of meteo as a quick sanity check.
meteo.describe()

## Clean data

In [None]:
# Keep only the rows whose ASS_ASSIGNMENT also appears in the submission file.
wanted_assignments = submission.ASS_ASSIGNMENT.unique()
keep_row = training_data.ASS_ASSIGNMENT.isin(wanted_assignments)
training_data = training_data[keep_row]

In [None]:
# Aggregate the calls: sum CSPL_RECEIVED_CALLS over the rows that share the
# same value in every other column.
group_keys = [name for name in training_data.columns if name != 'CSPL_RECEIVED_CALLS']
training_data = training_data.groupby(group_keys).sum().reset_index()

In [None]:
#Extract the time slot from date
def get_time(date):
    """Return the time of day of `date` as a number of seconds since midnight.

    `date` is a string in the 'YYYY-MM-DD HH:MM:SS.000' layout.
    """
    moment = dt.datetime.strptime(date, '%Y-%m-%d %H:%M:%S.000')
    return (moment.hour * 60 + moment.minute) * 60 + moment.second

def get_month(date):
    """Return the month number (1-12) of a 'YYYY-MM-DD HH:MM:SS.000' string."""
    return dt.datetime.strptime(date, '%Y-%m-%d %H:%M:%S.000').month

#Join weather
training_data = pd.merge(training_data, meteo, how='left', on=['DATE'])

# Deal with NAs left by the join (replacing them with an average would be
# better). The frame keeps its default positional index here, so
# method='index' interpolates along the row position, NOT along DATE.
# A bare `training_data.set_index('DATE')` call used to sit here, but its
# result was never assigned, so it had no effect; it has been dropped.
# It seems to work, but it is better not to look too closely at what happens.
training_data.interpolate(method='index', inplace=True)

#Assign a number to the day of the week (Lundi -> 0, ..., Dimanche -> 6)
french_day_names = ['Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi', 'Dimanche']
day_to_num_dict = {name: number for number, name in enumerate(french_day_names)}

# Derived features: seconds since midnight, month number, weekday number and
# a 0/1 flag set when TPER_TEAM equals "Nuit".
training_data['TIME'] = training_data.DATE.map(get_time)
training_data['MONTH'] = training_data.DATE.map(get_month)
training_data['WEEK_DAY'] = training_data.DAY_WE_DS.map(day_to_num_dict)
training_data['NIGHT'] = (training_data.TPER_TEAM == "Nuit") * 1

# Remove the columns that the derived features replace
training_data = training_data.drop(['DATE', 'DAY_WE_DS', 'TPER_TEAM'], axis=1)

In [None]:
# One 0/1 indicator column per ASS_ASSIGNMENT value found in the submission file.
for assignment in submission.ASS_ASSIGNMENT.unique():
    is_assignment = training_data.ASS_ASSIGNMENT == assignment
    training_data["ASS_ASSIGNMENT_" + assignment] = is_assignment * 1

# The categorical column itself is no longer needed once it has been encoded.
training_data = training_data.drop('ASS_ASSIGNMENT', axis=1)

In [None]:
# Display the first rows of the cleaned training frame.
training_data.head()

## View main statistics

In [None]:
# Display summary statistics of the cleaned training frame.
training_data.describe()

##  A simple predictor

Let's try building a tree-based boosting predictor with very few attributes, just to see how it goes.
This predictor only predicts the number of calls received in a given time slot (e.g. a 30-minute slot) on a given day of the week.

Note that the date is not relevant for regression, but we can extract some relevant information from it: day of the week, time slot, and if it is a week-end or not.

Also, for some reason the data for a given ASS_ASSIGNMENT and DATE is sometimes split, so we have to aggregate it.

In [None]:
# Target column(s); every other column of training_data is used as a model input.
output_cols = ['CSPL_RECEIVED_CALLS']
input_cols = [name for name in training_data.columns if name not in output_cols]

In [None]:
#Now create the gradient boosting regressor

# One seed shared by the split, the randomized search and both estimators so
# that re-running the notebook reproduces the same scores.
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(
    training_data[input_cols],
    training_data[output_cols].values.ravel(),
    random_state=RANDOM_STATE)

est = GradientBoostingRegressor(random_state=RANDOM_STATE)

# Candidate hyper-parameters sampled by the randomized search.
# min_samples_split starts at 2: a node holding a single sample can never be
# split, and newer scikit-learn releases reject a value of 1 outright.
tuned_parameters = {'loss' : ['ls'],'n_estimators':[50,80],'learning_rate': [0.5], 'subsample': [1.0],
                  'min_samples_split':[2,3],'min_samples_leaf':[1,3],
                    'max_depth':[5,8,15,20,25],'max_features':['auto']
                 }

clf = RandomizedSearchCV(est, tuned_parameters, cv=5, n_jobs=-1, n_iter=20, verbose=1,
                         random_state=RANDOM_STATE)

clf.fit(X_train, y_train)

# Control ("temoin") model with hand-picked settings, fitted on the same split.
est_temoin = GradientBoostingRegressor(n_estimators=100, max_depth=25,
                                       random_state=RANDOM_STATE)
est_temoin.fit(X_train, y_train)

print(clf.best_params_)
best_est = clf.best_estimator_

# Compare both models on the held-out split. score() on a regressor returns
# the R^2 coefficient (the higher the better); it is not the squared loss
# which, according to the original note, is used on the leaderboard.
#
# On 10,000 rows the best estimator is often worse than the control model.
# The difference is clear, however, when all the data is used.
print("Best estimator : %.4f" %best_est.score(X_test,y_test))
print("Temoin : %.4f" %est_temoin.score(X_test,y_test))



# Prediction and submission

In [None]:
test_data = submission.copy()

#Join weather
test_data = pd.merge(test_data, meteo, how='left', on=['DATE'])

# Deal with NAs left by the join. The frame keeps its default positional
# index here, so method='index' interpolates along the row position, NOT
# along DATE. A bare `test_data.set_index('DATE')` call used to sit here, but
# its result was never assigned, so it had no effect; it has been dropped.
test_data.interpolate(method='index', inplace=True)


def get_weekday(date):
    """Return the weekday of a 'YYYY-MM-DD HH:MM:SS.000' string (Monday=0 ... Sunday=6)."""
    parsed = dt.datetime.strptime(date, '%Y-%m-%d %H:%M:%S.000')
    return parsed.weekday()

# Build the TIME, MONTH and WEEK_DAY features directly from the DATE string.
test_data['TIME'] = test_data.DATE.map(get_time)
test_data['MONTH'] = test_data.DATE.map(get_month)
test_data['WEEK_DAY'] = test_data.DATE.map(get_weekday)

# NIGHT is 1 from 23:30 (inclusive) to 07:30 (exclusive), expressed in
# seconds since midnight, and 0 otherwise.
night_start = 23 * 3600 + 30 * 60
night_end = 7 * 3600 + 30 * 60
is_night = (test_data.TIME >= night_start) | (test_data.TIME < night_end)
test_data['NIGHT'] = is_night * 1

# WEEK_END is 1 for weekday numbers 5 and 6.
test_data['WEEK_END'] = test_data.WEEK_DAY.isin([5, 6]) * 1

# One 0/1 indicator column per ASS_ASSIGNMENT value found in the submission file.
for assignment in submission.ASS_ASSIGNMENT.unique():
    is_assignment = test_data.ASS_ASSIGNMENT == assignment
    test_data["ASS_ASSIGNMENT_" + assignment] = is_assignment * 1

# Keep exactly the model's input columns, in the order of input_cols.
test_data = test_data[input_cols]

In [None]:
# Predict with the control model (est_temoin). Item assignment is used so
# the values always end up in a real 'prediction' column: attribute-style
# assignment only updates a column that already exists and would otherwise
# just set a plain Python attribute that to_csv never writes out.
submission['prediction'] = est_temoin.predict(test_data)
submission.head()

In [None]:
# Write prediction to csv (tab-separated, row index left out)
submission.to_csv("data/output.txt", index=False, sep='\t')