# INF582 AXA Challenge

## Initialisation

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# Load main libraries
%matplotlib inline

import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import seaborn as sb
import datetime as dt
import itertools
import random

from sklearn.ensemble import GradientBoostingRegressor

pd.set_option('display.max_columns', 500)

## Load data

In [None]:
# Load data
# Only the first 100000 rows are read (nrows). The strings listed in na_values are
# placeholder values that get parsed as NaN instead of being kept as real values.
# NOTE(review): 'A DÃ©finir' looks like a mis-decoded (UTF-8 read as Latin-1) spelling of
# 'A Définir'. Both spellings are listed so the placeholder is caught however the
# file ends up being decoded -- confirm against the raw CSV.
training_data = pd.read_csv(
    "data/train_2011_2012.csv",
    sep=';',
    nrows=100000,
    na_values=['NaN', 'A DÃ©finir', 'A Définir', 'A DEFINIR', '9999-12-31 00:00:00.000'],
)
training_data.head()

## Clean data

In [None]:
# Remove columns containing only one value.
# The list of constant columns is built first and dropped in a single call.
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally, so the
# former `drop(name, 1, inplace=True)` raises a TypeError there.
# Note: `unique()` counts NaN as a value, so an all-NaN column is dropped while a
# column holding one value plus NaN is kept.
constant_columns = [name for name in training_data.columns
                    if training_data[name].unique().size <= 1]
training_data = training_data.drop(constant_columns, axis=1)

training_data.head(n=10)

## View main statistics

In [None]:
# Display pandas' summary statistics for the training data
training_data.describe()

##  A simple predictor

Let's try building a tree-based boosting predictor with very few attributes, just to see how it goes.
This predictor will only predict the number of calls received during a given time slot (e.g. a 30-minute slot) on a given day of the week.

Note that the date itself is not relevant for regression, but we can extract some relevant information from it: the day of the week, the time slot, and whether it falls on a weekend or not.

Also, for some reason the data for a given ASS_ASSIGNMENT and DATE is sometimes split, so we have to aggregate it.

In [None]:
#Select only basic attributes

#Turn a DATE string into its time slot, expressed in seconds since midnight
def get_time(date):
    """Return the time-of-day part of ``date`` as whole seconds since midnight.

    ``date`` must be a string shaped like 'YYYY-MM-DD HH:MM:SS.000'; any other
    value makes ``strptime`` raise.
    """
    parsed = dt.datetime.strptime(date, '%Y-%m-%d %H:%M:%S.000')
    return (parsed.hour * 60 + parsed.minute) * 60 + parsed.second

#Number the (French) weekday names from 0 (Lundi) to 6 (Dimanche)
day_to_num_dict = dict(zip(
    ['Lundi','Mardi','Mercredi','Jeudi','Vendredi','Samedi','Dimanche'],
    range(7)
))

#Derive the numeric TIME and DAY features, then keep only the columns the simple model uses
training_data['TIME'] = training_data['DATE'].apply(get_time)
training_data['DAY'] = training_data['DAY_WE_DS'].map(day_to_num_dict)
training_data_simple = training_data.loc[
    :, ['DATE','TIME','ASS_ASSIGNMENT','DAY','WEEK_END','CSPL_RECEIVED_CALLS']
]

#Aggregate the calls: rows sharing the same DATE / TIME / WEEK_END / DAY / ASS_ASSIGNMENT
#are summed into a single row.
#The former mid-cell `grouped.head(n=20)` was removed: it was not the last expression of
#the cell, so its result was never displayed.
grouped = (
    training_data_simple
    .groupby(['DATE','TIME','WEEK_END','DAY','ASS_ASSIGNMENT'])
    .sum()
    .reset_index()
    #Now that we aggregated the calls, the date is not relevant (only the time slot is)
    .drop('DATE', axis=1)
)

#Add one boolean indicator column per distinct ASS_ASSIGNMENT value
for assignment in pd.unique(grouped['ASS_ASSIGNMENT']):
    grouped['ASS_ASSIGNMENT_' + assignment] = grouped['ASS_ASSIGNMENT'].eq(assignment)

#Split the frame: X holds every column except the target and the raw ASS_ASSIGNMENT label,
#y holds the target (number of received calls)
cols = grouped.columns.drop(['CSPL_RECEIVED_CALLS', 'ASS_ASSIGNMENT']).tolist()
X = grouped.loc[:, cols]
y = grouped.loc[:, 'CSPL_RECEIVED_CALLS']

X.head(n=20)

In [None]:
#Now create the gradient boosting regressor
#random_state is pinned so that restarting the kernel and re-running gives the same model
est = GradientBoostingRegressor(n_estimators=100, random_state=0).fit(X, y)

#Plot training error (this is squared loss, which will be used to evaluate our performance in the leaderboard)
plt.plot(np.arange(est.n_estimators) + 1, est.train_score_, color='r')
plt.xlabel('Boosting iteration')
plt.ylabel('Training loss')
#The trailing semicolon hides the text repr of the last call in the notebook output
plt.title('Gradient boosting training error');

TODO: Use sklearn.model_selection.GridSearchCV (the old sklearn.grid_search module has been removed from scikit-learn) to find the best set of parameters.
TODO: Write code to make a first submission to the leaderboard