In [31]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [14]:
# Loading all data bases 

branchements = pd.read_csv('../data_SUEZ/branchements.csv')
canalisations = pd.read_csv('../data_SUEZ/canalisations.csv')
evenements_train = pd.read_csv('../data_SUEZ/evenements_train.csv')
meteo_AlpesMaritimes = pd.read_csv('../data_SUEZ/meteo_AlpesMaritimes_pre_traite.csv')
nb_ev_train = pd.read_csv('../data_SUEZ/nb_ev_train.csv')
permut_train = pd.read_csv('../data_SUEZ/permut_train.csv')
stations_AlpesMaritimes = pd.read_csv('../data_SUEZ/stations_AlpesMaritimes.csv')

# Preprocessing

## Jointure

In [15]:
# Renaming columns

evenements_train.columns= ['GID_fuite', 'CODINSEE_fuite', 'date_fuite', 'GID_pose', 'COMPO_P', 'geometry_fuite']
branchements.columns= ['GID_pose', 'CODINSEE_pose', 'date_pose', 'DIAMETRE', 'DIAMEXT', 'DIAMINT', 'ETAGE',
       'GEORIENT', 'LONGCALC', 'MATERIAU', 'geometry_pose']
canalisations.columns= ['GID_pose', 'CODINSEE_pose', 'date_pose', 'DIAMETRE', 'DIAMEXT', 'DIAMINT', 'ETAGE',
       'GEORIENT', 'LONGCALC', 'MATERIAU', 'geometry_pose']

In [16]:
evenements_train['fuite'] = 1

In [17]:
# Différencier les GID des branchements et des canalisations

evenements_train['ajout_branch'] = '_b'
evenements_train['ajout_can'] = '_c'
evenements_train.ix[(evenements_train.COMPO_P == 'E_BRANCH'),'GID_pose'] = evenements_train.ix[(evenements_train.COMPO_P == 'E_BRANCH'),'GID_pose'].map(str) + evenements_train.ix[(evenements_train.COMPO_P == 'E_BRANCH'),'ajout_branch']
evenements_train.ix[(evenements_train.COMPO_P == 'E_TRONCO'),'GID_pose'] = evenements_train.ix[(evenements_train.COMPO_P == 'E_TRONCO'),'GID_pose'].map(str) + evenements_train.ix[(evenements_train.COMPO_P == 'E_TRONCO'),'ajout_can']
evenements_train.drop('ajout_branch', axis=1, inplace=True)
evenements_train.drop('ajout_can', axis=1, inplace=True)

branchements['ajout_branch'] = '_b'
branchements.GID_pose = branchements.GID_pose.map(str) + branchements.ajout_branch
branchements.drop('ajout_branch', axis=1, inplace=True)

canalisations['ajout_can'] = '_c'
canalisations.GID_pose = canalisations.GID_pose.map(str) + canalisations.ajout_can
canalisations.drop('ajout_can', axis=1, inplace=True)

In [18]:
# Merge elements on evenements (creation BDD complète)

elements = pd.concat([canalisations, branchements])
data = pd.merge(elements, evenements_train, on = 'GID_pose', how = 'left')

## Feature engineering

In [19]:
# Utiliser le code INSEE de la fuite quand on l'a plutot que celui de la pose
data.ix[~data.CODINSEE_fuite.isnull(), 'CODINSEE_pose'] = data.ix[~data.CODINSEE_fuite.isnull(), 'CODINSEE_fuite']
data.drop('CODINSEE_fuite', axis = 1, inplace = True)

In [20]:
# Convertir âge

def to_day(x):
    if (str(x) == 'NaT'):
        return np.nan
    else:
        return x.days

data['age'] = pd.to_datetime('2016-12-31') - pd.to_datetime(data['date_pose'])
data.age = data.age.map(to_day)

In [25]:
data.ix[data.fuite.isnull(), 'fuite'] = 0

In [32]:
# Encode categorical variables

le = LabelEncoder()
mapping = dict()
for col, dtype in zip(data.columns, data.dtypes):
    if dtype == 'object':
        data[col] = data[col].apply(lambda s: str(s))
        # Replace 0 and NaNs with unique label : 'None'
        data[col] = data[col].where(~data[col].isin(['0', 'nan']), 'None')
        data[col] = le.fit_transform(data[col])
        mapping[col] = dict(zip(le.inverse_transform(data[col].unique()), data[col].unique()))

In [60]:
# Replace NaNs with 0
data = data.apply(lambda x: x.fillna(x.mean()),axis=0)

# Training

## Séparation submit et jeu d'entrainement

In [94]:
# Séparer le dataset d'entrainment du dataset de test
test_insee = [6003, 6004, 6016, 6030, 6070, 6095, 6108, 6112, 6116, 6138, 6152]
data_known = data[~data['CODINSEE_pose'].isin(test_insee)]
submit = data[data['CODINSEE_pose'].isin(test_insee)]

In [95]:
# Remove unnecessary columns
fuite_columns = ['GID_fuite', 'date_fuite', 'geometry_fuite']
id_columns = ['GID_pose', 'CODINSEE_pose']
a_predir = ['fuite']
autres = ['GEORIENT', 'date_pose', 'geometry_pose', 'ETAGE', 'DIAMINT', 'DIAMEXT']
mask = id_columns + fuite_columns + a_predir + autres  # geometry_pose pas encore exploitable

X = data_known.drop(mask, axis = 1)
y = data_known['fuite']

X_submit = submit.drop(mask, axis = 1)

## Random Forest

In [96]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

clf.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

# Submitting

In [97]:
y_submit = clf.predict(X_submit)

In [98]:
sum(y_submit)

0.0

In [93]:
X.columns

Index([u'DIAMETRE', u'LONGCALC', u'MATERIAU', u'COMPO_P', u'age'], dtype='object')