In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Let matplotlib understand pandas date/time types, and make every figure
# 12x5 inches unless a cell asks for something else.
pd.plotting.register_matplotlib_converters()
plt.rcParams["figure.figsize"] = (12, 5)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the read-only Kaggle input directory top-down and print the full path
# of every file found, one per line.
input_files = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Chargement des jeux de données

In [None]:
# Load the four competition files, in this order:
#   train   - labelled training data
#   sample  - example submission file
#   test    - example test set
#   feature - feature metadata
train, sample, test, feature = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/jane-street-market-prediction/train.csv",
        "/kaggle/input/jane-street-market-prediction/example_sample_submission.csv",
        "/kaggle/input/jane-street-market-prediction/example_test.csv",
        "/kaggle/input/jane-street-market-prediction/features.csv",
    )
)

## Ajout de la colonne Action 
Pour ajouter cette colonne, il faut récupérer les informations des colonnes ['resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp']. Si ces cinq valeurs sont toutes strictement positives, alors la valeur de la nouvelle colonne ['action'] est fixée à 1 ; sinon elle vaut 0.

In [None]:
# action = 1 only when every response column is strictly positive, else 0.
# (A NaN compares as "not > 0", so a row with a missing response gets 0.)
resp_columns = ['resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp']
train['action'] = (train[resp_columns] > 0).all(axis=1).astype('int')


In [None]:
# Work on the first 10 % of the rows only (no shuffling: the original row
# order is kept), then display the resulting (rows, columns) shape.
size = int(0.1 * len(train))
train_reduct = train.head(size)
train_reduct.shape

## Visualisation de la répartition des classes 
La colonne ['action'] est la variable à prédire. Il est intéressant d'observer la répartition de ses classes.
Si la proportion des classes est déséquilibrée alors le résultat de la prédiction peut être biaisé.

In [None]:
# Number of rows per class of the target, computed on the full training set.
action_counts = train["action"].value_counts()
action_counts

In [None]:
# Share (in %) of each target class inside the reduced training set,
# shown as a two-bar chart: index 0 -> class 0, index 1 -> class 1.
n_rows = len(train_reduct)
list_pct = [
    int((train_reduct["action"] == label).sum()) / n_rows * 100
    for label in range(2)
]
plt.bar(['0', '1'], list_pct)

## Remplacement des valeurs manquantes
Cette étape est importante car la valeur choisie pour le remplacement peut modifier le résultat de prédiction.


In [None]:
# Build a NaN-free copy of the reduced training set: every missing value
# becomes 0. `train_reduct` itself is left untouched.
train_r = train_reduct.fillna(0)

## Etude des liens entre les variables
Cette partie permet de visualiser les liens linéaires entre les variables explicatives et la variable cible

In [None]:
# Names of all columns whose label contains the substring 'feature'.
is_feature_col = train.columns.str.contains('feature', regex=False)
col_features = train.columns[is_feature_col].tolist()

In [None]:
# Correlation of each explanatory variable with the target, computed on the
# NaN-free reduced set. `corr` is a Series indexed by feature name.
var_target = train_r.loc[:, "action"]
var_exp = train_r.loc[:, col_features]
corr = var_exp.corrwith(var_target)

In [None]:
# One bar per feature: its correlation with the target.
corr.plot(
    kind="bar",
    figsize=(30, 15),
    title="Correlation",
    fontsize=20,
    rot=90,
    grid=True,
)

## Importance des variables
L'analyse en composantes principales (ACP) permet d'observer le nombre de variables apportant de l'information.

In [None]:

# Manual principal component analysis of the explanatory variables.
# Outputs used by later cells: `nbr_exp` (number of features) and `var_exp`
# (percentage of variance explained by each component, largest first).
from sklearn.preprocessing import StandardScaler

# Read the features straight from `train_r` instead of going through
# `var_exp`: this cell rebinds `var_exp` to a list at the end, so reading
# `var_exp.values` here would fail the second time the cell is run.
x_exp = train_r[col_features].values
x_aexp = train_r["action"].values
mean = np.mean(x_exp, axis=0)

nbr_exp = np.shape(x_exp)[1]

# Standardise the data, and build the covariance matrix from the standardised
# data. Previously X_std was computed but the eigen-decomposition was run on
# the covariance of the raw features, so the scaling had no effect.
X_std = StandardScaler().fit_transform(x_exp)
centered = X_std - X_std.mean(axis=0)
cov_matrix = centered.T.dot(centered) / (X_std.shape[0] - 1)

# A covariance matrix is symmetric: eigh guarantees real eigenvalues and
# returns them in ascending order, so reverse to get the largest first.
eig_vals, eig_vecs = np.linalg.eigh(cov_matrix)
eig_vals, eig_vecs = eig_vals[::-1], eig_vecs[:, ::-1]
# Rounding can leave tiny negative eigenvalues; a variance cannot be negative.
eig_vals = np.clip(eig_vals, 0, None)

# (eigenvalue, eigenvector) pairs, sorted by decreasing eigenvalue.
eig_pairs = [(eig_vals[i], eig_vecs[:, i]) for i in range(len(eig_vals))]
tot = eig_vals.sum()
# Explained variance of each component, in percent, in the same order as
# eig_pairs.
var_exp = (eig_vals / tot * 100).tolist()


In [None]:
# Bar chart of the variance explained by each principal component.
fig, ax = plt.subplots(figsize=(15, 5))
ax.bar(range(nbr_exp), var_exp, alpha=0.5, align='center',
       label='Variance expliquée')
ax.set_xlabel('Composantes principales')
ax.set_ylabel('Variance expliquée ratio')
ax.legend(loc='best')
fig.tight_layout()


## Préparation à la prédiction
Il faut dans un premier temps séparer le jeu de données en deux parties : un jeu d'entraînement et un jeu de test.
Ensuite, il faut ajuster la normalisation sur le jeu d'entraînement uniquement, puis appliquer cette même normalisation au jeu de test.

In [None]:
# Split the NaN-free reduced set into a training part (first 70 % of the
# rows) and a test part (remaining rows), then min-max scale the features.
from sklearn.preprocessing import MinMaxScaler

pourc_train = 0.7
col_target = "action"
col_feature = [c for c in train_reduct.columns if 'feature' in c]


# X: matrix of explanatory variables
X = train_r[col_feature]
# Y: target as a column vector of shape (n_rows, 1)
Y = np.array(train_r[col_target]).reshape(-1,1)

# Ordered split (no shuffling): the first `size` rows train, the rest test.
size = int(len(train_r)*pourc_train)
X_train, Y_train = X.iloc[:size], Y[:size]
X_test, Y_test = X.iloc[size:], Y[size:]

# Scaling: the scaler is fitted on the training features only and then
# applied unchanged to the test features.
# Previously the same scaler was re-fitted on Y_train before transforming
# X_test, so scaler.transform(X_test) ran against a one-feature scaler and
# failed on the feature-count mismatch.
scaler = MinMaxScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)

# The target holds 0/1 class labels and must not be rescaled; the *_scale
# names are kept as plain aliases for the cells that still refer to them.
Y_train_scale, Y_test_scale = Y_train, Y_test

## Modèle de prédiction
Quatre modèles vont être mis en concurrence pour la classification : Random Forest, régression logistique, SVM et un modèle à base de réseau de neurones récurrent.

In [None]:
# Fit three classifiers on the training split and predict the test split.
# Outputs: pred_rfc, pred_rlog, pred_svm (one predicted label per test row).
# NOTE(review): the models are trained on the unscaled X_train / X_test;
# the min-max scaled X_train_scale is not used here - confirm this is intended.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.metrics import mean_squared_error

# Estimators expect a 1-D target; passing the (n, 1) column vector triggers
# a DataConversionWarning on every fit.
y_train_1d = np.ravel(Y_train)

model_rfc = RandomForestClassifier(max_depth=2, random_state=0)
model_rfc.fit(X_train, y_train_1d)
pred_rfc = model_rfc.predict(X_test)

# Fitted once (it used to be fitted twice in a row). `multi_class='ovr'` is
# dropped: it is deprecated and makes no difference for a binary target.
model_rlog = LogisticRegression(random_state=0, solver='lbfgs')
model_rlog.fit(X_train, y_train_1d)
pred_rlog = model_rlog.predict(X_test)

# Seeded like the other two models so that re-runs give the same result.
model_svm = svm.LinearSVC(random_state=0)
model_svm.fit(X_train, y_train_1d)
pred_svm = model_svm.predict(X_test)

## Evaluation de la performance

In [None]:
pip install termtables


In [None]:
# Confusion-matrix summary of the random forest predictions on the test
# split, printed as a table of percentages of the test rows.
from sklearn.metrics import confusion_matrix
import termtables

y_pred = pred_rfc
# Ground truth is the raw 0/1 labels of the test split (not a scaler output).
y_true = Y_test.ravel()
print("Matrice de confusion:")
# labels=[0, 1] forces a 2x2 matrix even if one class is absent from both
# y_true and y_pred, so the four-way unpacking below cannot fail.
tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
n_eval = len(y_true)
# The table columns group rows by PREDICTED class: predicted 0 is either a
# true negative (correct) or a false negative (wrong); predicted 1 is either
# a true positive (correct) or a false positive (wrong).
val_0_b = np.round(tn/n_eval*100,2)
val_0_m = np.round(fn/n_eval*100,2)
val_1_b = np.round(tp/n_eval*100,2)
val_1_m = np.round(fp/n_eval*100,2)
header = [" ", "Classe 0", "Classe 1"]
data = [
    ["Bien classé (%)", val_0_b, val_1_b],
    ["Mal classé (%)", val_0_m , val_1_m]]

table = termtables.to_string(data, header=header)
print(table)

In [None]:
import statsmodels.api as sm

# Ljung-Box test on feature_3 of the reduced training set, at lag 10;
# the result is returned (and displayed) as a DataFrame.
feature_3_series = train_reduct["feature_3"]
sm.stats.acorr_ljungbox(feature_3_series, lags=[10], return_df=True)

Vérification de la distribution des variables explicatives

In [None]:
# Side-by-side box plots of feature_31 and feature_32 (reduced training set).
train_reduct.boxplot(column=["feature_31", "feature_32"])