In [None]:
# Pandas : librairie de manipulation de données
# NumPy : librairie de calcul scientifique
# MatPlotLib : librairie de visualisation et graphiques
# SeaBorn : librairie de graphiques avancés
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score,auc, accuracy_score
from sklearn import ensemble
import xgboost as XGB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
# Public import path: the private `sklearn.neighbors.nearest_centroid` module
# is no longer importable in recent scikit-learn releases.
from sklearn.neighbors import NearestCentroid
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder




Fonction pour tracer les courbes d'apprentissage sur l'ensemble d'apprentissage et l'ensemble de validation :

In [None]:
from sklearn.model_selection import learning_curve
def plot_learning_curve(est, X_train, y_train) :
    """Plot the training and cross-validation scores of ``est`` against training-set size.

    Runs scikit-learn's ``learning_curve`` with 5-fold cross-validation on ten
    training-set fractions evenly spaced between 10% and 100%, then draws the
    mean score of each curve surrounded by a band of one standard deviation
    across the folds. The estimator's default scorer is used; the plot labels
    it as accuracy.

    Parameters
    ----------
    est : estimator object, passed unchanged to ``learning_curve``.
    X_train : feature matrix on which the cross-validation is run.
    y_train : target values aligned with ``X_train``.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=est,
        X=X_train,
        y=y_train,
        train_sizes=np.linspace(0.1, 1.0, 10),
        cv=5,
        n_jobs=-1,
    )
    # One row per training size, one column per CV fold: aggregate over folds.
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)

    plt.figure(figsize=(8,10))
    plt.plot(train_sizes, train_mean, color='blue', marker='o', markersize=5, label='training accuracy')
    plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
    plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s', markersize=5, label='validation accuracy')
    plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std, alpha=0.15, color='green')
    # Positional boolean instead of the `b=` keyword, which Matplotlib renamed
    # to `visible` and no longer accepts in recent releases.
    plt.grid(True)
    plt.xlabel('Number of training samples')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.ylim([0, 1.0])
    plt.show()

# Récupération des données et Préprocessing

In [None]:
# Load the StarCraft II replay dataset from the Kaggle input directory
df = pd.read_csv(filepath_or_buffer="../input/starcraft-ii-replay-analysis/starcraft.csv")

In [None]:
# List the column names of the freshly loaded frame
df.columns

La colonne GameID pourrait fausser nos résultats car elle n'apporte aucune information utile à la prédiction. On la retire, ainsi que la colonne MaxTimeStamp.

In [None]:
# Remove both unused columns in a single call
df = df.drop(columns=['GameID', 'MaxTimeStamp'])


On vérifie qu'il n'y a pas de valeur manquante

In [None]:
# Non-null count per column: a count lower than the others reveals missing values
df.count()

Il manque des valeurs pour les colonnes Age, HoursPerWeek et TotalHours. Cela pourrait s'expliquer par le fait que les joueurs peuvent mettre leur profil en privé, ce qui empêche la récupération de ces informations.

In [None]:
# Distribution of Age before imputation, on 80 bins
plt.hist(x=df['Age'], bins=80)

Pour chaque colonne, on remplace les NaN par des valeurs aléatoires tirées d'une loi normale ayant la même moyenne et le même écart-type que la colonne.

In [None]:
import math
def replace_na(df, col, rng=None):
    """Return a copy of ``df`` in which the missing values of ``col`` are imputed.

    Each missing entry is replaced by an independent draw from a normal
    distribution whose mean and standard deviation are those of the column
    (pandas skips missing values when computing them). The input frame is
    left untouched.

    The missing entries are located with a boolean mask rather than by
    looking up labels ``0..n-1``, so the function works with any index
    (for example after rows have been filtered out).

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    col : column label
        Column whose missing values are imputed.
    rng : optional
        Object exposing ``normal(loc, scale, size)``, e.g. a
        ``numpy.random.Generator``, for reproducible draws. Defaults to the
        global ``np.random`` state, as before.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``col`` imputed.
    """
    df1 = df.copy()
    missing = df1[col].isna()
    n_missing = int(missing.sum())
    if n_missing == 0:
        # Nothing to impute: return the untouched copy without drawing numbers.
        return df1

    m = df[col].mean()
    s = df[col].std()
    sampler = np.random if rng is None else rng
    # One vectorised draw for all missing entries instead of a per-row loop.
    df1.loc[missing, col] = sampler.normal(m, s, size=n_missing)
    return df1

In [None]:
# Impute the profile columns that contain missing values.
df = replace_na(df,'Age')
df = replace_na(df,'HoursPerWeek')
# Drop the impossible 1,000,000-hour record BEFORE imputing TotalHours:
# otherwise that single value inflates the mean and standard deviation used
# to draw the replacement values. Rows with a missing TotalHours are kept
# (NaN != 1000000 is True). The index is reset so that the rows stay
# labelled 0..n-1, which replace_na relies on.
df = df[df['TotalHours'] != 1000000].reset_index(drop=True)
df = replace_na(df,'TotalHours')

In [None]:
# Re-check the non-null count per column after imputation
df.count()

Lors de mes recherches bibliographiques, j'ai pu voir qu'un utilisateur avait remarqué qu'un joueur de 18 ans avait un nombre total d'heures (TotalHours) égal à 1 000 000. C'est parfaitement impossible ; j'ai donc décidé, comme lui, de supprimer cette valeur.

In [None]:
# Code from Jeff Lee: SC2 Data Analysis and League Index Prediction
# Keep every row whose TotalHours differs from the 1,000,000 outlier
is_outlier = df['TotalHours'] == 1000000
df = df[~is_outlier]
print(f'Remaining records in df= {len(df)}')

In [None]:
# Distribution of Age after imputation, on 80 bins
plt.hist(x=df['Age'], bins=80)

On peut voir pour Age que des valeurs aléatoires ont été ajoutées : certains joueurs se retrouvent sûrement avec un âge non entier, par exemple 23,5 ans.

In [None]:
# Number of rows per LeagueIndex value (class balance of the target)
df.LeagueIndex.value_counts()

## Recherche de corrélations

In [None]:
# Pairwise correlations, clustered on their absolute value
tabcorr = df.corr()
sns.clustermap(tabcorr.abs(), cmap="coolwarm")

In [None]:
# Code by Jeff Lee : SC2 Data Analysis and League Index Prediction

# Set figure style (note: this changes the Matplotlib style for later plots too)
plt.style.use('fivethirtyeight')

# Create figure
fig, axes = plt.subplots(nrows=1, ncols = 1, figsize = (14,10))
fig.suptitle('Attribute Relationships', fontsize=22, fontweight='bold')

# Compute the correlation matrix once and reuse it for the mask and the plot.
corr = df.corr()

# Boolean mask hiding the upper triangle (diagonal included) for a cleaner heatmap.
# The builtin `bool` replaces `np.bool`, which was removed from NumPy.
mask = np.triu(np.ones(corr.shape, dtype=bool))

# Create correlation matrix heatmap, drawn explicitly on the axes created above
r_matrix = corr.round(decimals=1)
sns.heatmap(r_matrix, mask=mask, square=True, cmap='coolwarm', linewidths=.5, annot=True, fmt='g',
            annot_kws={'size':10}, ax=axes)
axes.set_title('     Correlation Matrix\n')
plt.show()

In [None]:
# One box plot per column, grouped by LeagueIndex, each in its own figure
for feature in df.columns:
    plt.figure()
    sns.boxplot(data=df, x="LeagueIndex", y=feature)


# Premier essai : Prédiction de LeagueIndex

## Création des jeux d'apprentissage et de test

In [None]:
# Target: the league index; features: every other column
y = df['LeagueIndex']
X = df.drop(columns=['LeagueIndex'])

In [None]:

# Hold out 20% of the rows for testing; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

## Random Forests

In [None]:

# Fit a random forest with default hyper-parameters and score it on the test set
rf = ensemble.RandomForestClassifier()
rf.fit(X=X_train, y=y_train)
y_rf = rf.predict(X_test)
rf_score = accuracy_score(y_true=y_test, y_pred=y_rf)
print(rf_score)

In [None]:
# Horizontal bar chart of the random forest feature importances, smallest first.
importances = rf.feature_importances_
indices = np.argsort(importances)
# The importances follow the column order of the training features. `df` still
# contains the LeagueIndex target, so taking the labels from `df.columns`
# would attach the wrong name to the bars.
feature_names = X_train.columns
plt.figure(figsize=(12,8))
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), feature_names[indices])
plt.title('Importance des caracteristiques')

Le résultat est vraiment mauvais, comme je m'y attendais au vu de mes recherches bibliographiques. Nous allons néanmoins tester d'autres modèles, même si je ne m'attends pas à un meilleur résultat.

In [None]:
# Learning curve of the random forest, cross-validated on the full X, y
plot_learning_curve(rf, X, y)

## XGBoost

In [None]:
# Training.
# Recent XGBoost releases require class labels encoded as 0..n_classes-1,
# whereas LeagueIndex does not start at 0, so the target is encoded for
# fitting and the predictions are mapped back to the original labels.
label_encoder = LabelEncoder().fit(y_train)
xgb = XGB.XGBClassifier()
xgb.fit(X_train, label_encoder.transform(y_train))
# Prediction, expressed in the original label space
y_xgb = label_encoder.inverse_transform(xgb.predict(X_test))
# Score, stored under its own name so it does not overwrite the random forest score
xgb_score = accuracy_score(y_test, y_xgb)
print(xgb_score)

In [None]:
# Per-class precision, recall and F1 of the XGBoost predictions on the test set
print(classification_report(y_test, y_xgb))

In [None]:
# Learning curve of XGBoost on the full dataset. The target is encoded to
# 0..n_classes-1 because recent XGBoost releases reject other label values.
# NOTE(review): the smallest training subsets may still lack some classes — confirm.
plot_learning_curve(xgb, X, LabelEncoder().fit_transform(y))

## Régression logistique

In [None]:
# Logistic regression with default hyper-parameters, scored on the test set.
lr = LogisticRegression()
lr.fit(X_train,y_train)
y_lr = lr.predict(X_test)
# Stored under its own name so it does not overwrite the random forest score
lr_score = accuracy_score(y_test, y_lr)
print(lr_score)

In [None]:
# Learning curve of the logistic regression, cross-validated on the full X, y
plot_learning_curve(lr, X, y)

## Support Vector Machines

In [None]:
# Support vector classifier with default hyper-parameters.
clf = svm.SVC()
clf.fit(X_train, y_train)
y_clf = clf.predict(X_test)
cm = confusion_matrix(y_test, y_clf)
# Score the SVM's own predictions: the previous code scored `y_lr`, i.e. it
# printed the logistic regression accuracy a second time.
clf_score = accuracy_score(y_test, y_clf)
print(clf_score)
print(cm)

In [None]:
# Learning curve of the SVM classifier (`clf`), cross-validated on the full X, y
plot_learning_curve(clf, X, y)

## Nearest Centroid Classifier

In [None]:
# Nearest centroid classifier with default hyper-parameters.
clf = NearestCentroid()
clf.fit(X_train, y_train)
y_clf = clf.predict(X_test)
# Score this model's own predictions: the previous code scored `y_lr` (the
# logistic regression output) and stored the result in `rf_score`.
nc_score = accuracy_score(y_test, y_clf)
print(nc_score)

In [None]:
# Learning curve of the nearest centroid classifier (`clf`), cross-validated on the full X, y
plot_learning_curve(clf, X, y)

# Deuxième essai : Haute ou Basse Ligue 

In [None]:
def league_tiers(n) :
    """Map a league index to a coarse tier: 'LOW' when ``n <= 4``, otherwise 'HIGH'."""
    return 'LOW' if n <= 4 else 'HIGH'
# Build a new frame carrying the binary tier derived from LeagueIndex
df1 = df.assign(LeagueTiers=df['LeagueIndex'].map(league_tiers))

# Check mapping was applied correctly
df1[['LeagueIndex', 'LeagueTiers']].head(10)

In [None]:
# Non-null count per column, including the new LeagueTiers column
df1.count()

In [None]:
# Drop LeagueIndex, which is no longer needed now that LeagueTiers is the target
df1 = df1.drop(columns=['LeagueIndex'])

## Création des jeux d'apprentissage et de test

In [None]:
# Target: the LOW/HIGH tier; features: every other column
y = df1['LeagueTiers']
X = df1.drop(columns=['LeagueTiers'])
# Hold out 20% of the rows for testing; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)
print(y_train)

## Random Forest

In [None]:
from sklearn import ensemble
# Random forest with default hyper-parameters on the LOW/HIGH target
rf = ensemble.RandomForestClassifier()
rf.fit(X=X_train, y=y_train)
y_rf = rf.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_rf)
rf_score = accuracy_score(y_true=y_test, y_pred=y_rf)
print(rf_score)
print(cm)

In [None]:
# Learning curve of the random forest on the LOW/HIGH target, cross-validated on the full X, y
plot_learning_curve(rf, X, y)

## XGBoost

In [None]:
# Training.
# Recent XGBoost releases require class labels encoded as 0..n_classes-1,
# whereas the target here holds the strings 'LOW'/'HIGH', so it is encoded
# for fitting and the predictions are mapped back to the original labels.
label_encoder = LabelEncoder().fit(y_train)
xgb = XGB.XGBClassifier()
xgb.fit(X_train, label_encoder.transform(y_train))
# Prediction, expressed in the original label space
y_xgb = label_encoder.inverse_transform(xgb.predict(X_test))
# Score, stored under its own name so it does not overwrite the random forest score
xgb_score = accuracy_score(y_test, y_xgb)
print(xgb_score)

In [None]:
# Per-class precision, recall and F1 of the XGBoost predictions on the test set
print(classification_report(y_test, y_xgb))

In [None]:
# Learning curve of XGBoost on the full dataset. The string target is encoded
# to integers because recent XGBoost releases reject non 0..n_classes-1 labels.
plot_learning_curve(xgb, X, LabelEncoder().fit_transform(y))

## Régression logistique

In [None]:
# Logistic regression with default hyper-parameters, scored on the test set.
lr = LogisticRegression()
lr.fit(X_train,y_train)
y_lr = lr.predict(X_test)
# Stored under its own name so it does not overwrite the random forest score
lr_score = accuracy_score(y_test, y_lr)
print(lr_score)

In [None]:
# Per-class precision, recall and F1 of the logistic regression predictions on the test set
print(classification_report(y_test, y_lr))

In [None]:
# Learning curve of the logistic regression on the LOW/HIGH target, cross-validated on the full X, y
plot_learning_curve(lr, X, y)

## Support Vector Machines

In [None]:
# Support vector classifier with default hyper-parameters on the LOW/HIGH target
clf = svm.SVC()
clf.fit(X=X_train, y=y_train)
y_clf = clf.predict(X_test)
clf_score = accuracy_score(y_true=y_test, y_pred=y_clf)
cm = confusion_matrix(y_true=y_test, y_pred=y_clf)
print(clf_score)
print(cm)

In [None]:
# Per-class precision, recall and F1 of the SVM predictions on the test set
print(classification_report(y_test, y_clf))

In [None]:
# Learning curve of the SVM classifier (`clf`) on the LOW/HIGH target, cross-validated on the full X, y
plot_learning_curve(clf, X, y)

## Nearest Centroid Classifier

In [None]:
# Nearest centroid classifier with default hyper-parameters on the LOW/HIGH target.
clf = NearestCentroid()
clf.fit(X_train, y_train)
y_clf = clf.predict(X_test)
# Stored under its own name so it does not overwrite the random forest score
nc_score = accuracy_score(y_test, y_clf)
print(nc_score)

In [None]:
# Per-class precision, recall and F1 of the nearest centroid predictions on the test set
print(classification_report(y_test, y_clf))


In [None]:
# Learning curve of the nearest centroid classifier (`clf`) on the LOW/HIGH target
plot_learning_curve(clf, X, y)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default hyper-parameters on the LOW/HIGH target.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
# Predictions and score get model-specific names so they do not overwrite
# the nearest centroid predictions (`y_clf`) or the random forest score.
y_gnb = gnb.predict(X_test)
gnb_score = accuracy_score(y_test, y_gnb)
print(gnb_score)