In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Location of the maths-course CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/student-alcohol-consumption/student-mat.csv'

# `data` keeps the file exactly as read; `df` is the working copy used below.
data = pd.read_csv(DATA_PATH)
df = data.copy()

In [None]:
# Render at most 33 rows and at most 33 columns when pandas displays a frame.
pd.set_option('display.max_rows', 33)
pd.set_option('display.max_columns', 33)
df.head()

# Feature Engineering
## Création de la classe de notes à partir de la moyenne sur l'année

In [None]:
# Band the mean of the three grades into classes by floor-dividing by 7:
#   0 = mean between 0 and 7
#   1 = mean between 7 and 14
#   2 = mean between 14 and 21 (i.e. up to 20, the maximum grade)
grade_total = df['G1'] + df['G2'] + df['G3']
df['Notes'] = (grade_total / 3) // 7

In [None]:
# Students in the lowest grade band (Notes == 0) and in the highest one (Notes == 2).
Mauvais = df.loc[df['Notes'].eq(0)]
Excellent = df.loc[df['Notes'].eq(2)]

# Création des Sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
trainset, testset = train_test_split(df, random_state=0, test_size=0.2)

In [None]:
# Class balance of the target in the training set, then in the test set.
for subset in (trainset, testset):
    print(subset['Notes'].value_counts())

# Encodage

In [None]:
def encodage(df):
    """Return a copy of ``df`` with its object-dtype columns mapped to integers.

    Every column selected by ``select_dtypes('object')`` is passed through the
    lookup table below with ``Series.map``. A value that is absent from the
    table therefore becomes NaN.

    The input frame is left untouched. The previous version wrote into the
    caller's frame with ``df.loc[:, col] = ...``, which modified the slice
    produced by the train/test split and could leave the encoded columns as
    object dtype, so encoding the same frame a second time turned every value
    into NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose text columns use the labels listed in ``code``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index; text columns are replaced
        by their integer codes.
    """
    # A single table is shared by all text columns, so each label has exactly
    # one code no matter which column it appears in.
    code = {'GP':1,
            'MS':0,

            'M':1,
            'F':0,

            'U':1,
            'R':0,

            'GT3':1,
            'LE3':0,

            'A':1,
            'T':0,

            'at_home':4,
            'health':3,
            'services':2,
            'teacher':1,

            # 'other' appears on its own: presumably shared by several
            # columns -- TODO confirm against the dataset description.
            'other':0,

            'course':1,
            'home':2,
            'reputation':3,

            'mother':2,
            'father':1,

            'yes':1,
            'no':0

           }

    # Work on a private copy so the caller's frame is never modified.
    df = df.copy()
    for col in df.select_dtypes('object'):
        # Rebinding the column (rather than writing through .loc) lets the
        # column take the dtype of the mapped values.
        df[col] = df[col].map(code)

    return df

In [None]:
def feature_engineering(df):
    """Replace the three grade columns with a single banded target ``Notes``.

    ``Notes`` is the mean of ``G1``, ``G2`` and ``G3`` floor-divided by 7:

    * 0 = mean between 0 and 7
    * 1 = mean between 7 and 14
    * 2 = mean between 14 and 21 (i.e. up to 20, the maximum grade)

    The input frame is not modified. The previous version assigned ``Notes``
    on the caller's frame before dropping the grade columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``G1``, ``G2`` and ``G3``.

    Returns
    -------
    pandas.DataFrame
        New frame without ``G1``/``G2``/``G3`` and with a ``Notes`` column.
        An existing ``Notes`` column is overwritten in place; otherwise the
        column is appended.
    """
    # Explicit addition (not a skip-NaN row sum) so a missing grade still
    # yields a missing Notes value.
    notes = ((df['G1'] + df['G2'] + df['G3']) / 3) // 7
    return df.drop(columns=['G1', 'G2', 'G3']).assign(Notes=notes)

In [None]:
def imputation(df):
    """Handle missing data by discarding every row that has a missing value.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that contain no missing value.
    """
    return df.dropna(axis='index', how='any')

In [None]:
def preprocessing(df):
    """Run the full preparation chain and split the result into features and target.

    The frame goes through ``encodage``, then ``feature_engineering``, then
    ``imputation``. The class counts of the target are printed before returning.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw subset (for example the train or the test split).

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` is the prepared frame without ``Notes`` and ``y`` is
        the ``Notes`` column.
    """
    prepared = imputation(feature_engineering(encodage(df)))

    y = prepared['Notes']
    X = prepared.drop(columns='Notes')

    print(y.value_counts())

    return X, y

In [None]:
# Prepare the training subset first, then the test subset, with the same pipeline.
(X_train, y_train), (X_test, y_test) = map(preprocessing, (trainset, testset))

## Modélisation

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA

In [None]:
# Random forest with a fixed seed so repeated runs build the same trees.
model = RandomForestClassifier(random_state=0)

## Procédure d'évaluation du modèle (identique pour chaque modèle à tester)

In [None]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.model_selection import learning_curve

In [None]:
def evaluation(model, scoring='f1_macro'):
    """Fit ``model``, report its test performance and plot its learning curve.

    The model is fitted on the notebook-level ``X_train``/``y_train`` and
    scored on ``X_test``/``y_test``. The accuracy, the confusion matrix and
    the classification report are printed, then a learning curve (4-fold
    cross-validation on the training set, ten training sizes from 10% to
    100%) is drawn.

    Parameters
    ----------
    model : estimator
        Any scikit-learn classifier; it is fitted in place.
    scoring : str or callable, default 'f1_macro'
        Scorer used for the learning curve. The plain ``'f1'`` scorer only
        supports binary targets, whereas ``Notes`` takes the values 0, 1 and
        2. With ``'f1'`` no fold can be scored and the curve comes out as
        NaN, so a multiclass average is used by default.
    """
    model.fit(X_train, y_train)
    ypred = model.predict(X_test)

    print(model.score(X_test, y_test))
    print(confusion_matrix(y_test, ypred))
    print(classification_report(y_test, ypred))

    N, train_score, val_score = learning_curve(model, X_train, y_train,
                                               cv=4, scoring=scoring,
                                               train_sizes=np.linspace(0.1, 1, 10))
    plt.figure(figsize=(12, 8))
    # Each score array has one column per CV fold; average over the folds.
    plt.plot(N, train_score.mean(axis=1), label='train score')
    plt.plot(N, val_score.mean(axis=1), label='validation score')
    plt.xlabel('training set size')
    plt.ylabel('score')
    plt.legend()

In [None]:
# Fit the model, print its test metrics and draw its learning curve.
evaluation(model)

In [None]:
# One bar per feature, showing the importance reported by the fitted model.
importances = pd.DataFrame(model.feature_importances_, index=X_train.columns)
importances.plot.bar(figsize=(12, 8))