### Objectif: 
Prédire le trouble de sommeil chez les élèves.

In [22]:
# installations
# ! pip install pandas matplotlib seaborn scikit-learn 
# imports
import pickle
from pathlib import Path
from zipfile import ZipFile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


In [23]:
# charger les données

def get_data(file_name, csv_path="Health and Sleep relation 2024/Sleep_health_and_lifestyle_dataset.csv"):
    """Extract the dataset archive and load the modelling columns.

    The archive is extracted into the current working directory, then the CSV
    found at ``csv_path`` (relative to the working directory) is read.

    Args:
        file_name: Path to the zip archive containing the dataset.
        csv_path: Location of the CSV once the archive has been extracted.
            Defaults to the path used by the original dataset archive.

    Returns:
        pandas.DataFrame restricted to the feature columns and the
        "Sleep Disorder" target column, in a fixed order.

    Raises:
        KeyError: If one of the expected columns is missing from the CSV.
    """
    selected_columns = [
        'Gender', 'Age', 'Occupation', 'Sleep Duration',
        'Quality of Sleep', 'Physical Activity Level', 'Stress Level',
        'BMI Category', 'Blood Pressure', 'Heart Rate', 'Daily Steps',
        'Sleep Disorder',
    ]

    # `archive` rather than `zip`, so the builtin zip() is not shadowed.
    with ZipFile(file_name, 'r') as archive:
        archive.extractall()
        print('Done!')

    df = pd.read_csv(csv_path)
    return df[selected_columns]

### Transformations

In [24]:
# fonction de transformation des données
def transform_data(data):
    """Clean and encode the raw sleep-health frame for modelling.

    Steps, in order: drop rows containing missing values, drop duplicated
    rows, encode the categorical columns as integers, replace the
    "Blood Pressure" string ("a/b") by the rounded ratio a / b, and rename
    the columns to French snake_case names.

    Args:
        data: DataFrame holding the columns selected by ``get_data``.
            It is not modified.

    Returns:
        A new DataFrame with the columns ``sexe``, ``age``, ``travail``,
        ``duree_sommeil``, ``qualite_sommeil``, ``activite_physique``,
        ``niveau_stress``, ``imc_categorie``, ``frequence_cardiaque``,
        ``pas_quotidiens``, ``trouble_sommeil`` and ``pression_arterielle``.
    """
    # NOTE(review): recent pandas versions parse the literal string "None" as
    # a missing value when reading a CSV with default settings. If the raw
    # file uses "None" for "no sleep disorder", this dropna discards every
    # subject without a disorder, and the target below then separates
    # "Sleep Apnea" from the other diagnosed rows only -- confirm intended.
    # The trailing copy() keeps the column assignments below from touching
    # (or warning about) the caller's frame.
    df = data.dropna().drop_duplicates().copy()

    # Binary encodings: 1 for "Male" / "Sleep Apnea", 0 for anything else.
    df["Gender"] = df["Gender"].eq("Male").astype("int64")
    df["Sleep Disorder"] = df["Sleep Disorder"].eq("Sleep Apnea").astype("int64")

    # Integer codes following the sorted order of the labels (the same codes
    # a per-column label encoder would produce), without relying on an
    # encoder object meant for target labels.
    for col in ("BMI Category", "Occupation"):
        df[col] = pd.factorize(df[col], sort=True)[0]

    # "a/b" -> a / b rounded to 2 decimals; the two parts are kept in a local
    # frame rather than in temporary columns.
    pressure_parts = df["Blood Pressure"].str.split(pat="/", n=1, expand=True).astype(float)
    df["Blood Pressure (%)"] = (pressure_parts[0] / pressure_parts[1]).round(2)
    df = df.drop(columns=["Blood Pressure"])

    # Translate the column names to French.
    return df.rename(columns={
        "Gender": "sexe",
        "Age": "age",
        "Occupation": "travail",
        "Sleep Duration": "duree_sommeil",
        "Quality of Sleep": "qualite_sommeil",
        "Physical Activity Level": "activite_physique",
        "Stress Level": "niveau_stress",
        "BMI Category": "imc_categorie",
        "Heart Rate": "frequence_cardiaque",
        "Daily Steps": "pas_quotidiens",
        "Sleep Disorder": "trouble_sommeil",
        "Blood Pressure (%)": "pression_arterielle"
    })

In [25]:
# Affichage de la distribution
def analyse_distribution(cleaned_data, columns=("sexe", "trouble_sommeil", "imc_categorie", "travail", "pression_arterielle")):
    """Show one bar chart of value counts per selected column.

    Args:
        cleaned_data: DataFrame using the French column names produced by
            ``transform_data``. It is only read, never modified.
        columns: Names of the columns to plot; defaults to the five columns
            originally inspected.

    Returns:
        None. Each figure is displayed with ``plt.show()``.
    """
    for col in columns:
        # Explicit figure/axes so each chart carries its own title and labels
        # and can be closed once it has been shown.
        fig, ax = plt.subplots()
        cleaned_data[col].value_counts().plot(kind="bar", ax=ax)
        ax.set(title=f"Distribution de {col}", xlabel=col, ylabel="effectif")
        plt.show()
        plt.close(fig)

### Modélisation

In [26]:
# fonction de construction du modèle
def build_model(cleaned_data, test_size=0.2, random_state=42):
    """Train a ridge classifier on the cleaned data and score it.

    The frame is split into a train and a test part, the features are
    standardised with statistics computed on the train part only, a
    ``RidgeClassifier`` is fitted, and its accuracy is measured on the test
    part.

    Args:
        cleaned_data: DataFrame containing the target column
            ``trouble_sommeil``; every other column is used as a feature.
            It is not modified.
        test_size: Share of the rows held out for evaluation.
        random_state: Seed passed to ``train_test_split`` so the split is
            reproducible.

    Returns:
        Tuple ``(model, score, scaler)``: the fitted classifier, the test
        accuracy formatted as a percentage string (value rounded to two
        decimals followed by "%"), and the fitted ``StandardScaler``.
    """
    # Separate the features from the target.
    x = cleaned_data.drop(columns=["trouble_sommeil"])
    y = cleaned_data["trouble_sommeil"]

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )

    # Fit the scaler on the training rows only, then reuse it on the test
    # rows, so no test statistics leak into training.
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train)
    x_test_scaled = scaler.transform(x_test)

    model = RidgeClassifier().fit(x_train_scaled, y_train)

    y_pred = model.predict(x_test_scaled)

    # accuracy_score expects (y_true, y_pred).
    accuracy = accuracy_score(y_test, y_pred)
    score = f"{round(accuracy*100, 2)}%"

    return model, score, scaler


### Sauvegarder le modèle

In [27]:
# Run the pipeline end to end: load the raw data from the archive, clean and
# encode it, then train the model. `score` and `scaler` are kept next to
# `model` because all three are pickled in the saving cell.
raw_data = get_data("data.zip")
cleaned_data = transform_data(raw_data)
model, score, scaler = build_model(cleaned_data)

Done!


In [28]:
# Save the trained model, the fitted scaler and the formatted score so they
# can be reloaded later without retraining.
MODEL_DIR = Path("model_files")
# Create the output folder if needed so a fresh checkout does not fail with
# FileNotFoundError on the first open().
MODEL_DIR.mkdir(parents=True, exist_ok=True)

artifacts = {
    "saved_model.pkl": model,
    "scaler.pkl": scaler,
    "score.pkl": score,
}
for artifact_name, artifact in artifacts.items():
    with open(MODEL_DIR / artifact_name, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)