In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset from df.csv and preview its first five rows.
df = pd.read_csv("df.csv")
df.head()

In [None]:
# Replace the original column headers with short names (positional, so the
# list must contain exactly one name per column of df).
new_columns = [
    'Time', 'Gender', 'Age', 'Major', 'Year', 'CGPA',
    'Marriage', 'Depression', 'Anxiety', 'Panic', 'Treatment',
]
df.columns = new_columns
df.head()

In [None]:
# Summary statistics of df's columns.
df.describe()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Report missing values per column before and after dropping incomplete rows.
# Only the last expression of a cell is displayed, so the "before" counts
# have to be printed explicitly or they are silently discarded.
print("Missing values per column before dropna:")
print(df.isna().sum())
df.dropna(inplace=True)
df.isna().sum()

# EDA - Exploratory Data Analysis

In [None]:
# Histogram of the Age column, labelled through the Axes returned by pandas.
age_ax = df['Age'].hist()
age_ax.set_title('Age distribution of participants')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Frequency')

In [None]:
# Age distribution split by the Depression answer.
box_ax = sns.boxplot(x='Depression', y='Age', data=df)
box_ax.set_title('Depression by Age')

In [None]:
# Sort the CGPA values first so that they are plotted in sorted order.
cgpa_sorted = df['CGPA'].sort_values()
sns.histplot(cgpa_sorted, kde=True)
plt.title('CGPA distribution')

In [None]:
# Two side-by-side count plots drawn on explicitly created axes.
fig, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(8, 6))

# Left panel: Marriage counts, split by the Anxiety answer.
sns.countplot(data=df, x='Marriage', hue='Anxiety', ax=left_ax)
left_ax.set_title('Marriage frequency')

# Right panel: Depression counts, split by the Marriage answer.
sns.countplot(data=df, x='Depression', hue='Marriage', ax=right_ax)
right_ax.set_title("Depression frequency")

In [None]:
# Preview the first rows of df.
df.head()

# Data preprocessing
* Clean the data
* Turn categorical data into numerical data
* Normalize the data

In [None]:
# Preview the first rows of df before the columns are encoded.
df.head()

In [None]:
# Columns whose values are converted to 1 ('Yes') / 0 (anything else) below.
cols = ['Marriage','Depression','Panic','Anxiety','Treatment']

def check(x):
    """Return 1 when ``x`` equals the string 'Yes', otherwise 0."""
    return 1 if x == 'Yes' else 0

# Encode every Yes/No column as 1/0 using the `check` helper defined above
# instead of repeating its logic in a lambda.
for col in cols:
    # Skip columns that are already numeric: re-running this cell on encoded
    # data would otherwise compare 0/1 with 'Yes' and overwrite everything
    # with 0.
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].apply(check)

In [None]:
# Turn the Year labels into integers by extracting the digits at the end of
# each value. Unlike taking only the last character, this tolerates trailing
# whitespace and multi-digit years, and it is safe to re-run on a column that
# has already been converted to int.
df['Year'] = (
    df['Year']
    .astype(str)
    .str.extract(r'(\d+)\s*$', expand=False)
    .astype(int)
)
df

In [None]:
# Distinct raw CGPA values present in the column.
df['CGPA'].unique()

In [None]:
# Frequency of each CGPA value, rarest first. Printed explicitly because this
# is not the last expression of the cell and would otherwise not be shown.
print(df['CGPA'].value_counts().sort_values())

def change_cgpa(x):
    """Map a CGPA range label to an ordinal code between 1 and 5.

    Parameters
    ----------
    x : object
        Raw CGPA value. Strings are matched after stripping leading and
        trailing whitespace, so every band tolerates stray spaces (not only
        the '3.50 - 4.00 ' variant).

    Returns
    -------
    int
        5 for '3.50 - 4.00', 4 for '3.00 - 3.49', 3 for '2.50 - 2.99',
        2 for '2.00 - 2.49', and 1 for any other value (including
        non-string input).
    """
    bands = {
        '3.50 - 4.00': 5,
        '3.00 - 3.49': 4,
        '2.50 - 2.99': 3,
        '2.00 - 2.49': 2,
    }
    # Non-string values can never equal one of the labels, so they fall into
    # the catch-all band.
    if not isinstance(x, str):
        return 1
    return bands.get(x.strip(), 1)

# Replace every CGPA label with its ordinal code.
df['CGPA'] = df['CGPA'].map(change_cgpa)

In [None]:
# Preview the first rows of df after the CGPA encoding.
df.head()

In [None]:
# Encode Gender: 0 for the exact value 'Female', 1 for every other value.
df['Gender'] = df['Gender'].ne('Female').astype('int64')

In [None]:
# Frequency of each distinct Major value.
df['Major'].value_counts()

In [None]:
from sklearn.preprocessing import  LabelEncoder
# Encode each distinct Major value as an integer code.
# The encoded array is assigned straight back to the column instead of being
# wrapped in a new DataFrame first: a new DataFrame would get a fresh
# 0..n-1 index and be aligned on it during assignment.
# NOTE(review): the original note says df's index skips an entry around 42
# (presumably a row removed by the earlier dropna) - confirm.
le = LabelEncoder().fit(df['Major'])
df['Major'] = le.transform(df['Major'])

In [None]:
# df['Time'][1][0:8].split('/')
# df['Day'] = df['Time'].apply(lambda x:x[0:8].split('/')[0])
# df['Month'] = df['Time'].apply(lambda x:x[0:8].split('/')[1])
# df['Date_Year'] = df['Time'].apply(lambda x:x[0:8].split('/')[2])

In [None]:
# Remove the Time column from df in place.
df.drop(columns='Time', inplace=True)

In [None]:
# Preview the first rows of df after the Time column has been dropped.
df.head()

#### Our target is depression

In [None]:
# Splitting the data
from sklearn.model_selection import train_test_split

# Features are every column except the target; both are taken as NumPy arrays.
X = df.drop(columns=['Depression']).to_numpy()
y = df['Depression'].to_numpy()

X.shape, y.shape


In [None]:
# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on the training rows only and reuse it for the test
# rows. The scaled values are left as floats: casting them to int would
# truncate almost every value to 0 and wipe out the features.
scaler = MinMaxScaler()
norm_X_train = scaler.fit_transform(X_train)
norm_X_test = scaler.transform(X_test)


# Testing Different models

In [None]:
from sklearn.preprocessing import StandardScaler
# Rebuild the target and the feature matrix from df, standardizing the features.
# NOTE(review): the scaler is fitted on every row here, before the train/test
# split performed in the next cell, so the test rows contribute to the fitted
# mean and variance.
y = df['Depression']
std = StandardScaler()
X = std.fit_transform(df.drop(columns=['Depression']))

In [None]:
# Hold out 25% of the standardized rows for testing (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)
# Filled by test_model: one name and one accuracy per evaluated model.
model_name, model_accuracy = [], []
def test_model(model, name):
    """Fit ``model``, report its test accuracy and plot its confusion matrix.

    The function reads the module-level ``X_train``, ``X_test``, ``y_train``
    and ``y_test`` and appends its result to the module-level ``model_name``
    and ``model_accuracy`` lists. ``accuracy_score``, ``confusion_matrix`` and
    ``ConfusionMatrixDisplay`` must have been imported before the first call
    (in this notebook that happens in a later cell than this definition).

    Parameters
    ----------
    model : estimator
        Unfitted classifier exposing ``fit``, ``predict`` and ``classes_``.
    name : str
        Label recorded in ``model_name`` for this model.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Ground truth first, predictions second, as the metric's signature
    # expects (and as the confusion matrix call below already does).
    accuracy = accuracy_score(y_test, y_pred)
    model_name.append(name)
    model_accuracy.append(accuracy)
    cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
    print(accuracy)
    disp.plot()
    plt.show()

# 1. Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay


# A random forest is stochastic; seed it so that the reported accuracy and
# confusion matrix are the same on every run of the notebook.
model = RandomForestClassifier(n_estimators=5, random_state=42)
test_model(model, "Random Forest Classifier")

# 2. Support vector machine

In [None]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier with C=1.2, evaluated via test_model.
model = SVC(C=1.2, kernel='rbf')
test_model(model, "Support Vector Machine")

# 3. Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with a fixed random_state, evaluated via test_model.
model = DecisionTreeClassifier(random_state = 1)
test_model(model, "Decision Tree Classifier")

# 4. K-Nearest Neighbors classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier using 5 neighbours, evaluated via test_model.
model = KNeighborsClassifier(n_neighbors=5)
test_model(model, "KNeighbors Classifier")

# 5. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, evaluated via test_model.
# Stored under its own name, so the global `model` is not rebound here.
model_logistic = LogisticRegression()
test_model(model_logistic, "Logistic Regression")

# Plotting accuracies of different models

In [None]:
# Bar chart of the accuracies collected by test_model, one bar per model.
fig, acc_ax = plt.subplots(figsize=(20, 5))
acc_ax.bar(model_name, model_accuracy, width=0.2, color='green')
acc_ax.set_xlabel("Models", fontsize=15)
acc_ax.set_ylabel("accuracy", fontsize=15)
plt.show()

In [None]:
pip install onnx

In [None]:
pip install protobuf==3.20.2

In [None]:
pip install skl2onnx

In [None]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Your data preprocessing and splitting code here...

# Define a function to evaluate and log the model
def evaluate_and_log(model, model_name, version, params):
    """Fit ``model``, evaluate it on the test split and record an MLflow run.

    The function reads the module-level ``X_train``, ``X_test``, ``y_train``
    and ``y_test``. Inside a new MLflow run it logs ``params``, the metrics
    ``eval_acc`` and ``auc_score``, a confusion-matrix image
    (``conf_matrix.png``, written to the working directory), the fitted model
    and a ``version`` tag.

    Parameters
    ----------
    model : estimator
        Unfitted classifier exposing ``fit`` and ``predict``.
    model_name : str
        Artifact path under which the fitted model is logged.
    version : str
        Value stored in the run's ``version`` tag.
    params : dict
        Parameters logged with ``mlflow.log_params``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    eval_acc = accuracy_score(y_test, y_pred)

    # ROC AUC measures how well the model ranks samples, so it needs
    # continuous scores. Hard 0/1 predictions are only used as a fallback
    # when the model exposes no usable score.
    y_score = _positive_class_scores(model, X_test)
    auc_score = roc_auc_score(y_test, y_pred if y_score is None else y_score)

    # Start a new MLflow run
    with mlflow.start_run() as run:
        # Log model parameters
        mlflow.log_params(params)

        # Log other metrics
        mlflow.log_metric("eval_acc", eval_acc)
        mlflow.log_metric("auc_score", auc_score)

        # Log confusion matrix as an artifact
        plt.clf()  # start from an empty figure so heatmaps do not stack up
        conf_matrix = confusion_matrix(y_test, y_pred)
        ax = sns.heatmap(conf_matrix, annot=True, fmt='g')
        # Both axes are flipped, so the last class label comes first.
        ax.invert_xaxis()
        ax.invert_yaxis()
        plt.ylabel('Actual')
        plt.xlabel('Predicted')
        plt.title("Confusion Matrix")
        conf_matrix_path = "conf_matrix.png"
        plt.savefig(conf_matrix_path)
        mlflow.log_artifact(conf_matrix_path)

        # Log the model
        mlflow.sklearn.log_model(model, model_name)

        # Log version information
        mlflow.set_tag("version", version)


def _positive_class_scores(model, X):
    """Return one continuous score per row of ``X`` for a binary classifier, or None."""
    if hasattr(model, "predict_proba"):
        proba = np.asarray(model.predict_proba(X))
        # Column 1 holds the probability of the second (positive) class.
        if proba.ndim == 2 and proba.shape[1] == 2:
            return proba[:, 1]
        return None
    if hasattr(model, "decision_function"):
        scores = np.asarray(model.decision_function(X))
        if scores.ndim == 1:
            return scores
    return None

# Test and log each model
# Each entry: (estimator, name used for the logged model, version tag,
# parameters to log). The random forest is seeded so its run is reproducible,
# and the seed is logged alongside its other parameter.
models = [
    (RandomForestClassifier(n_estimators=5, random_state=42), "RandomForest", "v1",
     {"n_estimators": 5, "random_state": 42}),
    (SVC(C=1.2, kernel='rbf'), "SVM", "v1", {"C": 1.2, "kernel": "rbf"}),
    (DecisionTreeClassifier(random_state=1), "DecisionTree", "v1", {"random_state": 1}),
    (KNeighborsClassifier(n_neighbors=5), "KNeighbors", "v1", {"n_neighbors": 5}),
    (LogisticRegression(), "LogisticRegression", "v1", {})
]

# The loop variable is called `run_name` rather than `model_name` so that it
# does not overwrite the global `model_name` list filled by test_model and
# read by the accuracy bar chart. `model` is deliberately left bound to the
# last estimator, as before.
for model, run_name, version, params in models:
    evaluate_and_log(model, run_name, version, params)
    print(f"Model run for {run_name} version {version}")


In [None]:
import onnx
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
# Declare the converter input: float rows with one column per training feature
# and an unspecified number of rows.
# NOTE(review): `model` is whichever estimator that name is bound to when this
# cell runs; nothing here selects it by score, despite the 'best_model.onnx'
# file name - confirm the intended model.
n_features = X_train.shape[1]
initial_type = [('float_input', FloatTensorType([None, n_features]))]
onnx_model = convert_sklearn(model, initial_types=initial_type)
onnx.save_model(onnx_model, 'best_model.onnx')

In [None]:
import pickle

# Persist the fitted preprocessing object that goes with the exported model.
# The models evaluated above were trained on features produced by `std` (the
# StandardScaler); `scaler` is the MinMaxScaler fitted on a different split of
# the unscaled data, so saving it would not match the models' inputs.
with open('preprocessing_model.pkl', 'wb') as f:
    pickle.dump(std, f)


In [2]:
import sklearn
# Show the installed scikit-learn version.
sklearn.__version__

'1.3.2'