In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<url-encoded download URL>" entries; the
# download loop below splits it on ',' and ':' and URL-decodes the second part.
# NOTE(review): the URL carries X-Goog-Date / X-Goog-Expires query parameters,
# so it is presumably a time-limited signed link — confirm it is still valid.
DATA_SOURCE_MAPPING = 'mental-disorder-classification:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4352146%2F7476679%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240716%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240716T174855Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Daaf0374fe9d36909b732d33b3672f25736fd3a188ba82f37dcd54038e5dc0d1b5d78420ccf61aefcb04df0bb6c8c83dd0de373d327003236366935a021407dc61031bd2a6c05664b04b6d761f4d7d04bd1121fd950466a3d56332bde5019597ea29e91f1a406d4f19622e03135a5cd8c99dc5129d42fa29f43946c495a6dea23dc81a619b9589a445fe091f2a728c34304c7935c24e21b2c1c86fad46d787c3b93f3c89480134f60601a1628ae4ef2589cbd786c72bff15e6ebd5c219630eead626f12a8e61aa20c403907ac1d725c57df79cd419bea1b08776e77b3ea2190a833f493a885df7e65c3ec4e967ecf10de441911427d49bdf5d2536ead90cf2712'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source and unpack it under KAGGLE_INPUT_PATH.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<url-encoded URL>". Split on the first colon
    # only, so an entry stays parseable even if its URL part contains one.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional. Without it the download still works,
            # but the progress bar cannot be scaled, so only a byte count is shown.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            # Named `chunk` rather than `data` so it cannot be confused with the
            # DataFrame that later cells store under `data`.
            chunk = fileres.read(CHUNK_SIZE)
            while len(chunk) > 0:
                dl += len(chunk)
                tfile.write(chunk)
                if total_length > 0:
                    done = min(50, int(50 * dl / total_length))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                chunk = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by name.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bound to a name other than `tarfile`: rebinding the module name
                # would break `tarfile.open` for every later data source.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is caught before the broader OSError to give the more
        # specific "likely expired" hint.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading mental-disorder-classification, 2226 bytes compressed
Downloaded and uncompressed: mental-disorder-classification
Data source import complete.


In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sb
import plotly.subplots as sp
import plotly.graph_objs as go
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Read Dataset-Mental-Disorders.csv from the Kaggle input directory into
# `data` and preview its first two rows.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/mental-disorder-classification/Dataset-Mental-Disorders.csv'
)
data.head(n=2)

Unnamed: 0,Patient Number,Sadness,Euphoric,Exhausted,Sleep dissorder,Mood Swing,Suicidal thoughts,Anorxia,Authority Respect,Try-Explanation,Aggressive Response,Ignore & Move-On,Nervous Break-down,Admit Mistakes,Overthinking,Sexual Activity,Concentration,Optimisim,Expert Diagnose
0,Patiant-01,Usually,Seldom,Sometimes,Sometimes,YES,YES,NO,NO,YES,NO,NO,YES,YES,YES,3 From 10,3 From 10,4 From 10,Bipolar Type-2
1,Patiant-02,Usually,Seldom,Usually,Sometimes,NO,YES,NO,NO,NO,NO,NO,NO,NO,NO,4 From 10,2 From 10,5 From 10,Depression


In [None]:
# dropna() returns a new frame; assign it back so rows with missing values are
# actually removed from `data` instead of only from the displayed copy.
data = data.dropna()
data

Unnamed: 0,Patient Number,Sadness,Euphoric,Exhausted,Sleep dissorder,Mood Swing,Suicidal thoughts,Anorxia,Authority Respect,Try-Explanation,Aggressive Response,Ignore & Move-On,Nervous Break-down,Admit Mistakes,Overthinking,Sexual Activity,Concentration,Optimisim,Expert Diagnose
0,Patiant-01,Usually,Seldom,Sometimes,Sometimes,YES,YES,NO,NO,YES,NO,NO,YES,YES,YES,3 From 10,3 From 10,4 From 10,Bipolar Type-2
1,Patiant-02,Usually,Seldom,Usually,Sometimes,NO,YES,NO,NO,NO,NO,NO,NO,NO,NO,4 From 10,2 From 10,5 From 10,Depression
2,Patiant-03,Sometimes,Most-Often,Sometimes,Sometimes,YES,NO,NO,NO,YES,YES,NO,YES,YES,NO,6 From 10,5 From 10,7 From 10,Bipolar Type-1
3,Patiant-04,Usually,Seldom,Usually,Most-Often,YES,YES,YES,NO,YES,NO,NO,NO,NO,NO,3 From 10,2 From 10,2 From 10,Bipolar Type-2
4,Patiant-05,Usually,Usually,Sometimes,Sometimes,NO,NO,NO,NO,NO,NO,NO,YES,YES,YES,5 From 10,5 From 10,6 From 10,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,Patiant-116,Most-Often,Seldom,Usually,Sometimes,NO,YES,NO,NO,YES,NO,YES,NO,NO,YES,2 From 10,5 From 10,3 From 10,Depression
116,Patiant-117,Sometimes,Sometimes,Sometimes,Seldom,YES,NO,NO,NO,NO,YES,NO,NO,NO,YES,6 From 10,7 From 10,8 From 10,Bipolar Type-1
117,Patiant-118,Usually,Sometimes,Usually,Sometimes,YES,NO,YES,YES,NO,NO,NO,YES,NO,YES,1 From 10,5 From 10,3 From 10,Bipolar Type-2
118,Patiant-119,Usually,Sometimes,Seldom,Seldom,NO,YES,YES,NO,YES,YES,YES,NO,YES,YES,7 From 10,7 From 10,7 From 10,Depression


In [None]:
# Collapse the trailing-space variant 'YES ' into 'YES'.
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: that chained in-place form is deprecated in pandas and does
# not modify `data` when Copy-on-Write is enabled.
data['Mood Swing'] = data['Mood Swing'].replace('YES ', 'YES')

In [None]:
# Tally how many rows carry each distinct 'Mood Swing' value.
data.loc[:, 'Mood Swing'].value_counts()

Mood Swing
NO     63
YES    57
Name: count, dtype: int64

In [None]:
# Rename 'Ignore & Move-On' to the name the YES/NO encoding cell expects
# (the double underscore is kept because that cell references it verbatim).
# The stray bare `data` expression that preceded this statement did nothing
# (only a cell's last expression is displayed), so it has been removed.
data = data.rename(columns={'Ignore & Move-On': 'ignore_and__move_on'})

In [None]:
# Columns holding YES/NO answers that are encoded as 1/0.
Yes_No_column = ['Mood Swing', 'Suicidal thoughts', 'Anorxia', 'Authority Respect', 'Try-Explanation', 'Aggressive Response', 'ignore_and__move_on', 'Nervous Break-down', 'Admit Mistakes', 'Overthinking']

# '1'/'0' are accepted too, so re-running this cell on already encoded columns
# is a no-op instead of turning every value into NaN.
yes_no_codes = {'YES': 1, 'NO': 0, '1': 1, '0': 0}

for column in Yes_No_column:
    # Normalise surrounding whitespace and letter case before mapping, which
    # covers variants such as 'YES ', ' NO ' and 'NO ' in one step.
    answers = data[column].astype(str).str.strip().str.upper()
    encoded = answers.map(yes_no_codes)
    # Fail loudly on anything unmapped rather than silently producing NaN.
    if encoded.isna().any():
        unexpected = sorted(answers[encoded.isna()].unique())
        raise ValueError(f"Unexpected values in column {column!r}: {unexpected}")
    data[column] = encoded.astype(int)


In [None]:
# Remove the 'Patient Number' column so it is not used as a feature.
# errors='ignore' plus reassignment keeps the cell re-runnable: a second run
# finds the column already gone and leaves the frame unchanged instead of
# raising KeyError.
data = data.drop(columns=['Patient Number'], errors='ignore')

In [None]:
# Keep the first run of digits from each score and store it as an integer
# (e.g. '3 From 10' -> 3).
# 'Overthinking' is no longer listed here: it is a YES/NO column that the
# encoding cell above has already converted to 1/0.
for column in ['Sexual Activity', 'Concentration', 'Optimisim']:
    # expand=False makes str.extract return a Series rather than a one-column
    # DataFrame, which is the right shape for a column assignment.
    data[column] = data[column].astype(str).str.extract(r'(\d+)', expand=False).astype(int)

In [None]:
# Predictors are every column except 'Expert Diagnose'; that column is the target.
X = data.drop('Expert Diagnose', axis=1)
y = data.loc[:, 'Expert Diagnose']

In [None]:
# Replace the four listed columns with one indicator column per distinct value.
X = pd.get_dummies(
    data=X,
    columns=[
        'Sadness',
        'Euphoric',
        'Exhausted',
        'Sleep dissorder',
    ],
)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Learn the set of target labels, then swap each label for its integer code.
# The fitted encoder is kept in `label_encoder`.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y = label_encoder.transform(y)

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Seed shared by the split, the feature scoring and the forest so that a
# Restart & Run All reproduces the same selected features and scores.
RANDOM_STATE = 2

# Split BEFORE scoring features. Fitting the selector on all rows would let the
# held-out rows influence which features are kept (data leakage) and make the
# test report optimistic. The split depends only on the row count and the seed,
# so the same rows are held out as when splitting the already reduced frame.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE, shuffle=True
)

# mutual_info_classif is stochastic; pin its seed through a small wrapper.
feature_selection = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(
        features, target, random_state=RANDOM_STATE
    ),
    k=10,
)
feature_selection = feature_selection.fit(X_train_full, y_train)

# Rank the columns by their mutual-information score and keep the ten best.
features_scores = pd.Series(feature_selection.scores_, index=X.columns)
top_features = features_scores.nlargest(10).index

x_train = X_train_full[top_features]
x_test = X_test_full[top_features]

# Hyper-parameter grid explored by the 3-fold cross-validated search.
param_grid = {
    'bootstrap': [True],
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 13, 22],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [4, 7, 9],
    'max_features': ['sqrt', 'log2']
}

model = RandomForestClassifier(random_state=RANDOM_STATE)

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

grid_search.fit(x_train, y_train)

# GridSearchCV refits the best candidate on the whole training set, so
# predict() uses that refitted model.
y_predict = grid_search.predict(x_test)

print(classification_report(y_test, y_predict))


Fitting 3 folds for each of 162 candidates, totalling 486 fits
              precision    recall  f1-score   support

           0       1.00      0.88      0.93         8
           1       0.91      1.00      0.95        10
           2       0.78      0.88      0.82         8
           3       0.89      0.80      0.84        10

    accuracy                           0.89        36
   macro avg       0.89      0.89      0.89        36
weighted avg       0.89      0.89      0.89        36



In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline

# Seed shared by the split, the feature scoring and the forest so that a
# Restart & Run All reproduces the same selected features and scores.
RANDOM_STATE = 2

# Split BEFORE scoring features so the held-out rows cannot influence which
# features are kept (data leakage). The split depends only on the row count and
# the seed, so the same rows are held out as before.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE, shuffle=True
)

# mutual_info_classif is stochastic; pin its seed through a small wrapper.
feature_selection = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(
        features, target, random_state=RANDOM_STATE
    ),
    k=10,
)
feature_selection = feature_selection.fit(X_train_full, y_train)

# Rank the columns by their mutual-information score and keep the ten best.
features_scores = pd.Series(feature_selection.scores_, index=X.columns)
top_features = features_scores.nlargest(10).index

x_train = X_train_full[top_features]
x_test = X_test_full[top_features]

# Single fixed configuration. 'sqrt' replaces 'auto', which was deprecated and
# then removed in scikit-learn 1.3; for classifiers 'auto' meant 'sqrt', so the
# model is unchanged.
param_grid = {
    'criterion': ['gini'], 'max_depth': [4], 'max_features': ['sqrt'], 'n_estimators': [200]
}

# Initialize the model
rf = RandomForestClassifier(random_state=RANDOM_STATE)

# Search over the estimator created in THIS cell. The previous code passed
# `model`, a leftover from an earlier cell, and left `rf` unused.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit the model
grid_search.fit(x_train, y_train)

# Print the best parameters
print("Best parameters found: ", grid_search.best_params_)

# Evaluate the model with the best parameters
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(x_test)
print(classification_report(y_test, y_pred))


Fitting 3 folds for each of 1 candidates, totalling 3 fits


  warn(


Best parameters found:  {'criterion': 'gini', 'max_depth': 4, 'max_features': 'auto', 'n_estimators': 200}
              precision    recall  f1-score   support

           0       1.00      0.88      0.93         8
           1       0.91      1.00      0.95        10
           2       0.67      1.00      0.80         8
           3       1.00      0.60      0.75        10

    accuracy                           0.86        36
   macro avg       0.89      0.87      0.86        36
weighted avg       0.90      0.86      0.86        36



In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline

# Seed shared by the split, the feature scoring and the classifier so that a
# Restart & Run All reproduces the same selected features and scores.
RANDOM_STATE = 2

# Split BEFORE scoring features so the held-out rows cannot influence which
# features are kept (data leakage). The split depends only on the row count and
# the seed, so the same rows are held out as before.
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE, shuffle=True
)

# mutual_info_classif is stochastic; pin its seed through a small wrapper.
feature_selection = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(
        features, target, random_state=RANDOM_STATE
    ),
    k=10,
)
feature_selection = feature_selection.fit(X_train_full, y_train)

# Rank the columns by their mutual-information score and keep the ten best.
features_scores = pd.Series(feature_selection.scores_, index=X.columns)
top_features = features_scores.nlargest(10).index

x_train = X_train_full[top_features]
x_test = X_test_full[top_features]

# Standardise the features, then fit a logistic regression. The scaler sits
# inside the pipeline so it is refitted on the training part of each CV fold.
# (The random-forest parameter grid that used to be defined here was dead code:
# it was overwritten below before ever being used.)
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(random_state=RANDOM_STATE))
])

# Define the parameter grid
param_grid = {
    'logreg__C': [ 10],
    'logreg__penalty': ['l1'],
    'logreg__solver': [ 'liblinear'],
    'logreg__max_iter': [100]
}

# Grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
grid_search.fit(x_train, y_train)

# Print the best parameters
print("Best parameters found: ", grid_search.best_params_)

# Evaluate the model with the best parameters.
# NOTE: `best_rf` holds the fitted logistic-regression pipeline here; the name
# is kept unchanged in case later cells refer to it.
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(x_test)
print(classification_report(y_test, y_pred))


Best parameters found:  {'logreg__C': 10, 'logreg__max_iter': 100, 'logreg__penalty': 'l1', 'logreg__solver': 'liblinear'}
              precision    recall  f1-score   support

           0       1.00      0.75      0.86         8
           1       0.83      1.00      0.91        10
           2       1.00      1.00      1.00         8
           3       1.00      1.00      1.00        10

    accuracy                           0.94        36
   macro avg       0.96      0.94      0.94        36
weighted avg       0.95      0.94      0.94        36

