In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and echo the full path of every file found.
input_file_paths = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the dataset
file_path = "/kaggle/input/mental-health-dataset/Mental Health Dataset.csv"
data = pd.read_csv(file_path)

# Drop rows with missing values
data = data.dropna()

# Drop the Timestamp column before modelling. Label-encoding it would turn each
# distinct timestamp string into an arbitrary integer feature; the text-feature
# pipeline later in this notebook drops the same column for the same file.
# errors="ignore" keeps this cell working if the column is absent.
data = data.drop(columns=["Timestamp"], errors="ignore")

# Encode every string column as integer codes, keeping one fitted encoder per
# column so the codes can be mapped back with inverse_transform later.
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Separate features and target variable
X = data.drop(columns=["treatment"])
y = data["treatment"]

# Split FIRST so the held-out rows never influence feature selection or
# scaling (fitting those steps on all rows leaks test information).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature selection: score features on the training rows only.
# k is capped at the number of available columns so SelectKBest cannot raise
# when the frame has fewer than 10 features.
selector = SelectKBest(score_func=f_classif, k=min(10, X.shape[1]))  # Select top 10 features
X_train = selector.fit_transform(X_train_raw, y_train)
X_test = selector.transform(X_test_raw)
selected_features = X.columns[selector.get_support()]
print(f"Selected Features: {selected_features.tolist()}")

# Standardize with the training mean/variance, then apply the same transform
# to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# All rows in the selected/scaled feature space, kept under the original name
# for any cell that still refers to X_selected.
X_selected = scaler.transform(selector.transform(X))

# Random Forest: exhaustive 5-fold search over a small hyperparameter grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}
rf_base = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=rf_base,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)  # fit() returns the fitted search object itself
best_rf_model = grid_search.best_estimator_
print(f"Best Random Forest Parameters: {grid_search.best_params_}")

# Gradient Boosting: trained once with fixed settings (no search is run for it here)
gb_model = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
).fit(X_train, y_train)

# Model Evaluation
def evaluate_model(model, name, X_eval=None, y_eval=None):
    """Print accuracy and a classification report, then plot the confusion matrix.

    Args:
        model: Fitted estimator exposing ``predict``.
        name: Label used in the printed output and in the plot title.
        X_eval: Feature matrix to score on. Defaults to the notebook-level
            ``X_test`` so existing two-argument calls keep working.
        y_eval: True labels matching ``X_eval``. Defaults to the
            notebook-level ``y_test``.

    Returns:
        The accuracy on the evaluation data.
    """
    # Fall back to the globals only when the caller passed no data. Passing the
    # data explicitly protects against another cell rebinding X_test / y_test.
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test

    y_pred = model.predict(X_eval)
    accuracy = accuracy_score(y_eval, y_pred)
    print(f"\n{name} Accuracy: {accuracy:.4f}")
    print(classification_report(y_eval, y_pred))

    conf_matrix = confusion_matrix(y_eval, y_pred)
    # Draw on a dedicated figure so repeated calls never paint over each other.
    fig, ax = plt.subplots()
    sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", ax=ax)
    ax.set_title(f"Confusion Matrix - {name}")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    plt.show()

    return accuracy

# Report metrics and a confusion matrix for each fitted classifier
for fitted_model, label in (
    (best_rf_model, "Random Forest"),
    (gb_model, "Gradient Boosting"),
):
    evaluate_model(fitted_model, label)

# Test-set predictions from both models
rf_predictions = best_rf_model.predict(X_test)
gb_predictions = gb_model.predict(X_test)

# Put the true labels next to both sets of predictions and write them to CSV
prediction_columns = {
    "Actual": y_test.values,
    "Random_Forest_Predictions": rf_predictions,
    "Gradient_Boosting_Predictions": gb_predictions,
}
predictions_df = pd.DataFrame(prediction_columns)

predictions_df.to_csv("predictions.csv", index=False)
print("Predictions saved to predictions.csv successfully!")



In [None]:
# Reload the saved file to confirm it was written. Ending the cell with the
# bare frame uses the notebook's rich table display instead of plain-text print.
df = pd.read_csv("/kaggle/working/predictions.csv")
df

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Resources needed by word_tokenize() and stopwords.words() below.
# Recent NLTK releases load the tokenizer from 'punkt_tab' rather than 'punkt',
# so request both; 'punkt' keeps older releases working.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Read the survey CSV and discard the Timestamp column in a single chain;
# Timestamp is not used as a model input in this pipeline.
data = (
    pd.read_csv("/kaggle/input/mental-health-dataset/Mental Health Dataset.csv")
    .drop(columns=['Timestamp'])
)

# Text cleaning function
def clean_text(text):
    """Lower-case ``text`` and strip everything except ASCII letters, digits and whitespace.

    Any input is first converted with ``str()``, so non-string values are
    cleaned via their string form (e.g. ``None`` becomes ``"none"``).
    """
    lowered = str(text).lower()
    return re.sub(r'[^a-zA-Z0-9\s]', '', lowered)  # Remove special characters

# Feature extraction function

# Filled on first use so the stop-word corpus is read once, not once per row.
_STOP_WORDS_CACHE = None
# Terms counted by the 'negative_words' feature.
_NEGATIVE_WORDS = frozenset({'bad', 'sad', 'stress', 'anxiety'})


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them on first call."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def extract_features(text):
    """Compute simple count features for one piece of (already cleaned) text.

    Args:
        text: The string to analyse.

    Returns:
        A dict with four integer entries:
        ``token_count`` (number of tokens from ``word_tokenize``),
        ``sentence_length`` (number of whitespace-separated words),
        ``stopword_count`` (tokens that are English stop words) and
        ``negative_words`` (tokens found in the fixed negative-term list).
    """
    words = word_tokenize(text)
    stop_words = _english_stop_words()

    return {
        'token_count': len(words),
        'sentence_length': len(text.split()),
        # Same value as len(words) minus the number of non-stop-word tokens.
        'stopword_count': sum(1 for w in words if w in stop_words),
        'negative_words': sum(1 for w in words if w in _NEGATIVE_WORDS),
    }

# Apply text preprocessing
data['Cleaned_Text'] = data['care_options'].apply(clean_text)

# Build the hand-crafted feature table in one step from the per-row dicts.
# Series.apply(pd.Series) constructs a Series object for every row, which is
# very slow on large frames; the DataFrame constructor yields the same columns.
features = pd.DataFrame(
    data['Cleaned_Text'].apply(extract_features).tolist(),
    index=data.index,
)

# TF-IDF feature extraction
tfidf = TfidfVectorizer(max_features=500)
tfidf_features = tfidf.fit_transform(data['Cleaned_Text']).toarray()
# Carry data's index so the later axis=1 concat lines up row-for-row even if
# data's index is not the default 0..n-1 range.
tfidf_df = pd.DataFrame(
    tfidf_features,
    columns=tfidf.get_feature_names_out(),
    index=data.index,
)

# Handling the problematic 'Days_Indoors' column
# Convert to a categorical feature using one-hot encoding
data = pd.get_dummies(data, columns=['Days_Indoors'], prefix='days')

# Columns to label-encode (turned into integer codes)
categorical_cols = ['Gender', 'Country', 'Occupation', 'self_employed',
                    'family_history', 'treatment', 'mental_health_interview',
                    'Growing_Stress', 'Changes_Habits', 'Mental_Health_History',
                    'Coping_Struggles', 'Work_Interest', 'Social_Weakness']

# Label-encode each listed column with its OWN encoder and keep the fitted
# encoders. Refitting one shared encoder would discard every mapping except
# the last, so the codes could not be translated back afterwards.
categorical_encoders = {}
for col in categorical_cols:
    if col not in data.columns:  # Skip names that are not in this frame
        continue
    col_dtype = data[col].dtype
    if col_dtype == 'object' or col_dtype == 'bool':  # Only encode object or boolean types
        col_encoder = LabelEncoder()
        # astype(str) gives the encoder uniformly typed string input.
        data[col] = col_encoder.fit_transform(data[col].astype(str))
        categorical_encoders[col] = col_encoder

# Encode the target (Mood_Swings) if not already numeric. label_enc stays bound
# to the target's encoder, so label_enc.inverse_transform(...) recovers the
# original class names from predicted codes.
label_enc = LabelEncoder()
if data['Mood_Swings'].dtype == 'object':
    data['Mood_Swings'] = label_enc.fit_transform(data['Mood_Swings'])
    categorical_encoders['Mood_Swings'] = label_enc

# Note: Country and Occupation are label-encoded above (integer codes);
# Days_Indoors is the only column that is one-hot encoded.

# Drop processed text column and concat with feature dataframes
data = pd.concat([data.drop(columns=['Cleaned_Text', 'care_options']), features, tfidf_df], axis=1)

# Print dtypes to verify all columns are numeric before modeling
print(data.dtypes)

# Define target and features
X = data.drop(columns=['Mood_Swings'])  # Target is Mood_Swings
y = data['Mood_Swings']

# Train/Test split, stratified so class proportions match in both parts
# (the same choice the earlier treatment model makes).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Carve a validation set out of the TRAINING data for choosing between model
# families. Picking the winner on the test set would make the final test
# report optimistically biased.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Model selection (tree-based models are seeded so reruns give the same result)
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42)
}

best_model = None
best_score = float('-inf')  # guarantees the first model is accepted, even at 0 accuracy
for name, model in models.items():
    model.fit(X_fit, y_fit)
    preds = model.predict(X_val)
    score = accuracy_score(y_val, preds)
    print(f"{name} Validation Accuracy: {score:.4f}")
    if score > best_score:
        best_score = score
        best_model = model

# Fine-tuning using Grid Search
# Candidate grids per model family, checked in order with isinstance. The last
# entry (object) matches anything else, so it acts as the fallback grid used
# for the decision tree.
grid_by_model_type = (
    (RandomForestClassifier, {'n_estimators': [50, 100, 200], 'max_depth': [10, 20, None]}),
    (LogisticRegression, {'C': [0.1, 1, 10]}),
    (object, {'max_depth': [10, 20, None]}),
)
param_grid = next(
    grid for model_type, grid in grid_by_model_type
    if isinstance(best_model, model_type)
)

# GridSearchCV clones best_model, so its earlier fit does not carry over.
# n_jobs=-1 runs the candidates in parallel, as the earlier grid search does.
tuned_model = GridSearchCV(best_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
tuned_model.fit(X_train, y_train)
print(f"Best Parameters: {tuned_model.best_params_}")
y_pred = tuned_model.predict(X_test)

# Display results
print("Best Model Report:")
print(classification_report(y_test, y_pred))

# Save predictions (Mood_Swings holds the model's predicted values as-is;
# ID is the row label of each test sample in X_test)
submission = pd.DataFrame({'ID': X_test.index, 'Mood_Swings': y_pred})
submission.to_csv('predictionss.csv', index=False)
print("Predictions saved to 'predictionss.csv'.")

In [None]:
# Reload the saved file to confirm it was written. Ending the cell with the
# bare frame uses the notebook's rich table display instead of plain-text print.
df = pd.read_csv("/kaggle/working/predictionss.csv")
df