### Imports

In [56]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import xgboost as xgb
import regex as re
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import pickle
import os
import joblib
import time
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

### Encoding

In [57]:
# Encoding functions
def one_hot_encode(data, columns):
    """Expand the given columns into indicator (dummy) columns.

    Args:
        data: Source dataframe; it is passed to ``pd.get_dummies`` as-is.
        columns: Names of the columns to expand.

    Returns:
        The dataframe produced by ``pd.get_dummies``. The first level of
        every encoded column is dropped (``drop_first=True``).
    """
    encoded = pd.get_dummies(data=data, columns=columns, drop_first=True)
    return encoded

def label_encode(data, columns):
    """Overwrite each listed column with its label-encoded values.

    A new ``LabelEncoder`` is fitted per column, so the codes of different
    columns are independent of each other.

    Args:
        data: Dataframe to encode; the listed columns are replaced in place.
        columns: Names of the columns to encode.

    Returns:
        The same ``data`` object that was passed in.
    """
    for name in columns:
        data[name] = LabelEncoder().fit_transform(data[name])
    return data

def frequency_encode(data, columns):
    """Frequency encodes specified columns in the dataframe.

    Every value is replaced by its relative frequency: the number of rows
    holding that value divided by the total number of rows in ``data``.
    Missing values are counted as a category of their own, so they receive
    the share of missing rows regardless of the column's dtype.

    Args:
        data: Dataframe to encode; the listed columns are replaced in place.
        columns: Names of the columns to encode.

    Returns:
        The same ``data`` object that was passed in.
    """
    n_rows = len(data)
    for col in columns:
        # dropna=False keeps NaN in the lookup table; counting through pandas
        # (rather than hashing Python objects) makes NaN handling independent
        # of object identity and avoids a per-row Python callback.
        frequencies = data[col].value_counts(dropna=False) / n_rows
        data[col] = data[col].map(frequencies)
    return data

def target_encode(data, target_column, columns):
    """Replace each category with the mean target value observed for it.

    The means are computed from ``data`` itself, i.e. from the same rows
    that are being encoded.

    Args:
        data: Dataframe to encode; the listed columns are replaced in place.
        target_column: Name of the column whose mean is taken per category.
        columns: Names of the categorical columns to encode.

    Returns:
        The same ``data`` object that was passed in.
    """
    for name in columns:
        # Per-category mean of the target, indexed by category value.
        category_means = data[target_column].groupby(data[name]).mean()
        data[name] = data[name].map(category_means)
    return data

### Imputation

In [58]:
# Imputation functions
def impute_mean(data, column):
    """Fill missing values in ``column`` with the mean of that column.

    Args:
        data: Dataframe to impute; ``column`` is replaced in place.
        column: Name of the numeric column to fill.

    Returns:
        The same ``data`` object that was passed in.
    """
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the `data[column]` selection: that chained form operates on an
    # intermediate object and does not update `data` under pandas
    # Copy-on-Write.
    data[column] = data[column].fillna(data[column].mean())
    return data

def impute_median(data, columns):
    """Fill missing values in ``columns`` using a median ``SimpleImputer``.

    Args:
        data: Dataframe to impute; the listed columns are replaced in place.
        columns: Names of the columns handed to the imputer.

    Returns:
        The same ``data`` object that was passed in.
    """
    selected = data[columns]
    filled = SimpleImputer(strategy="median").fit_transform(selected)
    data[columns] = filled
    return data

def impute_mode(data, column):
    """Fill missing values in ``column`` with the column's most frequent value.

    When several values tie for most frequent, the first one returned by
    ``Series.mode`` is used. A column with no non-missing values has no mode
    and is left unchanged.

    Args:
        data: Dataframe to impute; ``column`` is replaced in place.
        column: Name of the column to fill.

    Returns:
        The same ``data`` object that was passed in.
    """
    modes = data[column].mode()
    if modes.empty:
        # Every value is missing, so there is nothing to fill with.
        return data
    # Assign back rather than fillna(inplace=True) on the `data[column]`
    # selection, which does not update `data` under pandas Copy-on-Write.
    data[column] = data[column].fillna(modes.iloc[0])
    return data

def impute_knn(data, columns, n_neighbors=5):
    """Fill missing values in ``columns`` using a ``KNNImputer``.

    Args:
        data: Dataframe to impute; the listed columns are replaced in place.
        columns: Names of the columns handed to the imputer.
        n_neighbors: Neighbour count forwarded to ``KNNImputer``.

    Returns:
        The same ``data`` object that was passed in.
    """
    selected = data[columns]
    filled = KNNImputer(n_neighbors=n_neighbors).fit_transform(selected)
    data[columns] = filled
    return data

def impute_random_forest(data, features, target_column, random_state=None):
    """Fill missing values of ``target_column`` with random-forest predictions.

    A ``RandomForestRegressor`` is trained on the rows where ``target_column``
    is present, using ``features`` as predictors, and then predicts the rows
    where it is missing.

    Args:
        data: Dataframe to impute; ``target_column`` is updated in place.
        features: Names of the predictor columns.
        target_column: Name of the column whose missing values are filled.
        random_state: Seed forwarded to ``RandomForestRegressor`` so repeated
            runs give the same imputed values. ``None`` keeps the estimator's
            default (unseeded) behaviour.

    Returns:
        The same ``data`` object that was passed in.
    """
    missing_idx = data[target_column].isnull()
    if not missing_idx.any():
        # Nothing to impute. Returning here also avoids calling predict() on
        # an empty selection, which scikit-learn's input validation rejects.
        return data

    known_data = data.loc[~missing_idx]
    imputer = RandomForestRegressor(random_state=random_state)
    imputer.fit(known_data[features], known_data[target_column])
    data.loc[missing_idx, target_column] = imputer.predict(data.loc[missing_idx, features])
    return data

def impute_regression(data, columns):
    """Fill missing values in ``columns`` with linear-regression predictions.

    For each listed column a ``LinearRegression`` is fitted with every other
    column of ``data`` as a predictor, so those columns are expected to be
    numeric. The model is trained only on rows where the column is present
    and used to predict the rows where it is missing. Rows whose predictors
    themselves contain missing values are excluded from both training and
    prediction, so such rows keep their missing value.

    Args:
        data: Dataframe to impute; the listed columns are updated in place.
        columns: Names of the columns to impute, processed in order.

    Returns:
        The same ``data`` object that was passed in.

    Raises:
        ValueError: If a column has missing values that could be predicted
            but no row is available to train the model on.
    """
    for col in columns:
        missing = data[col].isnull()
        if not missing.any():
            # Nothing to impute for this column.
            continue

        features = [c for c in data.columns if c != col]
        # The regression cannot be fitted or evaluated on rows with missing
        # predictors, so restrict both steps to fully populated rows.
        usable = data[features].notnull().all(axis=1)
        predict_rows = missing & usable
        if not predict_rows.any():
            continue

        train_rows = ~missing & usable
        if not train_rows.any():
            raise ValueError(
                f"Cannot impute '{col}': no rows with a known value and complete features."
            )

        model = LinearRegression()
        model.fit(data.loc[train_rows, features], data.loc[train_rows, col])
        data.loc[predict_rows, col] = model.predict(data.loc[predict_rows, features])
    return data

### Evaluation

In [59]:
# Regression evaluation metrics
def evaluate_regression(y_true, y_pred):
    """Compute, print and return MSE, MAE and R^2 for a set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        A dict with the keys ``'mse'``, ``'mae'`` and ``'r2'``.
    """
    scores = {
        'mse': mean_squared_error(y_true, y_pred),
        'mae': mean_absolute_error(y_true, y_pred),
        'r2': r2_score(y_true, y_pred),
    }
    print(f"Mean Squared Error: {scores['mse']}")
    print(f"Mean Absolute Error: {scores['mae']}")
    print(f"R^2 Score: {scores['r2']}")
    return scores

# Classification evaluation metrics
def evaluate_classification(y_true, y_pred):
    """Compute, print and return standard classification metrics.

    Precision, recall and F1 are computed with ``average='weighted'``.

    Args:
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels.

    Returns:
        A dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'``,
        ``'f1'`` and ``'confusion_matrix'``.
    """
    scores = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average='weighted'),
        'recall': recall_score(y_true, y_pred, average='weighted'),
        'f1': f1_score(y_true, y_pred, average='weighted'),
        'confusion_matrix': confusion_matrix(y_true, y_pred),
    }
    print(f"Accuracy: {scores['accuracy']}")
    print(f"Precision: {scores['precision']}")
    print(f"Recall: {scores['recall']}")
    print(f"F1 Score: {scores['f1']}")
    print(f"Confusion Matrix:\n{scores['confusion_matrix']}")
    return scores

### Get Info

In [60]:
# Sample inputs from the user: the dataset to analyse, the column to predict
# and the kind of problem. The rest of this cell reads these three names.
df = pd.read_csv('mentalhealth.csv')  # path is relative to the working directory
target_column = 'Depression'  # name of the column holding the prediction target
problem_type = 'Classification'.title()  # .title() normalises capitalisation, e.g. 'classification' -> 'Classification'

# Beginning of automation
def dataset_summary(data):
    """Build a per-column overview of a dataframe.

    Args:
        data: Dataframe to summarise.

    Returns:
        A dataframe with one row per column of ``data`` and the columns
        ``'Column'``, ``'Type'``, ``'Null Count'``, ``'Null Percentage (%)'``
        and ``'Unique Values'``.
    """
    n_rows, _ = data.shape

    def describe(name):
        # Collect the statistics for a single column.
        series = data[name]
        n_missing = int(series.isna().sum())
        if n_missing > 0:
            pct_missing = round((n_missing / n_rows) * 100, 2)
        else:
            pct_missing = 0
        return {
            'Column': name,
            'Type': series.dtype,
            'Null Count': n_missing,
            'Null Percentage (%)': pct_missing,
            'Unique Values': series.nunique(),
        }

    return pd.DataFrame([describe(name) for name in data.columns])

def target_distribution(data, target_column):
    """Tabulate how often each value of the target column occurs.

    Args:
        data: Dataframe containing the target column.
        target_column: Name of the column to tabulate.

    Returns:
        A dataframe with the columns ``'Class'``, ``'Count'`` and
        ``'Percentage (%)'``, in the order produced by ``value_counts``.
        Percentages are relative to the total number of rows in ``data``.
    """
    counts = data[target_column].value_counts()
    shares = (counts / len(data)) * 100

    table = {
        'Class': counts.index,
        'Count': counts.values,
        'Percentage (%)': shares.values,
    }
    return pd.DataFrame(table)

# Print a plain-text overview of the loaded dataset: shape, first rows, target
# balance, descriptive statistics, per-column summary and correlations.
print(f'Dataset size: {df.shape}, Target column: {target_column}, Problem type: {problem_type}\n')

print('Head:')
print(df.head().to_string())

print('\nTarget distribution:')
print(target_distribution(df, target_column).to_string())

print('\nDescription:')
print(df.describe().to_string())

print('\nSummary:')
print(dataset_summary(df).to_string())

print('\nCorrelation:')
# Correlate only the numeric/boolean columns. The dataset also contains text
# columns (Name, Gender, City, ...), and recent pandas versions raise on
# non-numeric data in corr() instead of silently dropping those columns.
print(df.select_dtypes(include=['number', 'bool']).corr().to_string())


Dataset size: (140700, 20), Target column: Depression, Problem type: Classification

Head:
   id      Name  Gender   Age           City Working Professional or Student        Profession  Academic Pressure  Work Pressure  CGPA  Study Satisfaction  Job Satisfaction     Sleep Duration Dietary Habits   Degree Have you ever had suicidal thoughts ?  Work/Study Hours  Financial Stress Family History of Mental Illness  Depression
0   0  Aaradhya  Female  49.0       Ludhiana            Working Professional              Chef                NaN            5.0   NaN                 NaN               2.0  More than 8 hours        Healthy      BHM                                    No               1.0               2.0                               No           0
1   1     Vivan    Male  26.0       Varanasi            Working Professional           Teacher                NaN            4.0   NaN                 NaN               3.0  Less than 5 hours      Unhealthy      LLB                        