In [4]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

In [5]:
df = pd.read_csv('data/csv/user_data.csv') 
df.columns
removed_cols = ['id', 'hobbies', 'average_positive_probability_before', 
                'average_positive_probability_after', 'diff','score_change', 
                'count_id_after','count_id_before','urban_rural','race_ethnicity']
df = df.drop(columns=removed_cols)
df.shape

df = df.apply(pd.to_numeric, errors='coerce')  # Convert all columns to numeric
df = df.fillna(df.mean())  # Impute missing values with mean

# Define categorical columns
categorical_columns = ['gender', 'socio_economic_status', 'parental_involvement', 'Openness',
                       'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neuroticism']

# Convert categorical columns to one-hot encoding
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Define features (X) and target (y)
X = df.drop(columns=['mental_health_score_after'])
y = df['mental_health_score_after']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
svm_model = make_pipeline(StandardScaler(), SVR())
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

In [8]:
# Predict on the test data
y_pred = svm_model.predict(X_test)

# Calculate Mean Squared Error and R^2 score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'R-squared (R2): {r2:.2f}')

Mean Squared Error (MSE): 571.69
R-squared (R2): 0.05
