In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

In [None]:
# Input CSV locations: three training tables plus the held-out test table.
# test_data_path is not read here; it is consumed when predictions are made.
data1_path, data2_path, data3_path = (
    '/content/data1.csv',
    '/content/data2.csv',
    '/content/data3 (1).csv',
)
test_data_path = '/content/modifiedagaintest - test.csv.csv'

# Read the training tables. The 'name' column of data2 is discarded as soon
# as that file is loaded.
data1 = pd.read_csv(data1_path)
data2 = pd.read_csv(data2_path).drop('name', axis=1)
data3 = pd.read_csv(data3_path)

In [None]:
# Combine the three tables into a single frame, matching rows on 'id'.
# No `how` is given, so pandas' default inner join applies: only ids present
# in every table are kept.
train_data = pd.merge(data1, data2, on='id').merge(data3, on='id')

# Columns removed from the training frame. The same list is reused when the
# test features are built.
drop_columns = ['username', 'mail', 'address', 'birthdate', 'sex']
train_data = train_data.drop(drop_columns, axis=1)

# Persist the merged, trimmed training table (row index not written).
train_data.to_csv('modified_train_data.csv', index=False)

In [None]:
# Separate the predictors from the target. 'name' and 'id' are dropped along
# with the target column so they are not used as features.
X_train = train_data.drop(columns=['Well Being', 'name', 'id'])
y_train = train_data['Well Being']

# Split BEFORE imputing. Fitting the imputer on all rows and splitting
# afterwards would let validation rows influence the fill values (data
# leakage) and make the validation score optimistic. random_state is fixed,
# so the row partition is reproducible.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Learn the most frequent value of every column from the training split only,
# then apply those same fill values to both splits. `imputer` stays fitted so
# it can be reused on the test set.
imputer = SimpleImputer(strategy='most_frequent')
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)

In [None]:
# Encode the target as integers. The encoder is fitted on the training labels
# and reused for the validation labels; `le` is kept so predictions can be
# mapped back to the original labels later.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_val = le.transform(y_val)

# `use_label_encoder` is intentionally not passed: the installed XGBoost no
# longer recognises it and only emits
# 'Parameters: { "use_label_encoder" } are not used.' when it is supplied.
xgb_clf = XGBClassifier(eval_metric='mlogloss')

# Candidate hyper-parameter values sampled by the randomized search.
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0],
    'gamma': [0, 1, 5],
}

# Try 10 random combinations, each scored by accuracy with 3-fold
# cross-validation on the training split. n_jobs=-1 uses all available cores;
# random_state makes the sampled combinations reproducible.
random_search = RandomizedSearchCV(
    xgb_clf,
    param_distributions=param_grid,
    n_iter=10,
    scoring='accuracy',
    cv=3,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

In [None]:
# Take the winning configuration found by the randomized search.
best_xgb_clf = random_search.best_estimator_

# Score it on the held-out validation split, which the search never saw.
y_pred = best_xgb_clf.predict(X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f'Validation Accuracy: {accuracy}')

In [None]:
# Load the test set and remove the same non-feature columns that were removed
# from the training data.
test_data = pd.read_csv(test_data_path)
test_data_features = test_data.drop(columns=drop_columns + ['id', 'name'])

# Present the test columns in exactly the layout the model was trained on.
# Selecting by name raises a KeyError if the test file lacks a training
# feature, instead of silently feeding misaligned columns to the model.
train_feature_columns = [
    col for col in train_data.columns if col not in ('Well Being', 'name', 'id')
]
test_data_features = test_data_features[train_feature_columns]

# Reuse the imputer fitted on the training data. Fitting a separate imputer
# (with a different strategy) on the test set would fill missing values
# differently from what the model saw during training.
test_data_features = imputer.transform(test_data_features)

# Integer-encoded class predictions; decode them with `le` before use.
predictions = best_xgb_clf.predict(test_data_features)

In [None]:
# Map the integer class predictions back to the original 'Well Being' labels.
predictions_labels = le.inverse_transform(predictions)

# Two-column submission table: the test ids next to their predicted label.
submission = test_data[['id']].assign(**{'Well Being': predictions_labels})

# Write the submission file without the row index.
submission.to_csv('submission_3.csv', index=False)

Parameters: { "use_label_encoder" } are not used.



Validation Accuracy: 0.9301692701246395
