In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# File paths
data1_path = '/content/data1.csv'
data2_path = '/content/data2.csv'
data3_path = '/content/data3 (1).csv'
test_data_path = '/content/modifiedagaintest - test.csv.csv'

# Read and process training data
data1 = pd.read_csv(data1_path)
data2 = pd.read_csv(data2_path).drop(columns=['name'])  # Drop 'name' column immediately
data3 = pd.read_csv(data3_path)

# Merge all datasets on 'id'
train_data = data1.merge(data2, on='id').merge(data3, on='id')

# Drop unnecessary columns in one go
drop_columns = ['username', 'mail', 'address', 'birthdate', 'sex']
train_data = train_data.drop(columns=drop_columns)

# Save processed train data
train_data.to_csv('modified_train_data.csv', index=False)

# Prepare training features (X) and labels (y)
X_train = train_data.drop(columns=['Well Being', 'name', 'id'])
y_train = train_data['Well Being']

# Handle missing values using SimpleImputer with mode strategy
imputer = SimpleImputer(strategy='most_frequent')
X_train = imputer.fit_transform(X_train)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train the model (RandomForest for now, but can be replaced with XGBoost/LightGBM for better efficiency)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Validate the model
y_pred = clf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy}')

# Test Data Processing
test_data = pd.read_csv(test_data_path)
test_data_features = test_data.drop(columns=drop_columns + ['id', 'name'])

# Impute missing values in test data using median strategy
imputer_median = SimpleImputer(strategy='median')
test_data_features = imputer_median.fit_transform(test_data_features)

# Predict on test data
predictions = clf.predict(test_data_features)

# Prepare submission file
submission = pd.DataFrame({
    'id': test_data['id'],
    'Well Being': predictions
})

# Save to CSV
submission.to_csv('submission.csv', index=False)


Validation Accuracy: 0.885538562020356
