# In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

# Load datasets.
# DATA_DIR is the one place to change when the CSV files live somewhere other
# than the default directory below; pd.read_csv raises FileNotFoundError when
# a file is missing.
DATA_DIR = '/content'
train_features = pd.read_csv(f'{DATA_DIR}/training_set_features.csv')
train_labels = pd.read_csv(f'{DATA_DIR}/training_set_labels.csv')
test_features = pd.read_csv(f'{DATA_DIR}/test_set_features.csv')

# Merge training features with labels on the shared respondent key.
train_data = pd.merge(train_features, train_labels, on='respondent_id')

# Drop every object-dtype column (the model below is fed the remaining
# columns as-is) and the identifier, which is a key rather than a feature.
object_cols = train_data.select_dtypes(include='object').columns
train_data = train_data.drop(columns=object_cols)
train_data = train_data.drop(columns=['respondent_id'])
test_respondent_ids = test_features['respondent_id']  # Save respondent_ids for final submission

# Drop the same columns from the test features so both frames stay aligned.
test_features = test_features.drop(columns=object_cols)
test_features = test_features.drop(columns=['respondent_id'])

# Handle missing values with per-column medians learned from the training
# features only, then apply the same values to the test set.
# Why not forward fill: it copies the previous (unrelated) row's answer,
# leaves NaNs behind whenever the first row of a column is missing, and the
# `fillna(method=...)` keyword is deprecated in recent pandas releases.
# Filling with a Series only touches the columns it names, so the label
# columns in train_data are left untouched.
feature_cols = train_data.columns.intersection(test_features.columns)
feature_medians = train_data[feature_cols].median()
train_data = train_data.fillna(feature_medians)
test_features = test_features.fillna(feature_medians)

# Feature matrix = every remaining column except the two label columns.
target_columns = ['xyz_vaccine', 'seasonal_vaccine']
X = train_data.drop(columns=target_columns)
y_xyz, y_seasonal = train_data['xyz_vaccine'], train_data['seasonal_vaccine']

# Hold out 20% of the rows for validation. Both calls share the same options
# (including the seed), one call per target.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_valid, y_train_xyz, y_valid_xyz = train_test_split(
    X, y_xyz, **split_options
)
X_train_seasonal, X_valid_seasonal, y_train_seasonal, y_valid_seasonal = train_test_split(
    X, y_seasonal, **split_options
)

# One logistic-regression model per target; fit() returns the estimator, so
# construction and training are chained.
logreg_xyz = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train, y_train_xyz
)
logreg_seasonal = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_seasonal, y_train_seasonal
)

# Validation scores: take the second predict_proba column as the score
# passed to ROC AUC (presumably the positive class for 0/1 labels).
y_valid_xyz_pred = logreg_xyz.predict_proba(X_valid)[:, 1]
y_valid_seasonal_pred = logreg_seasonal.predict_proba(X_valid_seasonal)[:, 1]

roc_auc_xyz = roc_auc_score(y_valid_xyz, y_valid_xyz_pred)
roc_auc_seasonal = roc_auc_score(y_valid_seasonal, y_valid_seasonal_pred)
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

# Report each target's ROC AUC and their unweighted mean.
print(f'ROC AUC for xyz_vaccine: {roc_auc_xyz}')
print(f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}')
print(f'Average ROC AUC: {mean_roc_auc}')

# Predict probabilities on the test set; the second predict_proba column is
# used as the score (presumably the positive class for 0/1 labels).
preds_test_xyz = logreg_xyz.predict_proba(test_features)[:, 1]
preds_test_seasonal = logreg_seasonal.predict_proba(test_features)[:, 1]

# Prepare submission DataFrame.
# The first prediction column is named after the label the model was actually
# trained on ('xyz_vaccine'); the previous header 'h1n1_vaccine' matched no
# label column used in this script.
# NOTE(review): confirm the header against the competition's submission format.
submission_df = pd.DataFrame({
    'respondent_id': test_respondent_ids,
    'xyz_vaccine': preds_test_xyz,
    'seasonal_vaccine': preds_test_seasonal,
})

# Save submission file (index=False keeps the row index out of the CSV).
submission_df.to_csv('submission.csv', index=False)
print("Submission file saved as submission.csv")

# Output of the cell above when the CSV files were not present:
# FileNotFoundError: [Errno 2] No such file or directory: '/content/training_set_features.csv'