In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score

# Load the datasets
train_features = pd.read_csv('training_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')
test_features = pd.read_csv('test_set_features.csv')

# Separate numerical and categorical columns
numerical_cols = train_features.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = train_features.select_dtypes(include=['object']).columns

# Define transformers for preprocessing
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Apply the preprocessing to the training and test data
train_features_processed = preprocessor.fit_transform(train_features)
test_features_processed = preprocessor.transform(test_features)

# Split the training data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(train_features_processed, train_labels, test_size=0.2, random_state=42)

# Define the model
model = MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42))

# Train the model
model.fit(X_train, y_train)

# Predict probabilities
y_pred_prob = model.predict_proba(X_valid)

# Calculate ROC AUC score for each target variable
roc_auc_xyz = roc_auc_score(y_valid['xyz_vaccine'], y_pred_prob[0][:, 1])
roc_auc_seasonal = roc_auc_score(y_valid['seasonal_vaccine'], y_pred_prob[1][:, 1])

# Calculate mean ROC AUC score
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

print(f"ROC AUC for xyz_vaccine: {roc_auc_xyz}")
print(f"ROC AUC for seasonal_vaccine: {roc_auc_seasonal}")
print(f"Mean ROC AUC: {mean_roc_auc}")

# Generate predictions on the test set
test_pred_prob = model.predict_proba(test_features_processed)
test_pred_df = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': test_pred_prob[0][:, 1],
    'seasonal_vaccine': test_pred_prob[1][:, 1]
})

# Save the submission file
test_pred_df.to_csv('submission.csv', index=False)

print("Submission file created successfully!")


MemoryError: could not allocate 5600706560 bytes

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score

# Load the datasets
train_features = pd.read_csv('training_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')
test_features = pd.read_csv('test_set_features.csv')

# Separate numerical and categorical columns
numerical_cols = train_features.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = train_features.select_dtypes(include=['object']).columns

# Define transformers for preprocessing
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Apply the preprocessing to the training and test data
train_features_processed = preprocessor.fit_transform(train_features)
test_features_processed = preprocessor.transform(test_features)

# Reduce the dataset size to handle memory issues
train_features_sampled, _, train_labels_sampled, _ = train_test_split(
    train_features_processed, train_labels, train_size=0.1, random_state=42)

# Split the training data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(
    train_features_sampled, train_labels_sampled, test_size=0.2, random_state=42)

# Define the model
model = MultiOutputClassifier(LogisticRegression(max_iter=1000, random_state=42))

# Train the model
model.fit(X_train, y_train)

# Predict probabilities
y_pred_prob = model.predict_proba(X_valid)

# Calculate ROC AUC score for each target variable
roc_auc_xyz = roc_auc_score(y_valid['xyz_vaccine'], y_pred_prob[0][:, 1])
roc_auc_seasonal = roc_auc_score(y_valid['seasonal_vaccine'], y_pred_prob[1][:, 1])

# Calculate mean ROC AUC score
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

print(f"ROC AUC for xyz_vaccine: {roc_auc_xyz}")
print(f"ROC AUC for seasonal_vaccine: {roc_auc_seasonal}")
print(f"Mean ROC AUC: {mean_roc_auc}")

# Generate predictions on the test set
test_pred_prob = model.predict_proba(test_features_processed)
test_pred_df = pd.DataFrame({
    'respondent_id': test_features['respondent_id'],
    'xyz_vaccine': test_pred_prob[0][:, 1],
    'seasonal_vaccine': test_pred_prob[1][:, 1]
})

# Save the submission file
test_pred_df.to_csv('submission.csv', index=False)

print("Submission file created successfully!")


ROC AUC for xyz_vaccine: 0.40876906318082784
ROC AUC for seasonal_vaccine: 0.6967125283956968
Mean ROC AUC: 0.5527407957882623


MemoryError: Unable to allocate 435. MiB for an array with shape (26708, 2136) and data type float64

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score
import numpy as np

# Load the datasets
train_features = pd.read_csv('training_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')
test_features = pd.read_csv('test_set_features.csv')

# Separate numerical and categorical columns
numerical_cols = train_features.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = train_features.select_dtypes(include=['object']).columns

# Define transformers for preprocessing
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Apply the preprocessing to the training and test data
train_features_processed = preprocessor.fit_transform(train_features)
test_features_processed = preprocessor.transform(test_features)

# Reduce the dataset size to handle memory issues
train_features_sampled, _, train_labels_sampled, _ = train_test_split(
    train_features_processed, train_labels, train_size=0.1, random_state=42)

# Split the training data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(
    train_features_sampled, train_labels_sampled, test_size=0.2, random_state=42)

# Define the model
model = MultiOutputClassifier(LogisticRegression(max_iter=1000, random_state=42))

# Train the model
model.fit(X_train, y_train)

# Predict probabilities
y_pred_prob = model.predict_proba(X_valid)

# Calculate ROC AUC score for each target variable
roc_auc_xyz = roc_auc_score(y_valid['xyz_vaccine'], y_pred_prob[0][:, 1])
roc_auc_seasonal = roc_auc_score(y_valid['seasonal_vaccine'], y_pred_prob[1][:, 1])

# Calculate mean ROC AUC score
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

print(f"ROC AUC for xyz_vaccine: {roc_auc_xyz}")
print(f"ROC AUC for seasonal_vaccine: {roc_auc_seasonal}")
print(f"Mean ROC AUC: {mean_roc_auc}")

# Generate predictions on the test set in chunks
chunk_size = 5000  # adjust chunk size based on your memory capacity
num_chunks = int(np.ceil(test_features_processed.shape[0] / chunk_size))

test_preds = []
for i in range(num_chunks):
    start_idx = i * chunk_size
    end_idx = min((i + 1) * chunk_size, test_features_processed.shape[0])
    test_chunk = test_features_processed[start_idx:end_idx]
    test_pred_prob_chunk = model.predict_proba(test_chunk)
    test_preds.append({
        'respondent_id': test_features.iloc[start_idx:end_idx]['respondent_id'],
        'xyz_vaccine': test_pred_prob_chunk[0][:, 1],
        'seasonal_vaccine': test_pred_prob_chunk[1][:, 1]
    })

test_pred_df = pd.concat([pd.DataFrame(chunk) for chunk in test_preds])

# Save the submission file
test_pred_df.to_csv('submission.csv', index=False)

print("Submission file created successfully!")


ROC AUC for xyz_vaccine: 0.40876906318082784
ROC AUC for seasonal_vaccine: 0.6967125283956968
Mean ROC AUC: 0.5527407957882623


MemoryError: Unable to allocate 81.5 MiB for an array with shape (5000, 2136) and data type float64

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score
import numpy as np

# Load the datasets
train_features = pd.read_csv('/mnt/data/training_set_features.csv')
train_labels = pd.read_csv('/mnt/data/training_set_labels.csv')
test_features = pd.read_csv('/mnt/data/test_set_features.csv')

# Separate numerical and categorical columns
numerical_cols = train_features.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = train_features.select_dtypes(include=['object']).columns

# Define transformers for preprocessing
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Apply the preprocessing to the training and test data
train_features_processed = preprocessor.fit_transform(train_features)
test_features_processed = preprocessor.transform(test_features)

# Reduce the dataset size to handle memory issues
train_features_sampled, _, train_labels_sampled, _ = train_test_split(
    train_features_processed, train_labels, train_size=0.1, random_state=42)

# Split the training data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(
    train_features_sampled, train_labels_sampled, test_size=0.2, random_state=42)

# Define the model
model = MultiOutputClassifier(LogisticRegression(max_iter=1000, random_state=42))

# Train the model
model.fit(X_train, y_train)

# Predict probabilities
y_pred_prob = model.predict_proba(X_valid)

# Calculate ROC AUC score for each target variable
roc_auc_xyz = roc_auc_score(y_valid['xyz_vaccine'], y_pred_prob[0][:, 1])
roc_auc_seasonal = roc_auc_score(y_valid['seasonal_vaccine'], y_pred_prob[1][:, 1])

# Calculate mean ROC AUC score
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

print(f"ROC AUC for xyz_vaccine: {roc_auc_xyz}")
print(f"ROC AUC for seasonal_vaccine: {roc_auc_seasonal}")
print(f"Mean ROC AUC: {mean_roc_auc}")

# Generate predictions on the test set in smaller chunks
chunk_size = 10  # smaller chunk size
num_chunks = int(np.ceil(test_features_processed.shape[0] / chunk_size))

test_preds = []
for i in range(num_chunks):
    start_idx = i * chunk_size
    end_idx = min((i + 1) * chunk_size, test_features_processed.shape[0])
    test_chunk = test_features_processed[start_idx:end_idx]
    test_pred_prob_chunk = model.predict_proba(test_chunk)
    test_preds.append({
        'respondent_id': test_features.iloc[start_idx:end_idx]['respondent_id'],
        'xyz_vaccine': test_pred_prob_chunk[0][:, 1],
        'seasonal_vaccine': test_pred_prob_chunk[1][:, 1]
    })

test_pred_df = pd.concat([pd.DataFrame(chunk) for chunk in test_preds])

# Save the submission file
test_pred_df.to_csv('/mnt/data/submission.csv', index=False)

print("Submission file created successfully!")

FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/training_set_features.csv'

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score
import numpy as np

# Load the datasets
train_features = pd.read_csv('training_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')
test_features = pd.read_csv('test_set_features.csv')

# Separate numerical and categorical columns
numerical_cols = train_features.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = train_features.select_dtypes(include=['object']).columns

# Define transformers for preprocessing
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Apply the preprocessing to the training and test data
train_features_processed = preprocessor.fit_transform(train_features)
test_features_processed = preprocessor.transform(test_features)

# Reduce the dataset size to handle memory issues
train_features_sampled, _, train_labels_sampled, _ = train_test_split(
    train_features_processed, train_labels, train_size=0.1, random_state=42)

# Split the training data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(
    train_features_sampled, train_labels_sampled, test_size=0.2, random_state=42)

# Define the model
model = MultiOutputClassifier(LogisticRegression(max_iter=1000, random_state=42))

# Train the model
model.fit(X_train, y_train)

# Predict probabilities
y_pred_prob = model.predict_proba(X_valid)

# Calculate ROC AUC score for each target variable
roc_auc_xyz = roc_auc_score(y_valid['xyz_vaccine'], y_pred_prob[0][:, 1])
roc_auc_seasonal = roc_auc_score(y_valid['seasonal_vaccine'], y_pred_prob[1][:, 1])

# Calculate mean ROC AUC score
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

print(f"ROC AUC for xyz_vaccine: {roc_auc_xyz}")
print(f"ROC AUC for seasonal_vaccine: {roc_auc_seasonal}")
print(f"Mean ROC AUC: {mean_roc_auc}")

# Generate predictions on the test set in smaller chunks
chunk_size = 10  # smaller chunk size
num_chunks = int(np.ceil(test_features_processed.shape[0] / chunk_size))

test_preds = []
for i in range(num_chunks):
    start_idx = i * chunk_size
    end_idx = min((i + 1) * chunk_size, test_features_processed.shape[0])
    test_chunk = test_features_processed[start_idx:end_idx]
    test_pred_prob_chunk = model.predict_proba(test_chunk)
    test_preds.append({
        'respondent_id': test_features.iloc[start_idx:end_idx]['respondent_id'],
        'xyz_vaccine': test_pred_prob_chunk[0][:, 1],
        'seasonal_vaccine': test_pred_prob_chunk[1][:, 1]
    })

test_pred_df = pd.concat([pd.DataFrame(chunk) for chunk in test_preds])

# Save the submission file
test_pred_df.to_csv('submission.csv', index=False)

print("Submission file created successfully!")

ROC AUC for xyz_vaccine: 0.40876906318082784
ROC AUC for seasonal_vaccine: 0.6967125283956968
Mean ROC AUC: 0.5527407957882623
Submission file created successfully!
