In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

# Load the training and test datasets from the current working directory.
train_df = pd.read_csv("participants_training_dataset.csv")
test_df = pd.read_csv("participants_test_dataset.csv")

# Separate features from the target. "User_Key" is removed from the feature
# matrices; the test keys are kept aside for the submission file below.
X = train_df.drop(["User_Key", "Next_Purchase"], axis=1)
y = train_df["Next_Purchase"]
test_user_keys = test_df["User_Key"]
X_test = test_df.drop(["User_Key"], axis=1)

# Mean-impute numeric columns. The imputer is fitted on the training data only
# and then applied unchanged to the test data.
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
imputer = SimpleImputer(strategy="mean")
X[num_cols] = imputer.fit_transform(X[num_cols])
X_test[num_cols] = imputer.transform(X_test[num_cols])

# Label-encode categorical (object) columns using encoders fitted on train.
for col in X.select_dtypes(include=['object']).columns:
    # Fill missing values BEFORE casting to str: astype(str) turns NaN into
    # the literal string "nan", after which fillna() has nothing left to fill.
    X[col] = X[col].fillna("Unknown").astype(str)
    X_test[col] = X_test[col].fillna("Unknown").astype(str)

    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])  # Fit on train data only

    # Map test values through the fitted classes without mutating the encoder
    # (appending to classes_ breaks its sorted/unique invariant and can give
    # "Unknown" a different code in test than in train). Categories that were
    # never seen during training are encoded as -1.
    code_by_category = {category: code for code, category in enumerate(le.classes_)}
    X_test[col] = X_test[col].map(code_by_category).fillna(-1).astype(int)

# Hold out 20% of the training data for validation (fixed seed for
# reproducibility).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Validate the model on the held-out split
y_pred = model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_pred))

# Predict on test dataset
y_test_pred = model.predict(X_test)

# Create Submission File
#submission = pd.DataFrame({"User_Key": test_user_keys, "Next_Purchase": y_test_pred})
#submission.to_csv("submission.csv", index=False)
#print("Submission file saved as 'submission.csv'")


Validation Accuracy: 0.8737623762376238
