In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder

In [2]:
# Shared label encoder; it is fitted further down on the "account.id" column
# to build the integer "account_encoded.id" feature.
le = LabelEncoder()

In [3]:
# Read every feature CSV from ../data/features into its own DataFrame.

# Modelling targets: train.csv carries the "label" column, test.csv the IDs to score.
train_df = pd.read_csv("../data/features/train.csv")
test_df = pd.read_csv("../data/features/test.csv")

# Account-level tables joined on "account.id" further down.
# account.csv is decoded as latin-1 rather than pandas' default UTF-8.
accounts_df = pd.read_csv("../data/features/account.csv", encoding="latin-1")
subscriptions_df = pd.read_csv("../data/features/subscriptions.csv")

# Remaining tables (loaded here, not used by the cells shown below).
tickets_df = pd.read_csv("../data/features/tickets_all.csv")
concerts_old_df = pd.read_csv("../data/features/concerts.csv")
concerts_new_df = pd.read_csv("../data/features/concerts_2014-15.csv")
zipcodes_df = pd.read_csv("../data/features/zipcodes.csv")

In [4]:
# Feature engineering, model training and hold-out evaluation.
#
# Inputs : accounts_df, subscriptions_df, train_df, le (from earlier cells).
# Outputs: final_df (one feature row per account), model (fitted forest),
#          X / y and the train/validation splits, plus acc / auc.

# One seed shared by the split and the forest. Without a seed on the forest
# the reported ACC/AUC (and the submission) change on every run.
RANDOM_STATE = 0

# Remove unnecessary columns and fill NaN values with 0.
# The frame is rebound instead of mutated with inplace=True, and
# errors="ignore" is used, so re-running this cell does not raise KeyError
# once the columns have already been dropped.
columns_to_drop = ["shipping.zip.code", "shipping.city", "relationship", "first.donated"]
accounts_df = accounts_df.drop(columns=columns_to_drop, errors="ignore").fillna(0)

# Calculate the number of subscriptions per account. transform("count")
# broadcasts the per-account count back onto every subscription row.
subscriptions_df = subscriptions_df.assign(
    num_subscriptions=subscriptions_df.groupby("account.id")["account.id"].transform("count")
)[["account.id", "package", "price.level", "subscription_tier", "num_subscriptions"]]

# Merge accounts and subscriptions data.
final_df = (
    accounts_df.merge(subscriptions_df, on="account.id", how="left")
    # Keep a single row per account: the first subscription row after the merge.
    .drop_duplicates(subset=["account.id"])
    # Accounts without any subscription get 0 in the subscription columns.
    .fillna(0)
)

# Encode account IDs.
# NOTE(review): this feeds an integer code of the account identifier to the
# model as a feature; confirm this is intentional, since an ID carries no
# signal by itself and can leak ordering information.
final_df["account_encoded.id"] = le.fit_transform(final_df["account.id"])
final_train_df = train_df.merge(final_df, on="account.id", how="left")

# Prepare data for training and validation.
y = final_train_df["label"]
X = final_train_df.drop(columns=["account.id", "label", "billing.zip.code", "billing.city", "package"])
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.15, random_state=RANDOM_STATE
)

# Train a Random Forest Classifier (seeded for reproducible results).
model = RandomForestClassifier(random_state=RANDOM_STATE)
model.fit(X_train, y_train)

# Predict probabilities and calculate accuracy and AUC.
# Column 1 of predict_proba is the probability of the second entry in model.classes_.
probs = model.predict_proba(X_val)[:, 1]
acc = accuracy_score(y_val, model.predict(X_val))
auc = roc_auc_score(y_val, probs)

# Print results
print("ACC: " + str(acc))
print("AUC: " + str(auc))

ACC: 0.9817658349328215
AUC: 0.9724844599844601


In [5]:
# Score the test accounts with the fitted model and write the submission file.

# Attach the per-account features to each test ID. The left join keeps every
# row of test_df, in its original order.
final_test_df = test_df.merge(final_df, left_on="ID", right_on="account.id", how="left")

# Strip identifier and free-text columns so only model features remain.
non_feature_columns = ["ID", "account.id", "billing.zip.code", "billing.city", "package"]
X_final_test = final_test_df.drop(columns=non_feature_columns)

# Second predict_proba column, same convention as the validation scoring.
test_probs = model.predict_proba(X_final_test)[:, 1]

# Build the two-column submission frame and preview the first rows.
submission_df = test_df[["ID"]].assign(Predicted=test_probs)
display(submission_df.head())

# Persist without the index so the file contains only ID and Predicted.
submission_df.to_csv("../submissions/submission.csv", index=False)

Unnamed: 0,ID,Predicted
0,001i000000NuQ6Y,0.02
1,001i000000NuQXz,0.19
2,001i000000NuRDC,0.0
3,001i000000NuQeg,0.2
4,001i000000NuOQc,0.01
