In [None]:
pip install catboost




In [None]:
import pandas as pd

def dateconvert(df,column,reference_date):
    """Turn a date column into a day offset and add year/month/weekday columns.

    The column is parsed with ``pd.to_datetime``. Three companion columns,
    ``<column>_year``, ``<column>_month`` and ``<column>_weekday``, are written
    from the parsed dates; missing entries in them are replaced by the string
    ``'NO'``. The column itself is then overwritten with the number of days
    from each date to ``reference_date`` (``reference_date - date``).

    For the six column names listed in ``missing_day_fill`` below, missing day
    offsets are replaced by a fixed per-column value. Any other column keeps
    its missing values.

    Args:
        df: DataFrame containing ``column``. It is modified in place.
        column: Name of the date column to convert.
        reference_date: Value accepted by ``pd.to_datetime`` (e.g. ``'2023-12-31'``).

    Returns:
        The same ``df`` object that was passed in.
    """
    # Per-column replacement for a missing day offset. These values are also the
    # thresholds used by datedummy().
    missing_day_fill = {
        "EventBeginDate": 654,
        "EventEndDate": 654,
        "CustomerFirstWBBActionDate": 5023,
        "CustomerFirstWBBPurchaseDate": 4842,
        "CustomerLastWBBActionDate": 4807,
        "CustomerLastWBBPurchaseDate": 4729,
    }

    parsed = pd.to_datetime(df[column])

    # Calendar parts; rows without a date get the marker 'NO' instead of NaN.
    df[f'{column}_year'] = parsed.dt.year.fillna('NO')
    df[f'{column}_month'] = parsed.dt.month.fillna('NO')
    df[f'{column}_weekday'] = parsed.dt.weekday.fillna('NO')

    # Days elapsed between each date and the reference date.
    day_offsets = (pd.to_datetime(reference_date) - parsed).dt.days
    fill_value = missing_day_fill.get(column)
    if fill_value is not None:
        day_offsets = day_offsets.fillna(fill_value)
    df[column] = day_offsets

    return df

def _same_location_flag(df, customer_col, facility_col):
    """Return a 0/1 integer Series flagging rows where two text columns match.

    A row is flagged 1 only when both values are present and equal after
    lower-casing; a missing value on either side gives 0.
    """
    customer = df[customer_col].astype(object).str.lower()
    facility = df[facility_col].astype(object).str.lower()
    matches = customer.notna() & facility.notna() & (customer == facility)
    return matches.astype(int)


def feature_processing_train(df):
    """Encode the training target and add the SameState / SameCity flags.

    ``ActivityType`` is replaced by its integer code from ``mapping``; labels
    that are not keys of ``mapping`` become NaN (``Series.map`` behaviour).
    ``SameState`` and ``SameCity`` are 1 when the customer and facility values
    are both present and equal ignoring case, otherwise 0.

    Args:
        df: Training DataFrame with ``ActivityType``, ``CustomerState``,
            ``FacilityState``, ``CustomerCity`` and ``FacilityCity`` columns.
            It is modified in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    # Integer code of each target label.
    mapping = {'Multiple Activities':0, 'Other Secondary Activity':1, 'Primary Purchase':2, 'Secondary Purchase':3, 'Transfer Recipient': 4}
    df['ActivityType'] = df['ActivityType'].map(mapping)
    df['SameState'] = _same_location_flag(df, 'CustomerState', 'FacilityState')
    df['SameCity'] = _same_location_flag(df, 'CustomerCity', 'FacilityCity')

    return df

def feature_processing_test(df):
    """Add the SameState / SameCity flags to the test frame.

    Same flag logic as ``feature_processing_train``; the test frame has no
    target column to encode.

    Args:
        df: Test DataFrame with ``CustomerState``, ``FacilityState``,
            ``CustomerCity`` and ``FacilityCity`` columns. It is modified in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    df['SameState'] = _same_location_flag(df, 'CustomerState', 'FacilityState')
    df['SameCity'] = _same_location_flag(df, 'CustomerCity', 'FacilityCity')

    return df

def getdummy(train_df, test_df, category_variable, suffix='_dummy'):
    """One-hot encode one column on train and test using the train column layout.

    Indicator columns are built separately for each frame with
    ``pd.get_dummies``, cast to int, and given ``suffix``. The test indicators
    are then aligned to the train indicators: levels seen only in train become
    all-zero columns, levels seen only in test are dropped, and the column
    order follows train. The indicators are appended to copies of the input
    frames; the original categorical column is kept.

    Args:
        train_df: Training DataFrame.
        test_df: Test DataFrame.
        category_variable: Name of the column to encode (present in both frames).
        suffix: Text appended to every indicator column name.

    Returns:
        Tuple ``(train_df, test_df, dummy_columns)``: the two frames with the
        indicator columns appended (all column labels converted to ``str``)
        and the Index of indicator column names taken from train.
    """
    encoded_train = pd.get_dummies(train_df[category_variable]).astype(int).add_suffix(suffix)
    encoded_test = pd.get_dummies(test_df[category_variable]).astype(int).add_suffix(suffix)

    # Align test to the train layout in one step: missing levels are zero-filled,
    # test-only levels are dropped, and the order matches train.
    encoded_test = encoded_test.reindex(columns=encoded_train.columns, fill_value=0)

    train_out = pd.concat([train_df, encoded_train], axis=1)
    test_out = pd.concat([test_df, encoded_test], axis=1)

    # Downstream code selects features by string label.
    train_out.columns = train_out.columns.astype(str)
    test_out.columns = test_out.columns.astype(str)

    return train_out, test_out, encoded_train.columns

def datedummy(df):
    """Add a 0/1 ``<column>_dummy`` indicator for each of six day-offset columns.

    The indicator is 1 when the column value is strictly below that column's
    threshold and 0 otherwise. The thresholds equal the values dateconvert()
    uses to fill missing day offsets for the same columns.

    Args:
        df: DataFrame holding the six numeric day-offset columns. Modified in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    thresholds = {
        "EventBeginDate": 654,
        "EventEndDate": 654,
        "CustomerFirstWBBActionDate": 5023,
        "CustomerFirstWBBPurchaseDate": 4842,
        "CustomerLastWBBActionDate": 4807,
        "CustomerLastWBBPurchaseDate": 4729,
    }
    for source_col, threshold in thresholds.items():
        df[f"{source_col}_dummy"] = (df[source_col] < threshold).astype(int)
    return df



In [None]:
# processing for
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
label_encoder = LabelEncoder()
from datetime import datetime

df = pd.read_csv("DIWBB_Training.csv")
df_test = pd.read_csv("DIWBB_Test.csv")
submission = pd.read_csv("DIWBB_Submission_Template.csv")

# Each rule below follows the same pattern:
#   1. collect the RecordIDs of the matching test rows,
#   2. write 'Primary Purchase' for those RecordIDs in the submission,
#   3. drop the matching rows from both the training and the test frame,
#      so the models trained later only see rows no rule has decided.
# The rules are applied in sequence; each one sees the frames left by the previous one.

# Rows with a missing EventRoundName are dropped from both frames without touching
# the submission (the original note labels this case "No activity").
df = df[df['EventRoundName'].notnull()]
df_test = df_test[df_test['EventRoundName'].notnull()]

# Rule 1: EventRoundName is 'First Round' or 'First and Second Rounds' -> 'Primary Purchase'
early_rounds = ['First Round', 'First and Second Rounds']
rule_train = df['EventRoundName'].isin(early_rounds)
rule_test = df_test['EventRoundName'].isin(early_rounds)
df_fs = df_test.loc[rule_test, 'RecordID']
submission.loc[submission['RecordID'].isin(df_fs), 'ActivityType'] = 'Primary Purchase'
df = df[~rule_train]
df_test = df_test[~rule_test]

# Rule 2: EventRoundName is 'Regionals' and EventSession is 'All-Session' -> 'Primary Purchase'
rule_train = (df['EventRoundName'] == 'Regionals') & (df['EventSession'] == 'All-Session')
rule_test = (df_test['EventRoundName'] == 'Regionals') & (df_test['EventSession'] == 'All-Session')
df_ra = df_test.loc[rule_test, 'RecordID']
submission.loc[submission['RecordID'].isin(df_ra), 'ActivityType'] = 'Primary Purchase'
df = df[~rule_train]
df_test = df_test[~rule_test]

# Rule 3: ChampionshipYear is 2022 and EventRoundName is 'Regionals' -> 'Primary Purchase'
rule_train = (df['EventRoundName'] == 'Regionals') & (df['ChampionshipYear'] == 2022)
rule_test = (df_test['EventRoundName'] == 'Regionals') & (df_test['ChampionshipYear'] == 2022)
df_22r = df_test.loc[rule_test, 'RecordID']
submission.loc[submission['RecordID'].isin(df_22r), 'ActivityType'] = 'Primary Purchase'
df = df[~rule_train]
df_test = df_test[~rule_test]

# Rule 4: ChampionshipYear is 2023, EventRoundName is 'Regionals', EventBeginDate equals
# EventEndDate, and EventSession is not 'Session 4' -> 'Primary Purchase'
rule_train = (
    (df['EventRoundName'] == 'Regionals')
    & (df['ChampionshipYear'] == 2023)
    & (df['EventBeginDate'] == df['EventEndDate'])
    & (df['EventSession'] != 'Session 4')
)
rule_test = (
    (df_test['EventRoundName'] == 'Regionals')
    & (df_test['ChampionshipYear'] == 2023)
    & (df_test['EventBeginDate'] == df_test['EventEndDate'])
    & (df_test['EventSession'] != 'Session 4')
)
df_23r = df_test.loc[rule_test, 'RecordID']
submission.loc[submission['RecordID'].isin(df_23r), 'ActivityType'] = 'Primary Purchase'
df = df[~rule_train]
df_test = df_test[~rule_test]


  df = pd.read_csv("DIWBB_Training.csv")


In [None]:
#Model
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
label_encoder = LabelEncoder()
from datetime import datetime
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
from catboost import CatBoostClassifier  # Import CatBoost
label_encoder = LabelEncoder()
from datetime import datetime
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Target encoding (train only) and the SameState / SameCity flags (both frames).
df = feature_processing_train(df)
df_test = feature_processing_test(df_test)

# Convert every date column to "days before the reference date" and add its
# year / month / weekday part columns. The conversion order is kept as originally written.
reference_date = '2023-12-31'
for date_col in (
    'CustomerLastWBBPurchaseDate',
    'CustomerLastWBBActionDate',
    'CustomerFirstWBBActionDate',
    'CustomerFirstWBBPurchaseDate',
    'EventBeginDate',
    'EventEndDate',
):
    df = dateconvert(df, date_col, reference_date)
    df_test = dateconvert(df_test, date_col, reference_date)

# Names of the day-offset columns and of the part columns dateconvert() created for them.
date_variables = ['EventBeginDate','EventEndDate','CustomerFirstWBBActionDate','CustomerFirstWBBPurchaseDate','CustomerLastWBBActionDate','CustomerLastWBBPurchaseDate']
date_variables_year = [f'{name}_year' for name in date_variables]
date_variables_month = [f'{name}_month' for name in date_variables]
date_variables_weekday = [f'{name}_weekday' for name in date_variables]

# One-hot encode each date-part column; the suffix is the part column's own name,
# so indicator labels stay unique across date columns.
dummy_date_variables_year = []
for part_col in date_variables_year:
    df, df_test, dummy = getdummy(df, df_test, part_col, suffix=f'_{part_col}')
    dummy_date_variables_year.extend(dummy)

dummy_date_variables_month = []
for part_col in date_variables_month:
    df, df_test, dummy = getdummy(df, df_test, part_col, suffix=f'_{part_col}')
    dummy_date_variables_month.extend(dummy)

dummy_date_variables_weekday = []
for part_col in date_variables_weekday:
    df, df_test, dummy = getdummy(df, df_test, part_col, suffix=f'_{part_col}')
    dummy_date_variables_weekday.extend(dummy)

# One-hot encode the categorical columns. Each call returns the indicator column
# names under its own variable because the feature list below (and later cells)
# refer to them by name.
df, df_test, dummy_EventRoundName = getdummy(
    df, df_test, "EventRoundName", suffix='_EventRoundName')
df, df_test, dummy_EventSession = getdummy(
    df, df_test, "EventSession", suffix='_EventSession')
df, df_test, dummy_FacilityDescription = getdummy(
    df, df_test, "FacilityDescription", suffix='_FacilityDescription')
df, df_test, dummy_CustomerState = getdummy(
    df, df_test, "CustomerState", suffix='_CustomerState')
df, df_test, dummy_FacilityState = getdummy(
    df, df_test, "FacilityState", suffix='_FacilityState')
df, df_test, dummy_HasCustomerClickedOrOpenedEmailsSixMonthsPrior = getdummy(
    df, df_test, "HasCustomerClickedOrOpenedEmailsSixMonthsPrior",
    suffix='_HasCustomerClickedOrOpenedEmailsSixMonthsPrior')
df, df_test, dummy_HostingInstitution = getdummy(
    df, df_test, "HostingInstitution", suffix='_HostingInstitution')
df, df_test, dummy_IsCustomerInNCAAMembership = getdummy(
    df, df_test, 'IsCustomerInNCAAMembership', suffix='_IsCustomerInNCAAMembership')
df, df_test, dummy_FacilityName = getdummy(
    df, df_test, 'FacilityName', suffix='_FacilityName')
df, df_test, dummy_IsEventFinalSite = getdummy(
    df, df_test, 'IsEventFinalSite', suffix='_IsEventFinalSite')
# FacilityCity and FacilityZipCode are not encoded.

# Full model feature list: day offsets, date-part indicators, categorical
# indicators, and the two location flags.
features = [
    *date_variables,
    *dummy_date_variables_year,
    *dummy_date_variables_month,
    *dummy_date_variables_weekday,
    *dummy_FacilityState,
    *dummy_CustomerState,
    *dummy_FacilityDescription,
    *dummy_EventSession,
    *dummy_EventRoundName,
    *dummy_HostingInstitution,
    *dummy_HasCustomerClickedOrOpenedEmailsSixMonthsPrior,
    *dummy_IsCustomerInNCAAMembership,
    *dummy_IsEventFinalSite,
    *dummy_FacilityName,
    'SameState',
    'SameCity',
]

# Targets and per-year training sets: one model is fitted per ChampionshipYear.
y = df['ActivityType']
num_classes = len(df['ActivityType'].unique())
is_2022 = df['ChampionshipYear'] == 2022
is_2023 = df['ChampionshipYear'] == 2023
X_2022 = df.loc[is_2022, features]
X_2023 = df.loc[is_2023, features]
y_2022 = df.loc[is_2022, 'ActivityType']
y_2023 = df.loc[is_2023, 'ActivityType']

Xys = [[X_2022,y_2022],[X_2023,y_2023]]
acc = []

# Inverse of the integer encoding applied to ActivityType in feature_processing_train.
mapping = {0:'Multiple Activities', 1:'Other Secondary Activity', 2:'Primary Purchase', 3:'Secondary Purchase', 4:'Transfer Recipient'}

# One (RecordID, Result) frame per year, concatenated once after the loop.
yearly_results = []

for index, Xy in enumerate(Xys):

    # Split data into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(Xy[0], Xy[1], test_size=0.2, random_state=42)

    if index == 0:
        # 2022 rows: XGBoost
        model = xgb.XGBClassifier(objective='multi:softmax',
                                  num_class=num_classes,
                                  reg_alpha=0,  # Adjust this value for L1 regularization
                                  reg_lambda=0,  # Adjust this value for L2 regularization
                                  gamma=0,  # Adjust this value for complexity control
                                  max_depth=50,  # Adjust this value for tree depth regularization
                                  min_child_weight=1)  # Adjust this value for regularization
        model.fit(X_train, y_train)

    elif index == 1:
        # 2023 rows: CatBoost
        model = CatBoostClassifier(loss_function='MultiClass',
                                   n_estimators=800,
                                   depth=16,
                                   learning_rate=0.01,
                                   verbose=100)  # Adjust hyperparameters as needed
        model.fit(X_train, y_train)

    # Predictions are flattened to 1-D everywhere so both model types are handled alike.
    y_train_predict = np.asarray(model.predict(X_train)).ravel()
    accuracy_train = accuracy_score(y_train, y_train_predict)
    print("Train_Accuracy:", accuracy_train)
    # Make predictions
    y_pred = np.asarray(model.predict(X_val)).ravel()
    # Evaluate model
    accuracy = accuracy_score(y_val, y_pred)
    print("Validation_Accuracy:", accuracy)
    acc.append(accuracy)

    # Predict the test rows of the year this model was trained on (index 0 -> 2022, 1 -> 2023).
    test_year = df_test[df_test['ChampionshipYear'] == index+2022]
    if test_year.empty:
        continue

    # The recorded run of this cell stopped with
    # "TypeError: unhashable type: 'numpy.ndarray'" at the label lookup right after
    # the CatBoost iteration: the prediction elements were arrays, not scalars.
    # Flattening and casting to int gives hashable keys for both model types.
    y_test = np.asarray(model.predict(test_year[features])).ravel()
    y_test = np.array([mapping[int(val)] for val in y_test])

    yearly_results.append(
        pd.DataFrame({'RecordID': test_year['RecordID'].to_numpy(), 'Result': y_test})
    )

if yearly_results:
    results = pd.concat(yearly_results, ignore_index=True)
else:
    results = pd.DataFrame(columns=['RecordID', 'Result'])

# Overwrite the submission's ActivityType with the model prediction wherever one exists;
# rows already decided by the rule-based step have no model prediction and are kept.
submission = pd.merge(submission, results, how='left', on='RecordID')
has_model_result = ~submission['Result'].isna()
submission.loc[has_model_result, 'ActivityType'] = submission.loc[has_model_result, 'Result']
submission = submission.drop(columns=['Result'])
current_time = datetime.now()
formatted_date = current_time.strftime("%Y%m%d")
formatted_time = current_time.strftime("%H%M%S")
submission.to_csv(f"submission_LastDance_{formatted_date}_{formatted_time}_{acc[0]:.4f}_{acc[1]:.4f}.csv", index=False)

Train_Accuracy: 0.9620689655172414
Validation_Accuracy: 0.8264462809917356
0:	learn: 1.6000401	total: 2.05ms	remaining: 1.64s
100:	learn: 1.2280985	total: 2.4s	remaining: 16.6s
200:	learn: 1.1433883	total: 32.1s	remaining: 1m 36s
300:	learn: 1.1130135	total: 1m 4s	remaining: 1m 47s
400:	learn: 1.0975902	total: 1m 19s	remaining: 1m 19s
500:	learn: 1.0880552	total: 1m 40s	remaining: 1m
600:	learn: 1.0807901	total: 2m 4s	remaining: 41.2s
700:	learn: 1.0734523	total: 2m 39s	remaining: 22.6s
799:	learn: 1.0652902	total: 4m 8s	remaining: 0us
Train_Accuracy: 0.5635057471264368
Validation_Accuracy: 0.5579793340987371


TypeError: unhashable type: 'numpy.ndarray'

In [None]:
# Model
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
from catboost import CatBoostClassifier  # Import CatBoost
label_encoder = LabelEncoder()
from datetime import datetime
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Assume feature_processing_train, feature_processing_test, dateconvert, getdummy functions are defined above

# Feature processing and dummy variable creation assumed to be defined above

# Define features and labels
features = date_variables + dummy_date_variables_year + dummy_date_variables_month + dummy_date_variables_weekday + \
     list(dummy_FacilityState) + list(dummy_CustomerState) + list(dummy_FacilityDescription) + \
     list(dummy_EventSession) + list(dummy_EventRoundName) + \
    list(dummy_HostingInstitution) + list(dummy_HasCustomerClickedOrOpenedEmailsSixMonthsPrior) + \
    list(dummy_IsCustomerInNCAAMembership) + list(dummy_IsEventFinalSite) + \
    list(dummy_FacilityName) + \
    ['SameState', 'SameCity']

# Targets and per-year training sets: one model is fitted per ChampionshipYear.
y = df['ActivityType']
num_classes = len(df['ActivityType'].unique())
is_2022 = df['ChampionshipYear'] == 2022
is_2023 = df['ChampionshipYear'] == 2023
X_2022 = df.loc[is_2022, features]
X_2023 = df.loc[is_2023, features]
y_2022 = df.loc[is_2022, 'ActivityType']
y_2023 = df.loc[is_2023, 'ActivityType']

Xys = [[X_2022, y_2022], [X_2023, y_2023]]
acc = []

# Inverse of the integer encoding applied to ActivityType in feature_processing_train.
mapping = {0:'Multiple Activities', 1:'Other Secondary Activity', 2:'Primary Purchase', 3:'Secondary Purchase', 4:'Transfer Recipient'}

# One (RecordID, Result) frame per year, concatenated once after the loop.
yearly_results = []

for index, Xy in enumerate(Xys):
    # Split data into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(Xy[0], Xy[1], test_size=0.2, random_state=42)

    if index == 0:
        # 2022 rows: XGBoost
        model = xgb.XGBClassifier(objective='multi:softmax',
                                  num_class=num_classes,
                                  reg_alpha=0,  # Adjust this value for L1 regularization
                                  reg_lambda=0,  # Adjust this value for L2 regularization
                                  gamma=0,  # Adjust this value for complexity control
                                  max_depth=50,  # Adjust this value for tree depth regularization
                                  min_child_weight=1)  # Adjust this value for regularization
        model.fit(X_train, y_train)

    elif index == 1:
        # 2023 rows: CatBoost
        model = CatBoostClassifier(loss_function='MultiClass',
                                   n_estimators=1500,
                                   depth=10,
                                   learning_rate=0.1,
                                   verbose=100)  # Adjust hyperparameters as needed
        model.fit(X_train, y_train)

    # Predictions are flattened to 1-D everywhere so both model types are handled alike.
    y_train_predict = np.asarray(model.predict(X_train)).ravel()
    accuracy_train = accuracy_score(y_train, y_train_predict)
    print("Train_Accuracy:", accuracy_train)

    # Make predictions
    y_pred = np.asarray(model.predict(X_val)).ravel()
    # Evaluate model
    accuracy = accuracy_score(y_val, y_pred)
    print("Validation_Accuracy:", accuracy)
    acc.append(accuracy)

    # Test-set predictions for the year this model was trained on (index 0 -> 2022, 1 -> 2023).
    # Previously this step was only a placeholder comment, so `results` stayed empty and
    # the written submission carried no model predictions.
    test_year = df_test[df_test['ChampionshipYear'] == index+2022]
    if test_year.empty:
        continue

    # Flatten and cast to int so each prediction is a hashable key for the label lookup,
    # whichever shape the model's predict() returns.
    y_test = np.asarray(model.predict(test_year[features])).ravel()
    y_test = np.array([mapping[int(val)] for val in y_test])

    yearly_results.append(
        pd.DataFrame({'RecordID': test_year['RecordID'].to_numpy(), 'Result': y_test})
    )

if yearly_results:
    results = pd.concat(yearly_results, ignore_index=True)
else:
    results = pd.DataFrame(columns=['RecordID', 'Result'])

# Overwrite the submission's ActivityType with the model prediction wherever one exists;
# rows already decided by the rule-based step have no model prediction and are kept.
submission = pd.merge(submission, results, how='left', on='RecordID')
has_model_result = ~submission['Result'].isna()
submission.loc[has_model_result, 'ActivityType'] = submission.loc[has_model_result, 'Result']
submission = submission.drop(columns=['Result'])
current_time = datetime.now()
formatted_date = current_time.strftime("%Y%m%d")
formatted_time = current_time.strftime("%H%M%S")
submission.to_csv(f"submission_LastDance_{formatted_date}_{formatted_time}_{acc[0]:.4f}_{acc[1]:.4f}.csv", index=False)




In [None]:
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# One-vs-rest ROC / AUC on the validation split of the most recently fitted model
# (`model`, `X_val`, `y_val` come from the training cell).
y_prob = model.predict_proba(X_val)

# Iterate over the classes the fitted model actually knows, using each class's column
# position in y_prob. Indexing by range(num_classes) breaks when the model was trained
# on fewer classes than the full data set contains.
auc_scores = []
roc_curves = []  # (class label, fpr, tpr, auc) for every class with a defined ROC curve
for column_index, class_label in enumerate(model.classes_):
    is_class = (y_val == class_label).astype(int)
    if is_class.nunique() < 2:
        # roc_auc_score raises when only one outcome is present in y_true.
        print(f"AUC for class {class_label}: undefined (validation split has a single outcome for this class)")
        continue
    class_scores = y_prob[:, column_index]
    auc_score = roc_auc_score(is_class, class_scores)
    fpr, tpr, _ = roc_curve(is_class, class_scores)
    auc_scores.append(auc_score)
    roc_curves.append((class_label, fpr, tpr, auc_score))
    print(f"AUC for class {class_label}: {auc_score}")

# Plot ROC curve
plt.figure(figsize=(8, 6))
for class_label, fpr, tpr, auc_score in roc_curves:
    plt.plot(fpr, tpr, label=f'Class {class_label} (AUC = {auc_score:.2f})')

# Diagonal = chance-level classifier.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Feature importances of the most recently fitted model, ranked from most to least
# important (`model` and `X_train` come from the training cell).
importances = model.feature_importances_
feature_names = X_train.columns

# Positions of the features in descending order of importance.
indices = np.argsort(importances)[::-1]
sorted_importances = importances[indices]
sorted_feature_names = list(feature_names[indices])

# Bar chart with one bar per feature, in ranked order.
n_features = X_train.shape[1]
bar_positions = range(n_features)
plt.figure(figsize=(10, 6))
plt.title("Feature Importances")
plt.bar(bar_positions, sorted_importances, align="center")
plt.xticks(bar_positions, sorted_feature_names, rotation=90)
plt.xlim([-1, n_features])
plt.tight_layout()
plt.show()

# Text listing: feature name on one line, its importance on the next.
for feature_name, importance in zip(sorted_feature_names, sorted_importances):
    print(feature_name)
    print(importance)