Team Gordon

Student Name	Student Number
 Alisha Sahota	20497348
 Anthony Ramelo	20499391
 Chris Wu	10182394
 Elizabeth Zhang	20161231
 Emily Zhao	10096273
 Sam Hossain	20466500


In [None]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Set plot style: apply seaborn's 'whitegrid' theme to the figures created below
sns.set(style='whitegrid')

# Constants
# Directory that receives every artefact written by the functions below
# (processed data, model dataset, predictions and the feature-importance plot).
OUTPUT_DIR = 'output_engagement'
# Create the directory at import time; exist_ok=True makes re-running the cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Calendar year at the moment the module is executed.
# NOTE(review): not referenced anywhere in the visible code - confirm it is still needed.
CURRENT_YEAR = datetime.now().year

# Functions

def load_data(file_path, sheet_name):
    """
    Read one worksheet of an Excel workbook into a DataFrame.

    Leading and trailing whitespace is removed from every column label so
    that later lookups by name are not defeated by stray spaces in the
    header row.
    """
    frame = pd.read_excel(file_path, sheet_name=sheet_name)
    trimmed_labels = frame.columns.str.strip()
    frame.columns = trimmed_labels
    return frame

def ensure_column_exists(df, column_name, alternative_names=None):
    """
    Guarantee that ``df`` has a column called ``column_name``.

    If the column is missing, the first entry of ``alternative_names`` that
    is present in ``df`` is renamed to ``column_name`` (the frame is modified
    in place). Returns ``column_name``; raises ``ValueError`` when neither
    the column nor any alternative can be found.
    """
    if column_name not in df.columns:
        candidates = alternative_names if alternative_names else ()
        present = [name for name in candidates if name in df.columns]
        if not present:
            raise ValueError(f"Column '{column_name}' or alternatives {alternative_names} not found.")
        # Only the first matching alternative is renamed, in the order given.
        df.rename(columns={present[0]: column_name}, inplace=True)
    return column_name

def create_engagement_features(df):
    """
    Create engagement-related features for analysis.

    Adds three columns to ``df`` (the frame is modified in place and also
    returned):

    * ``Consistency_Score``: 'Average activities per day' divided by
      'Average total activities per month'.
    * ``Engagement_Regularity``: ``Consistency_Score`` multiplied by
      'Average activities per day'.
    * ``Activity_Diversity``: how many of 'Quiz Count', 'Mood Count' and
      'Inspiration Count' are greater than zero.

    Note that, after the features are built, every infinite or missing value
    in the WHOLE frame (not only the new columns) is replaced with 0.
    """
    monthly_col = 'Average total activities per month'
    daily_col = 'Average activities per day'

    # Avoid division by zero. The result is assigned back to the column
    # instead of calling Series.replace(..., inplace=True) on df[monthly_col]:
    # that chained in-place form operates on a temporary and leaves the
    # frame untouched when pandas Copy-on-Write is active.
    df[monthly_col] = df[monthly_col].replace(0, np.nan)

    # Engagement Metrics
    df['Consistency_Score'] = df[daily_col] / df[monthly_col]
    df['Engagement_Regularity'] = df['Consistency_Score'] * df[daily_col]
    df['Activity_Diversity'] = df[['Quiz Count', 'Mood Count', 'Inspiration Count']].gt(0).sum(axis=1)

    # Handle infinite and missing values (applies to every column of df,
    # which also turns the NaN placeholders introduced above back into 0).
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(0, inplace=True)
    return df

def derive_engagement_level(df):
    """
    Add a binary 'Engagement_Level' column based on 'Consistency_Score'.

    Rows whose score is at or above the column median get 1, all others 0.
    When 'Consistency_Score' is absent a warning is printed and the frame is
    returned unchanged. The frame is modified in place and returned.
    """
    if 'Consistency_Score' not in df.columns:
        print("Warning: 'Consistency_Score' not found. Cannot derive 'Engagement_Level'.")
        return df

    scores = df['Consistency_Score']
    threshold = scores.median()
    at_or_above = scores.ge(threshold)
    df['Engagement_Level'] = at_or_above.astype(int)
    return df

def clean_data(df):
    """
    Return a cleaned copy of ``df`` with infinities and NaNs imputed.

    Positive and negative infinity are first treated as missing; every
    missing value in a numeric column is then replaced by that column's
    median. Non-numeric columns are passed through untouched.

    The input frame is not modified. Callers frequently pass a slice
    produced by ``train_test_split``, and replacing values in place on such
    a slice both mutates the caller's data and triggers pandas
    chained-assignment warnings.
    """
    without_inf = df.replace([np.inf, -np.inf], np.nan)
    # numeric_only=True keeps the median well defined when the frame also
    # carries identifier or text columns.
    column_medians = without_inf.median(numeric_only=True)
    return without_inf.fillna(column_medians)

def create_model_dataset(df):
    """
    Write the modelling subset of ``df`` to an Excel file in OUTPUT_DIR.

    The file contains 'ID', whichever of the engagement feature columns are
    present in ``df``, and the 'Engagement_Level' target. If the target
    column is missing, a warning is printed and nothing is written.
    """
    if 'Engagement_Level' not in df.columns:
        print("Warning: 'Engagement_Level' not found. Engagement model data not created.")
        return

    # Candidate predictors; any that the frame lacks are silently dropped.
    candidate_columns = (
        'Mood Count',
        'Consistency_Score',
        'Engagement_Regularity',
        'Activity_Diversity',
        'Total Activities',
    )
    available = [name for name in candidate_columns if name in df.columns]

    # 'ID' leads so each row can be traced back to its source record.
    selected = ['ID', *available, 'Engagement_Level']
    model_frame = df[selected]

    output_path = os.path.join(OUTPUT_DIR, 'engagement_model_data.xlsx')
    model_frame.to_excel(output_path, index=False)
    print(f"Engagement model data saved to '{output_path}'.")

def _fit_and_report(label, model, X_train, y_train, X_test, y_test):
    """Fit one classifier, print its classification report and AUC, and return its test predictions."""
    print(f"\nTraining {label} for Engagement...")
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(f"\n{label} Report:")
    print(classification_report(y_test, predictions))
    # Column 1 of predict_proba holds the probability of the positive class.
    positive_scores = model.predict_proba(X_test)[:, 1]
    print(f"AUC ({label}): {roc_auc_score(y_test, positive_scores):.2f}")
    return predictions


def train_engagement_model():
    """
    Train models to predict engagement levels and evaluate their performance.

    Reads 'engagement_model_data.xlsx' from OUTPUT_DIR, makes a stratified
    70/30 train/test split, then fits Logistic Regression, Random Forest and
    XGBoost classifiers on standardised features. For each model the
    classification report and ROC AUC on the test split are printed.

    Side effects: writes 'engagement_feature_importances.png' (Random Forest
    importances) and 'engagement_predictions.xlsx' (actual vs. predicted
    labels for the test split) into OUTPUT_DIR.
    """
    # Load the data
    data_path = os.path.join(OUTPUT_DIR, 'engagement_model_data.xlsx')
    data = pd.read_excel(data_path)

    # Define features and target
    engagement_features = [
        'Mood Count',
        'Engagement_Regularity',
        'Activity_Diversity',
        'Total Activities'
    ]
    X = data[engagement_features]
    y = data['Engagement_Level']

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # Clean data. Infinities are treated as missing, then every gap is filled
    # with the median learned from the TRAINING split only. Imputing the test
    # split with its own medians would let test-set statistics leak into the
    # evaluation and would preprocess the two splits inconsistently.
    X_train = X_train.replace([np.inf, -np.inf], np.nan)
    X_test = X_test.replace([np.inf, -np.inf], np.nan)
    train_medians = X_train.median()
    X_train = X_train.fillna(train_medians)
    X_test = X_test.fillna(train_medians)

    # Feature Scaling (fitted on the training split, applied to both)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Insertion order fixes both the print order and the column order of the
    # predictions file.
    models = {
        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
        'Random Forest': RandomForestClassifier(max_depth=5, n_estimators=100, random_state=42),
        'XGBoost': XGBClassifier(learning_rate=0.01, max_depth=3, n_estimators=100, random_state=42, eval_metric='logloss'),
    }
    predictions = {
        label: _fit_and_report(label, model, X_train_scaled, y_train, X_test_scaled, y_test)
        for label, model in models.items()
    }

    # Feature importance for Random Forest
    rf = models['Random Forest']
    feature_importances = pd.Series(rf.feature_importances_, index=engagement_features).sort_values(ascending=False)
    plt.figure(figsize=(10, 6))
    feature_importances.plot(kind='bar', color='skyblue')
    plt.title('Feature Importances (Random Forest) - Engagement')
    plt.ylabel('Importance')
    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, 'engagement_feature_importances.png'))
    plt.close()

    # Save predictions for analysis
    results = pd.DataFrame({'Actual': y_test, **predictions})
    predictions_output_path = os.path.join(OUTPUT_DIR, 'engagement_predictions.xlsx')
    results.to_excel(predictions_output_path, index=False)
    print(f"Engagement predictions saved to '{predictions_output_path}'.")

# Main Execution

def main(file_path='Input/Data 3 - October, 2024.xlsx',
         sheet_name='Parachute - Cross Section'):
    """
    Run the engagement pipeline end to end.

    Steps: load the worksheet, make sure a 'Total Activities' column exists,
    build the engagement features and the 'Engagement_Level' target, save
    the processed frame to OUTPUT_DIR, write the modelling dataset, and then
    train and evaluate the models.

    Parameters
    ----------
    file_path : str
        Path of the Excel workbook to read. Defaults to the path the script
        was originally written against.
    sheet_name : str
        Name of the worksheet to read from that workbook.
    """
    # Load data
    df = load_data(file_path, sheet_name)

    # Ensure 'Total Activities' column exists (raises ValueError otherwise);
    # the returned name is not needed here, only the check/rename.
    ensure_column_exists(df, 'Total Activities', alternative_names=['Total Activities '])

    # Create engagement features
    df = create_engagement_features(df)

    # Derive engagement level
    df = derive_engagement_level(df)

    # Save processed data
    processed_output_path = os.path.join(OUTPUT_DIR, 'processed_engagement_data.xlsx')
    df.to_excel(processed_output_path, index=False)
    print(f"Processed engagement data saved to '{processed_output_path}'.")

    # Create dataset for model
    create_model_dataset(df)

    # Train and evaluate the engagement model
    train_engagement_model()

if __name__ == '__main__':
    main()

Processed engagement data saved to 'output_engagement/processed_engagement_data.xlsx'.
Engagement model data saved to 'output_engagement/engagement_model_data.xlsx'.

Training Logistic Regression for Engagement...

Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.82      0.84      0.83        32
           1       0.84      0.81      0.83        32

    accuracy                           0.83        64
   macro avg       0.83      0.83      0.83        64
weighted avg       0.83      0.83      0.83        64

AUC (Logistic Regression): 0.92

Training Random Forest for Engagement...

Random Forest Report:
              precision    recall  f1-score   support

           0       0.96      0.75      0.84        32
           1       0.79      0.97      0.87        32

    accuracy                           0.86        64
   macro avg       0.88      0.86      0.86        64
weighted avg       0.88      0.86      0.86        64

AUC (R