In [35]:
# STEP 2: MODEL TRAINING FOR ALL PREDICTIONS
# =====================================================

import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, classification_report, accuracy_score
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler



In [36]:
# Load Cleaned Dataset
# -----------------------------
df = pd.read_csv("Cleaned_College_Event_Dataset.csv")
df

Unnamed: 0,EventID,EventName,EventType,Department,DayOfWeek,Season,TimeOfDay,VenueCapacity,Attendance,AttendanceLevel,FeedbackScore,EngagementIndex
0,441,Event_441,Workshop,Management,Monday,Monsoon,Afternoon,460,362,High,4.0,3.069565
1,521,Event_521,Seminar,ECE,Monday,Summer,Evening,370,138,Low,4.0,1.143243
2,301,Event_301,Hackathon,Civil,Saturday,Summer,Morning,329,123,Low,5.0,1.914894
3,264,Event_264,Workshop,Management,Monday,Winter,Evening,286,68,Low,4.0,0.951049
4,201,Event_201,Cultural,Mechanical,Friday,Winter,Evening,450,286,Medium,2.0,1.886667
...,...,...,...,...,...,...,...,...,...,...,...,...
6283,640,Event_640,Hackathon,Management,Thursday,Monsoon,Afternoon,448,268,Medium,5.0,2.482143
6284,41,Event_41,Seminar,ECE,Friday,Monsoon,Morning,190,48,Low,5.0,0.884211
6285,319,Event_319,Cultural,ECE,Thursday,Monsoon,Morning,458,312,Medium,5.0,3.427948
6286,449,Event_449,Workshop,ECE,Friday,Summer,Evening,439,416,High,1.0,1.867882


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6288 entries, 0 to 6287
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   EventID          6288 non-null   int64  
 1   EventName        6288 non-null   object 
 2   EventType        6288 non-null   object 
 3   Department       6288 non-null   object 
 4   DayOfWeek        6288 non-null   object 
 5   Season           6288 non-null   object 
 6   TimeOfDay        6288 non-null   object 
 7   VenueCapacity    6288 non-null   int64  
 8   Attendance       6288 non-null   int64  
 9   AttendanceLevel  6288 non-null   object 
 10  FeedbackScore    6288 non-null   float64
 11  EngagementIndex  6288 non-null   float64
dtypes: float64(2), int64(3), object(7)
memory usage: 589.6+ KB


In [38]:
# # Encode categorical variables
# categorical_cols = ['EventType', 'Department', 'DayOfWeek', 'Season', 'TimeOfDay']
# encoders = {}

# for col in categorical_cols:
#     le = LabelEncoder()
#     df[col] = le.fit_transform(df[col])
#     encoders[col] = le

# # Save encoders for Streamlit usage
# with open("encoders.pkl", "wb") as f:
#     pickle.dump(encoders, f)

In [47]:
# attendace forcasting

# Features & Target
X = df[['EventType', 'DayOfWeek', 'Season', 'TimeOfDay', 'VenueCapacity']]
y = df['Attendance']

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing
categorical = ['EventType', 'DayOfWeek', 'Season', 'TimeOfDay']
numeric = ['VenueCapacity']

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
    ('num', StandardScaler(), numeric)
])

# Pipeline
attendance_forecast_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

attendance_forecast_model.fit(X_train, y_train)

# Save
with open("attendance_forecast_model.pkl", "wb") as f:
    pickle.dump(attendance_forecast_model, f)

In [49]:
from sklearn.ensemble import RandomForestClassifier

#attendance_level_model 
X = df[['EventType', 'DayOfWeek', 'Season', 'TimeOfDay', 'VenueCapacity']]
y = df['AttendanceLevel']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['EventType', 'DayOfWeek', 'Season', 'TimeOfDay']),
    ('num', StandardScaler(), ['VenueCapacity'])
])

attendance_level_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

attendance_level_model.fit(X_train, y_train)

with open("attendance_level_model.pkl", "wb") as f:
    pickle.dump(attendance_level_model, f)


In [48]:
# target department
X = df[['EventType', 'DayOfWeek', 'Season', 'TimeOfDay']]
y = df['Department']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['EventType', 'DayOfWeek', 'Season', 'TimeOfDay'])
])

department_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

department_model.fit(X_train, y_train)

with open("department_model.pkl", "wb") as f:
    pickle.dump(department_model, f)


In [43]:
# 
# 5. Event Popularity Trend (Classification: Season/Month)
# Features & target
X = df[['Season', 'EventType', 'VenueCapacity']]
y = df['Attendance']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing
categorical = ['Season', 'EventType']
numeric = ['VenueCapacity']

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
    ('num', StandardScaler(), numeric)
])

# Pipeline with preprocessing + model
popularity_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Fit model
popularity_model.fit(X_train, y_train)

# Evaluate
rmse = mean_squared_error(y_test, popularity_model.predict(X_test))
print("Event Popularity RMSE:", rmse)

# Save model with preprocessing inside
with open("event_popularity.pkl", "wb") as f:
    pickle.dump(popularity_model, f)

Event Popularity RMSE: 5165.168490537492


In [44]:
df.columns

Index(['EventID', 'EventName', 'EventType', 'Department', 'DayOfWeek',
       'Season', 'TimeOfDay', 'VenueCapacity', 'Attendance', 'AttendanceLevel',
       'FeedbackScore', 'EngagementIndex'],
      dtype='object')

In [50]:
# engagement_model
X = df[['EventType', 'DayOfWeek', 'Season', 'TimeOfDay', 'Attendance', 'FeedbackScore']]
y = df['EngagementIndex']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['EventType', 'DayOfWeek', 'Season', 'TimeOfDay']),
    ('num', StandardScaler(), ['Attendance', 'FeedbackScore'])
])

engagement_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

engagement_model.fit(X_train, y_train)

with open("engagement_model.pkl", "wb") as f:
    pickle.dump(engagement_model, f)


In [51]:
# feedback_model
X = df[['EventType', 'DayOfWeek', 'Season', 'TimeOfDay', 'VenueCapacity', 'Attendance']]
y = df['FeedbackScore']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['EventType', 'DayOfWeek', 'Season', 'TimeOfDay']),
    ('num', StandardScaler(), ['VenueCapacity', 'Attendance'])
])

feedback_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

feedback_model.fit(X_train, y_train)

with open("feedback_model.pkl", "wb") as f:
    pickle.dump(feedback_model, f)
