In [94]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [95]:
# Explicit column names; skiprows=1 discards the first line of the file
# (presumably its own header row - confirm against events.csv).
cols = ['event_name', 'event_location', 'event_summary', 'event_description', 'category']
df = pd.read_csv('events.csv', names=cols, skiprows=1)
# Shuffle the rows with a fixed seed: without random_state the row order changes on
# every run, so everything computed from the shuffled frame would differ between runs.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()

Unnamed: 0,event_name,event_location,event_summary,event_description,category
0,No Speakers Conference,Empty Auditorium,,,Others
1,Afternoon Jazz Brunch,Hotel Terrace,"Live jazz band, buffet brunch","Mingle, sip coffee, enjoy pleasant melodies.",Social
2,Vintage Car Parade,Main Street,Watch classic cars,"Talk to owners about restorations, car history.",Social
3,Introvert Hangout,Small Reading Nook,Silent shared space,Relax together without forced conversation.,Personal
4,Standing Pilates,Wellness Balcony,Low-impact standing moves,"Improve posture, leg and core strength without...",Fitness


# Drop `event_location`, the feature with minimal predictive power

In [96]:
# Rebind instead of mutating in place, and tolerate the column already being gone,
# so re-running this cell does not raise KeyError.
df = df.drop(columns=['event_location'], errors='ignore')

In [97]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,event_name,event_summary,event_description,category
0,No Speakers Conference,,,Others
1,Afternoon Jazz Brunch,"Live jazz band, buffet brunch","Mingle, sip coffee, enjoy pleasant melodies.",Social
2,Vintage Car Parade,Watch classic cars,"Talk to owners about restorations, car history.",Social
3,Introvert Hangout,Silent shared space,Relax together without forced conversation.,Personal
4,Standing Pilates,Low-impact standing moves,"Improve posture, leg and core strength without...",Fitness


# Preprocess event_name, event_summary, and event_description into numerical features

In [98]:
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import string
# Fetch the NLTK data packages by name: the stop-word lists and WordNet.
# nltk.download reports its progress on stdout; the last call's return value
# is what the cell displays.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [99]:
from functools import lru_cache

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


@lru_cache(maxsize=1)
def _text_resources():
    """Return the English stop-word set and a lemmatizer, built on first use only."""
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def preprocess_text(text):
    """Normalize one piece of text before vectorization.

    Steps, in order: lowercase, delete ASCII punctuation, split on whitespace,
    drop English stop words, lemmatize each remaining word, re-join with spaces.

    Punctuation is deleted rather than replaced by a space, so hyphenated words
    are merged (e.g. "Low-impact" becomes "lowimpact").

    Args:
        text: The raw text. ``None`` and float NaN are treated as empty text;
            any other non-string value is converted with ``str()``.

    Returns:
        The cleaned, space-separated string ('' when nothing is left).
    """
    # Missing values carry no tokens; previously these raised AttributeError on .lower().
    if text is None or (isinstance(text, float) and text != text):
        return ''
    words = str(text).lower().translate(_PUNCTUATION_TABLE).split()
    if not words:
        # Nothing to filter or lemmatize, so the NLTK resources are not needed.
        return ''
    # The stop-word set and the lemmatizer are loop-invariant: load them once and
    # reuse them instead of rebuilding both on every call.
    stop_words, lemmatizer = _text_resources()
    return ' '.join(lemmatizer.lemmatize(word) for word in words if word not in stop_words)

In [100]:
# Replace every missing cell with the literal placeholder 'missing'.
# Because the text columns are concatenated afterwards, this placeholder
# ends up as an ordinary word in combined_text.
df = df.fillna(value='missing')

In [101]:
# Join name, summary and description with single spaces, then clean the result
# with preprocess_text to obtain the text that gets vectorized.
text_columns = ['event_name', 'event_summary', 'event_description']
raw_text = df[text_columns[0]].str.cat(df[text_columns[1:]], sep=' ')
df['combined_text'] = raw_text.apply(preprocess_text)

In [102]:
# Learn the TF-IDF vocabulary and weights from the cleaned text and build the
# document-term matrix in one pass.
# NOTE(review): the vectorizer is fitted on every row of df, i.e. before the
# train/test split, so held-out rows also contribute to the learned statistics.
corpus = df['combined_text']
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

In [103]:
import pickle

# Persist the fitted vectorizer to disk so it can be reloaded later.
with open('vectorizer.pkl', mode='wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [113]:
# Turn the category strings into integer class ids.
label_encoder = LabelEncoder()
df['encoded_category'] = label_encoder.fit_transform(df['category'])

# Build a label -> id lookup from the classes the encoder learned.
class_names = label_encoder.classes_
category_mapping = dict(zip(class_names, label_encoder.transform(class_names)))

print("Category Mapping:")
for label in category_mapping:
    print(f"{label}: {category_mapping[label]}")

Category Mapping:
Academics: 0
Fitness: 1
Others: 2
Personal: 3
Social: 4
Work: 5


In [105]:
# Display the frame, which now also carries the combined_text and
# encoded_category columns added in the cells above.
df

Unnamed: 0,event_name,event_summary,event_description,category,combined_text,encoded_category
0,No Speakers Conference,missing,missing,Others,speaker conference missing missing,2
1,Afternoon Jazz Brunch,"Live jazz band, buffet brunch","Mingle, sip coffee, enjoy pleasant melodies.",Social,afternoon jazz brunch live jazz band buffet br...,4
2,Vintage Car Parade,Watch classic cars,"Talk to owners about restorations, car history.",Social,vintage car parade watch classic car talk owne...,4
3,Introvert Hangout,Silent shared space,Relax together without forced conversation.,Personal,introvert hangout silent shared space relax to...,3
4,Standing Pilates,Low-impact standing moves,"Improve posture, leg and core strength without...",Fitness,standing pilate lowimpact standing move improv...,1
...,...,...,...,...,...,...
561,Hobby Candle Carving,Carve patterns into candles,"Relaxing, decorative hobby, customize gifts.",Personal,hobby candle carving carve pattern candle rela...,3
562,Intro to Sociology,"Societal structures, norms","Discuss cultures, socialization, group behavior.",Academics,intro sociology societal structure norm discus...,0
563,Kick-Start Boot Camp,All-levels circuit training,"Mix cardio, strength, plyometrics for a full-b...",Fitness,kickstart boot camp alllevels circuit training...,1
564,Introduction to Logic,"Reasoning, fallacies, proofs",Learn symbolic logic and argument structure.,Academics,introduction logic reasoning fallacy proof lea...,0


# Split the data

In [106]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_matrix,
    df['encoded_category'],
    test_size=0.2,
    random_state=42,
)

In [107]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Fit a seeded 100-tree random forest on the training split, then print
# per-class precision/recall/F1 for the held-out split.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.75      0.88      0.81        17
           1       0.88      0.84      0.86        25
           2       1.00      0.80      0.89        10
           3       0.48      0.65      0.55        20
           4       0.67      0.57      0.62        21
           5       0.88      0.71      0.79        21

    accuracy                           0.74       114
   macro avg       0.78      0.74      0.75       114
weighted avg       0.76      0.74      0.74       114



In [108]:
from sklearn.neighbors import KNeighborsClassifier
# Fit a 5-nearest-neighbours classifier on the training split, then print
# per-class precision/recall/F1 for the held-out split.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.74      0.82      0.78        17
           1       1.00      0.96      0.98        25
           2       0.80      0.80      0.80        10
           3       0.63      0.60      0.62        20
           4       0.59      0.62      0.60        21
           5       0.95      0.90      0.93        21

    accuracy                           0.79       114
   macro avg       0.78      0.78      0.78       114
weighted avg       0.79      0.79      0.79       114



In [109]:
from sklearn.naive_bayes import MultinomialNB
# Fit multinomial naive Bayes on the training split, then print per-class
# precision/recall/F1 for the held-out split.
nb = MultinomialNB().fit(X_train, y_train)
y_pred = nb.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

# Persist the fitted model to disk.
# NOTE(review): the model predicts the integer ids produced by label_encoder;
# the cells shown here do not save that encoder - confirm it is stored elsewhere.
with open('nb_model.pkl', mode='wb') as model_file:
    pickle.dump(nb, model_file)

              precision    recall  f1-score   support

           0       0.88      0.88      0.88        17
           1       0.92      0.88      0.90        25
           2       1.00      0.80      0.89        10
           3       0.68      0.75      0.71        20
           4       0.62      0.62      0.62        21
           5       0.95      1.00      0.98        21

    accuracy                           0.82       114
   macro avg       0.84      0.82      0.83       114
weighted avg       0.83      0.82      0.83       114



In [110]:
from sklearn.svm import SVC
# Fit a linear-kernel support vector classifier on the training split, then
# print per-class precision/recall/F1 for the held-out split.
svm = SVC(kernel='linear').fit(X_train, y_train)
y_pred = svm.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.75      0.88      0.81        17
           1       0.96      0.88      0.92        25
           2       1.00      0.80      0.89        10
           3       0.61      0.70      0.65        20
           4       0.64      0.67      0.65        21
           5       0.94      0.81      0.87        21

    accuracy                           0.79       114
   macro avg       0.82      0.79      0.80       114
weighted avg       0.81      0.79      0.79       114



In [111]:
from sklearn.linear_model import LogisticRegression
# Fit logistic regression with its default settings on the training split,
# then print per-class precision/recall/F1 for the held-out split.
lr = LogisticRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.75      0.88      0.81        17
           1       0.96      0.88      0.92        25
           2       1.00      0.80      0.89        10
           3       0.62      0.65      0.63        20
           4       0.65      0.62      0.63        21
           5       0.91      0.95      0.93        21

    accuracy                           0.80       114
   macro avg       0.81      0.80      0.80       114
weighted avg       0.81      0.80      0.80       114



In [112]:
# List the distinct category labels in order of first appearance.
df.loc[:, 'category'].unique()

array(['Others', 'Social', 'Personal', 'Fitness', 'Work', 'Academics'],
      dtype=object)