In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from imblearn.over_sampling import SMOTE
import sklearn as sk
# Fetch the NLTK stop-word corpus, then collect the English stop words
# into a set.
nltk.download('stopwords')
from nltk.corpus import stopwords
english_words = stopwords.words('english')
stop_words = set(english_words)


# Load the tweet dataset from CSV and preview its first five rows.
TWEETS_CSV = 'tweets_cleaned_no_outliers.csv'
df = pd.read_csv(TWEETS_CSV)
df.head()


In [None]:
# Report the library versions this notebook was run with.
print("pandas ", pd.__version__)
# The numpy version was previously printed under the label "pandas ".
print("numpy ", np.__version__)
print("sklearn", sk.__version__)

In [None]:
# Binary target: 1 when a tweet has at least 50 retweets, otherwise 0.
is_viral = df['Retweets'] >= 50
df['viral'] = is_viral.astype('int64')

In [None]:
# One-hot encode the categorical columns.
category_encoded = pd.get_dummies(df['Category'], prefix='cat')
day_dummies = pd.get_dummies(df['DayOfWeek'], prefix='day')
month_dummies = pd.get_dummies(df['Month'], prefix='month')
# Despite its name, `hour_dummies` holds the *month* dummies. The name is kept
# because later cells (feature stacking, pickling) reference it.
hour_dummies = month_dummies

# Parse through str so this cell can be re-run: after the first run
# df['time'] holds datetime.time objects, which to_datetime cannot parse
# directly, whereas their str() form is again 'HH:MM:SS'.
parsed_time = pd.to_datetime(df['time'].astype(str), format='%H:%M:%S')
df['time'] = parsed_time.dt.time

# Extract hour, minute, second with the vectorised .dt accessors instead of a
# per-row apply.
df['hour'] = parsed_time.dt.hour
df['minute'] = parsed_time.dt.minute
df['second'] = parsed_time.dt.second
time_features = df[['hour', 'minute', 'second']].to_numpy()


In [None]:
# TF-IDF text features, vocabulary capped at 1000 terms.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/test split, so test-set vocabulary/IDF statistics leak into training.
# Confirm whether that is acceptable for the reported metrics.
vectorizer = TfidfVectorizer(max_features=1000)
# Missing texts become '' because TfidfVectorizer raises on NaN documents
# (an empty cleaned text is read back from CSV as NaN).
X_text = vectorizer.fit_transform(df['Text_cleaned'].fillna('')).toarray()

# Numeric metadata columns; missing values are treated as 0.
# NOTE(review): 'Likes' is presumably only known after a tweet is published,
# like the retweet count the target is derived from. Verify it is a
# legitimate predictor rather than target leakage.
metadata_features = df[['Followers', 'Likes']]
X_meta = metadata_features.fillna(0).values


# Combine all features column-wise. The order here defines the column layout
# of X: text, metadata, category, day-of-week, month, time-of-day.
X = np.hstack((
    X_text,
    X_meta,
    category_encoded,
    day_dummies,
    hour_dummies,  # month dummies, see the encoding cell
    time_features,
))
y = df['viral']


In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y so the
# train and test sets keep the same viral / non-viral proportions; an
# unstratified split of an imbalanced target can leave the test set with very
# few positive rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample only the training portion; the test set stays untouched.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)


clf = RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42)
clf.fit(X_train_res, y_train_res)

# Evaluate on the original (non-resampled) test split.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


In [None]:


importances = clf.feature_importances_

# Rebuild the column labels in the same order the feature blocks were stacked
# into X, so each bar can be labelled with the feature it represents.
feature_names = (
    [f"tfidf:{term}" for term in vectorizer.get_feature_names_out()]
    + [str(col) for col in metadata_features.columns]
    + [str(col) for col in category_encoded.columns]
    + [str(col) for col in day_dummies.columns]
    + [str(col) for col in hour_dummies.columns]
    + ['hour', 'minute', 'second']
)
if len(feature_names) != len(importances):
    # Fall back to positional labels rather than risk mislabelling bars.
    feature_names = [f"feature_{i}" for i in range(len(importances))]

# argsort is ascending, so the last ten entries are the most important ones.
indices = np.argsort(importances)[-10:]
positions = range(len(indices))
plt.barh(positions, importances[indices])
plt.yticks(positions, [feature_names[i] for i in indices])
plt.xlabel("Importance")
plt.title("Top 10 Features")
plt.tight_layout()
plt.show()


In [None]:
from imblearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified cross-validation. SMOTE is a step of the pipeline handed
# to cross_validate rather than being applied to X up front.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = ['accuracy', 'precision', 'recall', 'f1']

smote_step = ('smote', SMOTE(random_state=42))
forest_step = ('clf', RandomForestClassifier(random_state=42))
pipeline = Pipeline([smote_step, forest_step])

cv_results = cross_validate(pipeline, X, y, cv=skf, scoring=scoring)

# Mean test score per metric across the folds.
for name in scoring:
    fold_scores = cv_results[f'test_{name}']
    print(f"{name}: {fold_scores.mean():.4f}")


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


# Confusion matrix of the fitted classifier on the test split.
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
disp = ConfusionMatrixDisplay(cm)
disp.plot()
plt.show()

In [None]:
from sklearn.metrics import accuracy_score

# Overall accuracy of the test-split predictions.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)


In [None]:
# Class balance of the target: number of rows per `viral` label.
viral_counts = df['viral'].value_counts()
print(viral_counts)


In [None]:
import pickle

# Persist the trained classifier to disk.
with open('viral.pkl', mode='wb') as model_file:
    pickle.dump(clf, model_file)


In [None]:
import pickle

# Persist the fitted TF-IDF vectorizer to disk.
with open('vector.pkl', mode='wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [None]:
import pickle

# Persist the encoded feature blocks, one pickle file per block.
# Note that 'month.pkl' receives `hour_dummies`, which is built from the
# 'Month' column.
artifacts = [
    ('category.pkl', category_encoded),
    ('days.pkl', day_dummies),
    ('month.pkl', hour_dummies),
    ('time.pkl', time_features),
]
for path, payload in artifacts:
    with open(path, 'wb') as out_file:
        pickle.dump(payload, out_file)