In [None]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [None]:
!pip install xgboost



In [None]:
import pandas as pd
from datetime import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

In [None]:
# Load the dataset from an Excel workbook; the relative path is resolved
# against the notebook's current working directory.
df = pd.read_excel('simulation.xlsx')

In [None]:
# Remove the 'id' column from the working frame.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because 'id' has already been dropped from df.
df = df.drop(columns=['id'], errors='ignore')

In [None]:
# Ensure all entries in the 'date' column are datetime objects.
# The helpers applied to this column call .weekday() and read .hour, so the
# values must be parsed timestamps rather than raw strings.
df['date'] = pd.to_datetime(df['date'])

def check_weekday_or_weekend(date_time_obj):
    """Flag whether a date falls on a weekend.

    Args:
        date_time_obj: Object exposing ``weekday()`` (e.g. ``datetime`` or
            ``pandas.Timestamp``, which number Monday as 0 and Sunday as 6).

    Returns:
        int: 1 when ``weekday()`` is 5 or greater (Saturday/Sunday),
        otherwise 0.
    """
    return int(date_time_obj.weekday() >= 5)

# Add a 0/1 weekend flag, computed row by row from the parsed 'date' values.
df['is_weekend'] = df['date'].apply(check_weekday_or_weekend)

In [None]:
def time_of_day(date_time_obj):
    """Bucket a timestamp's hour into a coarse part-of-day label.

    Args:
        date_time_obj: Object with an ``hour`` attribute (e.g. ``datetime``
            or ``pandas.Timestamp``).

    Returns:
        str: 'Morning' for hours 6-11, 'Afternoon/Evening' for hours 12-17,
        and 'Night' for any hour outside those two ranges.
    """
    hr = date_time_obj.hour
    # (label, first hour included, first hour excluded)
    buckets = (
        ('Morning', 6, 12),
        ('Afternoon/Evening', 12, 18),
    )
    for label, start, stop in buckets:
        if start <= hr < stop:
            return label
    return 'Night'

# Label each row 'Morning', 'Afternoon/Evening' or 'Night' from the hour of its timestamp.
df['time_of_day'] = df['date'].apply(time_of_day)

In [None]:
# Turn each categorical column into integer codes. Every column gets its own
# encoder object, fitted on that column only, so the three mappings stay
# independent and each fitted encoder remains available afterwards.
le_username = LabelEncoder()
df['username_encoded'] = le_username.fit_transform(df['username'])

le_company = LabelEncoder()
df['company_encoded'] = le_company.fit_transform(df['inferred company'])

le_time_of_day = LabelEncoder()
df['time_of_day_encoded'] = le_time_of_day.fit_transform(df['time_of_day'])

In [None]:
analyzer = SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return the 'compound' entry of the analyzer's polarity scores for one text."""
    return analyzer.polarity_scores(text)['compound']


# One sentiment score per row, derived from the 'content' text.
df['sentiment'] = df['content'].apply(_compound_score)

In [None]:
def media_influence(media_link):
    """Classify a media link as video or image from substrings in its text.

    The check is case-insensitive and purely textual; the link is never
    fetched.

    Args:
        media_link: Text describing one media item. Non-string values (for
            example ``None`` or the float NaN pandas uses for empty cells)
            are treated as missing.

    Returns:
        int | None: 1 when the text contains 'video'; 0 when it contains
        'image', 'jpg' or 'png'; ``None`` when nothing matches or the value
        is not a string.
    """
    # Missing cells arrive as None/NaN, which have no .lower(); report them as
    # "undefined" instead of raising AttributeError.
    if not isinstance(media_link, str):
        return None

    link = media_link.lower()  # lower-case once rather than once per comparison
    if 'video' in link:
        return 1  # Video
    if any(marker in link for marker in ('image', 'jpg', 'png')):
        return 0  # Image
    return None  # Undefined, handle missing cases

# Classify each row by the first entry of its comma-separated 'media' value.
# Non-string cells (e.g. NaN for an empty cell) have no .split(), so they are
# mapped straight to None instead of raising AttributeError.
df['media_influence'] = df['media'].apply(
    lambda x: media_influence(x.split(",")[0]) if isinstance(x, str) else None
)

In [None]:
# Fit a TF-IDF vectorizer on the 'content' text, capped at 1000 vocabulary terms.
tfidf = TfidfVectorizer(max_features=1000)
tfidf_matrix = tfidf.fit_transform(df['content'])

# Densify the sparse matrix and attach each row's vector to the matching row
# of df, one array per cell of the new column.
df['tfidf_vector'] = [doc_vector for doc_vector in tfidf_matrix.toarray()]

In [None]:
# Feature matrix: what remains after removing the raw source columns, the
# target, and the per-row TF-IDF arrays. 'tfidf_vector' stores one array per
# cell (an object column) that an estimator cannot consume, so it is dropped
# here as well, matching the feature definition used for training further down.
X = df.drop(columns=['date', 'content', 'media', 'username', 'inferred company', 'likes', 'time_of_day', 'tfidf_vector'])  # Features
y = df['likes']  # Target

In [None]:
from sklearn.decomposition import PCA

# Apply PCA to reduce dimensionality of the TF-IDF vectors.
# random_state pins the projection when scikit-learn picks a randomized solver.
pca = PCA(n_components=50, random_state=42)  # Reduce to 50 components (adjust based on your needs)
tfidf_reduced = pca.fit_transform(tfidf_matrix.toarray())

# Convert back to a DataFrame aligned with df. The components get string names
# because scikit-learn rejects frames whose column names mix str and int.
tfidf_df = pd.DataFrame(
    tfidf_reduced,
    index=df.index,
    columns=[f'tfidf_pca_{i}' for i in range(tfidf_reduced.shape[1])],
)

# Concatenate with the engineered features only. The target ('likes') and the
# raw text/date/categorical columns are dropped so the feature matrix neither
# leaks the label nor carries non-numeric columns.
X = pd.concat(
    [
        df.drop(columns=['date', 'content', 'media', 'username', 'inferred company',
                         'likes', 'time_of_day', 'tfidf_vector']),
        tfidf_df,
    ],
    axis=1,
)

In [None]:
# NOTE(review): this rebuilds X from df alone, so a feature matrix assembled in
# an earlier cell (including the PCA components held in tfidf_df) is replaced,
# and whatever is fitted on X afterwards sees only the hand-built columns —
# confirm this is intended.
X = df.drop(columns=['date', 'content', 'media', 'username', 'inferred company', 'likes', 'time_of_day','tfidf_vector'])  # Features

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split

# Hold out 20% of the rows for the final evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Hyper-parameter candidates: 3 x 3 x 2 x 2 = 36 combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Base estimator; the seed keeps forest construction repeatable.
rf_model = RandomForestRegressor(random_state=42)

# Exhaustive search with 3-fold cross-validation, scored by negated MSE and
# run on all available cores.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Keep the winning estimator and report its settings.
best_rf_model = grid_search.best_estimator_
print("Best parameters found: ", grid_search.best_params_)


Best parameters found:  {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}


In [None]:
# Preview the first rows of the feature matrix X.
X.head()

Unnamed: 0,is_weekend,username_encoded,company_encoded,time_of_day_encoded,sentiment,media_influence
0,1,790,162,2,0.5093,0
1,1,404,85,1,0.1779,0
2,0,137,34,2,-0.5574,0
3,0,462,185,1,-0.6124,0
4,0,100,85,0,0.8395,0


In [None]:
# Score the held-out split with the fitted grid search.
y_pred = grid_search.predict(X_test)
# Take the square root of the MSE explicitly: the `squared=False` flag was
# deprecated in scikit-learn 1.4 and removed in 1.6, where passing it raises
# TypeError. This form works on both older and newer releases.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f"Test RMSE: {rmse}")

Test RMSE: 5510.174847746202




In [None]:
!pip install joblib



In [None]:
import joblib

# Save the best model
# (joblib.dump returns the list of file names it wrote, which the cell echoes.)
joblib.dump(best_rf_model, 'best_rf_model.pkl')

['best_rf_model.pkl']

In [None]:
# Save the label encoders
# Each fitted encoder carries the value-to-integer mapping used to build the
# corresponding *_encoded column, so the same mapping can be reloaded later.
# Only the return value of the last dump is echoed by the cell.
joblib.dump(le_username, 'le_username.pkl')
joblib.dump(le_company, 'le_company.pkl')
joblib.dump(le_time_of_day, 'le_time_of_day.pkl')

['le_time_of_day.pkl']

In [None]:
# Save the TF-IDF Vectorizer
# NOTE(review): the feature matrix used for training drops 'tfidf_vector', so
# this artifact does not appear to be needed by best_rf_model — confirm.
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [None]:
# Save the PCA model
# NOTE(review): X is rebuilt from df after the PCA cell and no longer contains
# the PCA components, so this artifact does not appear to be needed by
# best_rf_model — confirm.
joblib.dump(pca, 'pca_model.pkl')

['pca_model.pkl']

In [None]:
from google.colab import files

# Trigger a browser download for every artifact written above, in the same
# order they were saved.
for artifact_name in (
    'best_rf_model.pkl',
    'le_username.pkl',
    'le_company.pkl',
    'le_time_of_day.pkl',
    'tfidf_vectorizer.pkl',
    'pca_model.pkl',
):
    files.download(artifact_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>