In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
try:
    from lightgbm import LGBMRegressor
except:
    LGBMRegressor = None

try:
    from xgboost import XGBRegressor
except:
    XGBRegressor = None

df = pd.read_csv("tweet_data_week1_cleaned.csv")

## ✅ Feature Explanation

The following features were selected and engineered based on exploratory analysis and domain knowledge. All are numeric and suitable for machine learning input:

| Feature Name       | Description |
|--------------------|-------------|
| `word_count`       | Total number of words in the tweet. Captures verbosity. |
| `char_count`       | Total characters in the tweet. Reflects length. |
| `sentiment`        | Sentiment polarity score computed via TextBlob (range: -1 to +1). |
| `has_media`        | Boolean flag indicating if the tweet contains media (image/video). |
| `hour`             | Hour of the day when the tweet was posted. Extracted from timestamp. |
| `company_encoded`  | Label-encoded company name (for brand-specific modeling). |
| `username_encoded` | Encoded author handle, useful if individuals matter. |
| `day_encoded`      | Encoded day of the week (Monday = 0 ... Sunday = 6). |
| `has_url`          | Whether the tweet contains a hyperlink. |
| `has_hashtag`      | Whether the tweet contains hashtags. |
| `has_mention`      | Whether the tweet mentions another user using `@`. |

These features were selected for their potential impact on tweet engagement and because they can be derived easily in a backend or API pipeline.

---

In [7]:
# Label encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['company_encoded'] = le.fit_transform(df['inferred company'])
df['username_encoded'] = LabelEncoder().fit_transform(df['username'])
df['day_encoded'] = df['day_of_week'].astype('category').cat.codes
df['has_url'] = df['content'].str.contains('http').astype(int)
df['has_hashtag'] = df['content'].str.contains('#').astype(int)
df['has_mention'] = df['content'].str.contains('@').astype(int)


features = ['word_count', 'char_count', 'sentiment', 'has_media', 'hour', 'company_encoded', 'username_encoded', 'day_encoded', 'has_url', 'has_hashtag', 'has_mention']
X = df[features]
y = np.log1p(df['likes'])

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Models
models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(random_state=42),
    "MLPRegressor": Pipeline([
        ('scaler', StandardScaler()),
        ('mlp', MLPRegressor(hidden_layer_sizes=(128, 64, 64, 32), max_iter=500, random_state=42))
    ])
}

if LGBMRegressor:
    models["LGBM"] = LGBMRegressor(random_state=42)
if XGBRegressor:
    models["XGBoost"] = XGBRegressor(random_state=42, verbosity=0)

# Train and Evaluate
results = {}
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    preds = np.expm1(model.predict(X_test))
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    results[name] = rmse
    print(f"{name} RMSE: {rmse:.4f}")

# Sort and display
sorted_results = dict(sorted(results.items(), key=lambda x: x[1]))
print("\n📊 Model Comparison (sorted by RMSE):")
for model, score in sorted_results.items():
    print(f"{model}: {score:.4f}")

Training LinearRegression...
LinearRegression RMSE: 167.1156
Training RandomForest...
RandomForest RMSE: 1263.4606
Training MLPRegressor...
MLPRegressor RMSE: 843.0432
Training LGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002556 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1021
[LightGBM] [Info] Number of data points in the train set: 13864, number of used features: 9
[LightGBM] [Info] Start training from score 3.907053
LGBM RMSE: 759.2875
Training XGBoost...
XGBoost RMSE: 1037.0015

📊 Model Comparison (sorted by RMSE):
LinearRegression: 167.1156
LGBM: 759.2875
MLPRegressor: 843.0432
XGBoost: 1037.0015
RandomForest: 1263.4606


In [15]:
import joblib
joblib.dump(models["LinearRegression"], 'lr_like_predictor.pkl')
joblib.dump(models["RandomForest"], 'rf_like_predictor.pkl')
joblib.dump(models["MLPRegressor"], 'mlp_like_predictor.pkl')
joblib.dump(models["LGBM"], 'lgbm_like_predictor.pkl')
joblib.dump(models["XGBoost"], 'xgb_like_predictor.pkl')

['xgb_like_predictor.pkl']

## ✅ Model Comparison & RMSE (Root Mean Squared Error)

Trained and evaluated several machine learning models using the engineered features and log-transformed like counts (`np.log1p(likes)`) to reduce skew:

| Model              | RMSE (log-likes space) |
|--------------------|------------------------|
| **LinearRegression** | **167.12** |
| LightGBM      | 759.29|
| XGBoost             | 1037.00 |
| RandomForest        | 1263.46 |
| MLPRegressor        | 1498.94 |


---

## ✅ Final Model Choice

We selected **LinearRegression** as the final model due to:

- Lower RMSE on validation set
- Fast training and prediction