In [39]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from catboost import CatBoostRegressor, Pool
from scipy.stats import pearsonr
import matplotlib.pyplot as plt


In [40]:
df = pd.read_csv("final_data_with_classification_target.csv")

In [41]:
df['rating_deviation'] = df['rating_number'] - df['average_rating']

In [42]:
def parse_embedding(embed, dim):
    try:
        if isinstance(embed, str):
            embed_str = embed.replace('e+', 'e').replace('[', '').replace(']', '').strip()
            embed = [float(x) for x in embed_str.split() if x != '...']
            embed = np.array(embed, dtype=np.float32)
        elif isinstance(embed, (list, np.ndarray, torch.Tensor)):
            embed = np.array(embed, dtype=np.float32)
        else:
            embed = np.zeros(dim, dtype=np.float32)
        if embed.shape[0] != dim:
            embed = np.zeros(dim, dtype=np.float32)
        if not np.all(np.isfinite(embed)):
            embed = np.zeros(dim, dtype=np.float32)
        return embed
    except Exception as e:
        print(f"Error parsing embedding: {e}")
        return np.zeros(dim, dtype=np.float32)

In [43]:
bert_dim = 768
img_dim = 512
df['bert_embedding'] = df['bert_embedding'].apply(lambda x: parse_embedding(x, bert_dim))
df['image_embedding'] = df['image_embedding'].apply(lambda x: parse_embedding(x, img_dim))

In [44]:
bert_pca = PCA(n_components=50, random_state=42)
img_pca = PCA(n_components=50, random_state=42)
bert_reduced = bert_pca.fit_transform(np.stack(df['bert_embedding'].values))
img_reduced = img_pca.fit_transform(np.stack(df['image_embedding'].values))
df['bert_embedding'] = list(bert_reduced)
df['image_embedding'] = list(img_reduced)


In [45]:
df['verified_purchase'] = df['verified_purchase'].astype(int)
df['rating_number'] = df['rating_number'].astype(float)
df['days_since_review'] = df['days_since_review'].astype(float)
df['avg_quality_score'] = df['avg_quality_score'].astype(float)
df['average_rating'] = df['average_rating'].astype(float)

In [46]:
meta_features = ['verified_purchase', 'avg_quality_score', 'days_since_review', 'rating_number', 'average_rating']
text_features = ['sentiment', 'readability', 'review_length', 'punctuation_count']
new_features = ['rating_deviation']
for col in meta_features + text_features + new_features:
    if col in df.columns:
        df[col] = df[col].fillna(0).replace([np.inf, -np.inf], 0)

In [47]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [48]:
scaler = StandardScaler()
features_to_scale = text_features + meta_features + new_features
features_to_scale = [f for f in features_to_scale if f in df.columns]
train_df[features_to_scale] = scaler.fit_transform(train_df[features_to_scale])
test_df[features_to_scale] = scaler.transform(test_df[features_to_scale])

In [49]:
def prepare_features(df):
    bert_embeds = np.stack(df['bert_embedding'].values)
    img_embeds = np.stack(df['image_embedding'].values)
    other_feats = df[features_to_scale].values
    features = np.concatenate([bert_embeds, img_embeds, other_feats], axis=1)
    return features


In [50]:
X_train = prepare_features(train_df)
X_test = prepare_features(test_df)

In [51]:
# if 'helpful_vote' in df.columns:
#     max_votes = df['helpful_vote'].max()
#     y_train = train_df['helpful_vote'] / max_votes if max_votes > 0 else train_df['helpful_vote']
#     y_test = test_df['helpful_vote'] / max_votes if max_votes > 0 else test_df['helpful_vote']

if 'helpful_vote' in df.columns:
    max_votes = df['helpful_vote'].max()
    y_train = np.log1p(train_df['helpful_vote']) / np.log1p(max_votes) if max_votes > 0 else train_df['helpful_vote']
    y_test = np.log1p(test_df['helpful_vote']) / np.log1p(max_votes) if max_votes > 0 else test_df['helpful_vote']

In [52]:
catboost_model = CatBoostRegressor(
    iterations=500,
    depth=6,
    learning_rate=0.1,
    random_seed=42,
    verbose=50,
    early_stopping_rounds=50,
    task_type='CPU'
)

In [53]:
train_pool = Pool(X_train, y_train)
test_pool = Pool(X_test, y_test)
catboost_model.fit(train_pool, eval_set=test_pool)

0:	learn: 0.1381226	test: 0.1380375	best: 0.1380375 (0)	total: 21.5ms	remaining: 10.7s
50:	learn: 0.1273442	test: 0.1315786	best: 0.1315786 (50)	total: 370ms	remaining: 3.26s
100:	learn: 0.1225396	test: 0.1308985	best: 0.1308524 (96)	total: 696ms	remaining: 2.75s
150:	learn: 0.1171076	test: 0.1306955	best: 0.1306614 (146)	total: 1.17s	remaining: 2.7s
200:	learn: 0.1122412	test: 0.1306329	best: 0.1306003 (196)	total: 1.55s	remaining: 2.31s
250:	learn: 0.1075495	test: 0.1306583	best: 0.1305943 (244)	total: 1.99s	remaining: 1.97s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.1305942804
bestIteration = 244

Shrink model to first 245 iterations.


<catboost.core.CatBoostRegressor at 0x12e19b200>

In [59]:
def evaluate_model(model, X, y):
    preds = model.predict(X)
    mse = mean_squared_error(y, preds)
    mae = mean_absolute_error(y, preds)
    r2 = r2_score(y, preds)
    pearson_corr, _ = pearsonr(y, preds)
    print(f"Evaluation Results:\nMSE: {mse:.4f}\nMAE: {mae:.4f}\nR²: {r2:.4f}\nPearson Correlation: {pearson_corr:.4f}")
    return mse, mae, r2

In [60]:
print("Test Set Evaluation:")
mse, mae, r2 = evaluate_model(catboost_model, X_test, y_test)

Test Set Evaluation:
Evaluation Results:
MSE: 0.0171
MAE: 0.1020
R²: 0.1142
Pearson Correlation: 0.3380


In [61]:
# Save the model and preprocessing objects
catboost_model.save_model('catboost_model.cbm')  # Save CatBoost model
import pickle
pickle.dump(scaler, open('scaler.pkl', 'wb'))  # Save scaler
pickle.dump(bert_pca, open('bert_pca.pkl', 'wb'))  # Save BERT PCA
pickle.dump(img_pca, open('img_pca.pkl', 'wb'))  # Save image PCA
print("Model and preprocessing objects saved successfully!")

Model and preprocessing objects saved successfully!
