In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import MaxAbsScaler


In [3]:
import pandas as pd
import numpy as np

import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

from absl import logging

In [4]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
# Word2Vec
from gensim.models import Word2Vec
# Doc2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# FastText
from gensim.models import FastText
# Universal Sentence Encoder (USE)
import tensorflow_hub as hub

In [5]:
df = pd.read_csv('/content/drive/MyDrive/amazon_review_small.txt', header=None)


In [6]:
df_test = df.sample(frac=0.2)


In [7]:
df_train = df.drop(df_test.index)


In [8]:
df_test.shape


(130000, 3)

In [9]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 520000 entries, 0 to 649998
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   0       520000 non-null  int64 
 1   1       519979 non-null  object
 2   2       520000 non-null  object
dtypes: int64(1), object(2)
memory usage: 15.9+ MB


In [10]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 130000 entries, 273087 to 300793
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   0       130000 non-null  int64 
 1   1       129995 non-null  object
 2   2       130000 non-null  object
dtypes: int64(1), object(2)
memory usage: 4.0+ MB


In [11]:
df_train.columns


Index([0, 1, 2], dtype='int64')

In [12]:
df_train


Unnamed: 0,0,1,2
0,1,mens ultrasheer,"This model may be ok for sedentary types, but ..."
3,2,Oh dear,I was excited to find a book ostensibly about ...
4,2,Incorrect disc!,"I am a big JVC fan, but I do not like this mod..."
5,2,Incorrect Disc,"I love the style of this, but after a couple y..."
7,3,My 2 y/o grandson loves it!!,This movie with all of its animals really keep...
...,...,...,...
649994,5,New Songs Right on Target,I bought this cd for my husband and he loves i...
649995,5,Pretty Cool!,We got it for our mom's birthday. She LOVES it...
649996,5,great cd,"this cd is very good. i especially love ""cats ..."
649997,2,An interesting look into Boston's comedy clubs,This was a good documentary on the history of ...


In [13]:
df_test


Unnamed: 0,0,1,2
273087,5,Excellent product at a good price,I researched a lot of wall mount units before ...
375392,3,A rare scare for gamers,This was an interesting mystery game that proc...
173602,3,"Too much talk, not enough hockey.","The story was good, but I wanted to see more h..."
104066,3,The bite valve is weak,Overall it's OK. After about a year of regular...
432328,1,not worh it,I tried the shea butter and it was hard to put...
...,...,...,...
400689,2,"Interesting Character Study, but Lacking Subst...",This was an interesting character study of thr...
578352,5,Very satisfied,"Bought this shoes for my boyfriend, and he lov..."
265531,1,Not even close to advertised dimensions,I bought one of these over the holidays to coo...
91415,1,The worst book on the market,This has got to be the worst book ever... I to...


In [14]:
df_train.columns

Index([0, 1, 2], dtype='int64')

In [15]:
## Taking the first 10000 rows due to computation issues
train_txt_1 = df_train.iloc[:10000][2]


In [16]:
train_txt_1

Unnamed: 0,2
0,"This model may be ok for sedentary types, but ..."
3,I was excited to find a book ostensibly about ...
4,"I am a big JVC fan, but I do not like this mod..."
5,"I love the style of this, but after a couple y..."
7,This movie with all of its animals really keep...
...,...
12542,"Having read all the wonderful reviews of ""Frea..."
12543,I bought this puzzle for my daughters two year...
12544,"Melissa & Doug make wonderful, durable product..."
12546,"one of the discs doesn't play. It does, howeve..."


In [17]:
y_1 = df_train.iloc[:10000][0]


In [18]:
y_1 = y_1.to_numpy()


In [21]:
### Taking the first 2000 rows due to computation issues


test_txt_1 = df_test.iloc[:2000][2]
y_2 = df_test.iloc[:2000][0]
y_2 = y_2.to_numpy()


In [22]:
# Using TF-IDF

tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(train_txt_1)
X_test_tfidf = tfidf.transform(test_txt_1)

In [23]:
X_test_tfidf.shape

(2000, 5000)

In [20]:
# Using Word2Vec

# Text preprocessing
def preprocess(text):
    return str(text).lower().split()  # Basic tokenization

train_texts = train_txt_1.apply(preprocess)
test_texts = test_txt_1.apply(preprocess)

w2v_model = Word2Vec(train_texts, vector_size=300, window=5, min_count=3, workers=4)

def document_vector(doc):
    doc = [word for word in doc if word in w2v_model.wv]
    return np.mean(w2v_model.wv[doc], axis=0) if doc else np.zeros(300)

X_train_w2v = np.array([document_vector(doc) for doc in train_texts])
X_test_w2v = np.array([document_vector(doc) for doc in test_texts])

In [24]:
X_test_w2v.shape

(2000, 300)

In [25]:
##Using  Doc2vec

tagged_data = [TaggedDocument(doc, [i]) for i, doc in enumerate(train_texts)]
d2v_model = Doc2Vec(vector_size=300, min_count=3, epochs=10)
d2v_model.build_vocab(tagged_data)
d2v_model.train(tagged_data, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)

X_train_d2v = np.array([d2v_model.infer_vector(doc) for doc in train_texts])
X_test_d2v = np.array([d2v_model.infer_vector(doc) for doc in test_texts])

In [26]:
X_test_d2v.shape

(2000, 300)

In [27]:
## Using FastText
ft_model = FastText(vector_size=300, window=5, min_count=3, workers=4)
ft_model.build_vocab(train_texts)
ft_model.train(train_texts, total_examples=len(train_texts), epochs=10)

X_train_ft = np.array([np.mean([ft_model.wv[word] for word in doc if word in ft_model.wv] or [np.zeros(300)], axis=0)
                   for doc in train_texts])
X_test_ft = np.array([np.mean([ft_model.wv[word] for word in doc if word in ft_model.wv] or [np.zeros(300)], axis=0)
                  for doc in test_texts])

In [28]:
X_test_ft.shape

(2000, 300)

In [29]:
## Using USE
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
  return model(input)

# train_embed_1 = embed(train_txt_1).numpy()


module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [30]:
X_train_use = embed(train_txt_1).numpy()
X_test_use = embed(test_txt_1).numpy()


In [31]:
y_train = df_train.iloc[:10000][0].to_numpy()
y_test = df_test.iloc[:2000][0].to_numpy()


In [37]:
!pip install sentence-transformers -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/483.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━[0m [32m358.4/483.4 kB[0m [31m10.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m483.4/483.4 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [38]:
from sentence_transformers import SentenceTransformer




In [32]:
# Task: Predict textual review embeddings
# Goal: Compare performance across:

# Feature extraction techniques — TF-IDF, Word2Vec, Doc2Vec, FastText, USE.

# Models — Linear Regression, SVM, Random Forest, XGBoost.

# Evaluation metrics: MSE, R², MAE.

In [42]:
model_st = SentenceTransformer('all-MiniLM-L6-v2')
X_train_st = model_st.encode(train_txt_1.tolist(), convert_to_numpy=True)
X_test_st = model_st.encode(test_txt_1.tolist(), convert_to_numpy=True)

In [46]:
import os
import numpy as np

# Drive mount (agar abhi tak nahi kiya)
from google.colab import drive
drive.mount('/content/drive')

# Folder path
embedding_folder = '/content/drive/MyDrive/embedding'
os.makedirs(embedding_folder, exist_ok=True)

# Example embedding variables (aapke actual variables se replace karo)
# X_train_tfidf, X_test_tfidf, X_train_w2v, X_test_w2v, etc.

embedding_vars = {
    "TF-IDF_train": X_train_tfidf,
    "TF-IDF_test": X_test_tfidf,
    "Word2Vec_train": X_train_w2v,
    "Word2Vec_test": X_test_w2v,
    "Doc2Vec_train": X_train_d2v,
    "Doc2Vec_test": X_test_d2v,
    "FastText_train": X_train_ft,
    "FastText_test": X_test_ft,
    "USE_train": X_train_use,
    "USE_test": X_test_use,
    "SentenceTransformer_train": X_train_st,
    "SentenceTransformer_test": X_test_st
}

for name, arr in embedding_vars.items():
    # Agar sparse matrix ho to dense banao
    if hasattr(arr, 'toarray'):
        arr = arr.toarray()
    # Save as .npy file
    np.save(os.path.join(embedding_folder, f"{name}.npy"), arr)
    print(f"Saved {name} at {embedding_folder}/{name}.npy")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved TF-IDF_train at /content/drive/MyDrive/embedding/TF-IDF_train.npy
Saved TF-IDF_test at /content/drive/MyDrive/embedding/TF-IDF_test.npy
Saved Word2Vec_train at /content/drive/MyDrive/embedding/Word2Vec_train.npy
Saved Word2Vec_test at /content/drive/MyDrive/embedding/Word2Vec_test.npy
Saved Doc2Vec_train at /content/drive/MyDrive/embedding/Doc2Vec_train.npy
Saved Doc2Vec_test at /content/drive/MyDrive/embedding/Doc2Vec_test.npy
Saved FastText_train at /content/drive/MyDrive/embedding/FastText_train.npy
Saved FastText_test at /content/drive/MyDrive/embedding/FastText_test.npy
Saved USE_train at /content/drive/MyDrive/embedding/USE_train.npy
Saved USE_test at /content/drive/MyDrive/embedding/USE_test.npy
Saved SentenceTransformer_train at /content/drive/MyDrive/embedding/SentenceTransformer_train.npy
Saved SentenceTransformer_test at /content/drive/MyDriv

In [48]:
models = {
    "Linear Regression": make_pipeline(MaxAbsScaler(), LinearRegression()),
    "SVM": make_pipeline(MaxAbsScaler(), SVR(kernel='rbf', C=1.0)),
    "Random Forest": RandomForestRegressor(n_estimators=100),
    "XGBoost": XGBRegressor(n_estimators=200, max_depth=5, learning_rate=0.1),
}

feature_sets = {
    "TF-IDF": (X_train_tfidf, X_test_tfidf),
    "Word2Vec": (X_train_w2v, X_test_w2v),
    "Doc2Vec": (X_train_d2v, X_test_d2v),
    "FastText": (X_train_ft, X_test_ft),
    "USE": (X_train_use, X_test_use),
    "SentenceTransformer": (X_train_st, X_test_st)
}

results = []

for feat_name, (X_tr, X_te) in feature_sets.items():
    for model_name, model in models.items():
        # Handle sparse matrices
        X_tr_dense = X_tr.toarray() if hasattr(X_tr, 'toarray') else X_tr
        X_te_dense = X_te.toarray() if hasattr(X_te, 'toarray') else X_te

        # Train model
        model.fit(X_tr_dense, y_train)

        # Predict on train and test
        y_train_pred = model.predict(X_tr_dense)
        y_test_pred = model.predict(X_te_dense)

        # Calculate metrics
        train_mse = mean_squared_error(y_train, y_train_pred)
        train_r2 = r2_score(y_train, y_train_pred)
        train_mae = mean_absolute_error(y_train, y_train_pred)

        test_mse = mean_squared_error(y_test, y_test_pred)
        test_r2 = r2_score(y_test, y_test_pred)
        test_mae = mean_absolute_error(y_test, y_test_pred)

        results.append({
            'Features': feat_name,
            'Model': model_name,
            'Train MSE': train_mse,
            'Train R2': train_r2,
            'Train MAE': train_mae,
            'Test MSE': test_mse,
            'Test R2': test_r2,
            'Test MAE': test_mae
        })

results_df = pd.DataFrame(results)
print(results_df.sort_values('Test R2', ascending=False))


               Features              Model  Train MSE  Train R2  Train MAE  \
19                  USE            XGBoost   0.279216  0.860905   0.421506   
17                  USE                SVM   0.496872  0.752478   0.481323   
16                  USE  Linear Regression   0.991630  0.506009   0.798144   
1                TF-IDF                SVM   0.320580  0.840300   0.370030   
18                  USE      Random Forest   0.167182  0.916717   0.330813   
21  SentenceTransformer                SVM   0.588790  0.706688   0.536867   
3                TF-IDF            XGBoost   0.780136  0.611367   0.719706   
20  SentenceTransformer  Linear Regression   1.174555  0.414883   0.883283   
23  SentenceTransformer            XGBoost   0.340272  0.830490   0.474542   
2                TF-IDF      Random Forest   0.196644  0.902040   0.359965   
12             FastText  Linear Regression   1.501594  0.251965   1.022954   
22  SentenceTransformer      Random Forest   0.216376  0.892210 

In [50]:
results_df_traditional_ml_models = results_df

In [51]:
results_df_traditional_ml_models

Unnamed: 0,Features,Model,Train MSE,Train R2,Train MAE,Test MSE,Test R2,Test MAE
0,TF-IDF,Linear Regression,0.461943,0.769878,0.538735,1.994994,-0.042148,1.107011
1,TF-IDF,SVM,0.32058,0.8403,0.37003,1.140006,0.404482,0.863298
2,TF-IDF,Random Forest,0.196644,0.90204,0.359965,1.358746,0.290216,0.94668
3,TF-IDF,XGBoost,0.780136,0.611367,0.719706,1.2283,0.358359,0.907499
4,Word2Vec,Linear Regression,1.427494,0.288879,0.992786,1.552146,0.189188,1.020687
5,Word2Vec,SVM,1.606169,0.199869,1.052003,1.664035,0.130739,1.06657
6,Word2Vec,Random Forest,0.251068,0.874928,0.422707,1.747919,0.08692,1.11071
7,Word2Vec,XGBoost,0.589474,0.706347,0.621703,1.738957,0.091601,1.097799
8,Doc2Vec,Linear Regression,1.530039,0.237795,1.033592,1.59868,0.164879,1.049672
9,Doc2Vec,SVM,1.102848,0.450604,0.784759,1.577368,0.176012,1.024902


In [57]:
results_df_traditional_ml_models[results_df_traditional_ml_models['Test R2']==results_df_traditional_ml_models['Test R2'].max()]

Unnamed: 0,Features,Model,Train MSE,Train R2,Train MAE,Test MSE,Test R2,Test MAE
19,USE,XGBoost,0.279216,0.860905,0.421506,1.058877,0.446862,0.824849


In [58]:
results_df_traditional_ml_models[results_df_traditional_ml_models['Test MAE']==results_df_traditional_ml_models['Test MAE'].min()]

Unnamed: 0,Features,Model,Train MSE,Train R2,Train MAE,Test MSE,Test R2,Test MAE
19,USE,XGBoost,0.279216,0.860905,0.421506,1.058877,0.446862,0.824849


In [59]:
results_df_traditional_ml_models[results_df_traditional_ml_models['Test MSE']==results_df_traditional_ml_models['Test MSE'].min()]

Unnamed: 0,Features,Model,Train MSE,Train R2,Train MAE,Test MSE,Test R2,Test MAE
19,USE,XGBoost,0.279216,0.860905,0.421506,1.058877,0.446862,0.824849


## Hence amongst SVM, Linear Regression, Random Forest and XGBoost, XGBoost is showing lowest errors with USE embeddings

In [72]:
# Your embedding feature sets
feature_sets = {
    "TF-IDF": (X_train_tfidf, X_test_tfidf),
    "Word2Vec": (X_train_w2v, X_test_w2v),
    "Doc2Vec": (X_train_d2v, X_test_d2v),
    "FastText": (X_train_ft, X_test_ft),
    "USE": (X_train_use, X_test_use),
    "SentenceTransformer": (X_train_st, X_test_st)
}

# Hyperparams
n_embd = 64
dropout = 0.3
epochs = 10
batch_size = 32

results = []

for feat_name, (X_tr, X_te) in feature_sets.items():
    print(f"\nTraining on {feat_name} features...")

    # Convert sparse to dense if needed
    X_tr_dense = X_tr.toarray() if hasattr(X_tr, 'toarray') else X_tr
    X_te_dense = X_te.toarray() if hasattr(X_te, 'toarray') else X_te

    # Build model
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(4 * n_embd, input_dim=X_tr_dense.shape[1]),
        tf.keras.layers.ReLU(),
        tf.keras.layers.Dense(n_embd),
        tf.keras.layers.Dropout(dropout),
        tf.keras.layers.Dense(1)  # regression output
    ])

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss='mse')

    # Train
    model.fit(X_tr_dense, y_train, epochs=epochs, batch_size=batch_size, verbose=0)

    # Predictions on train and test
    y_train_pred = model.predict(X_tr_dense).flatten()
    y_test_pred = model.predict(X_te_dense).flatten()

    # Metrics on train data
    train_mse = mean_squared_error(y_train, y_train_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    train_mae = mean_absolute_error(y_train, y_train_pred)

    # Metrics on test data
    test_mse = mean_squared_error(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)

    results.append({
        'Features': feat_name,
        'Model': 'Sequential Regression NN',
        'Train MSE': train_mse,
        'Test MSE': test_mse,
        'Train R2': train_r2,
        'Test R2': test_r2,
        'Train MAE': train_mae,
        'Test MAE': test_mae
    })

# Show results sorted by best test R2
results_df_sequential_1 = pd.DataFrame(results).sort_values('Test R2', ascending=False)
print("\n=== Train vs Test Metrics ===")
results_df_sequential_1



Training on TF-IDF features...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step

Training on Word2Vec features...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 998us/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  

Training on Doc2Vec features...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 981us/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  

Training on FastText features...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  

Training on USE features...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  

Training on SentenceTransformer features...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  

=== Train vs Test Metrics ===


Unnamed: 0,Features,Model,Train MSE,Test MSE,Train R2,Test R2,Train MAE,Test MAE
0,TF-IDF,Sequential Regression NN,0.092063,1.328284,0.954138,0.306129,0.222143,0.90162
4,USE,Sequential Regression NN,0.233677,1.358197,0.883591,0.290503,0.380416,0.927209
5,SentenceTransformer,Sequential Regression NN,0.254988,1.592001,0.872975,0.168368,0.395366,1.003077
3,FastText,Sequential Regression NN,1.525222,1.621262,0.240194,0.153083,1.029466,1.057918
1,Word2Vec,Sequential Regression NN,1.677046,1.720578,0.164562,0.101202,1.09334,1.094755
2,Doc2Vec,Sequential Regression NN,1.104541,1.825815,0.449761,0.046228,0.852623,1.092399


In [73]:
results_df_sequential_1[results_df_sequential_1['Test R2']==results_df_sequential_1['Test R2'].max()]

Unnamed: 0,Features,Model,Train MSE,Test MSE,Train R2,Test R2,Train MAE,Test MAE
0,TF-IDF,Sequential Regression NN,0.092063,1.328284,0.954138,0.306129,0.222143,0.90162


In [74]:
results_df_sequential_1[results_df_sequential_1['Test MAE']==results_df_sequential_1['Test MAE'].min()]

Unnamed: 0,Features,Model,Train MSE,Test MSE,Train R2,Test R2,Train MAE,Test MAE
0,TF-IDF,Sequential Regression NN,0.092063,1.328284,0.954138,0.306129,0.222143,0.90162


In [75]:
results_df_sequential_1[results_df_sequential_1['Test MSE']==results_df_sequential_1['Test MSE'].min()]

Unnamed: 0,Features,Model,Train MSE,Test MSE,Train R2,Test R2,Train MAE,Test MAE
0,TF-IDF,Sequential Regression NN,0.092063,1.328284,0.954138,0.306129,0.222143,0.90162


## For Sequential Model, TF-IDF has lowest errors


In [70]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, ReLU
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import pandas as pd

# Your embedding feature sets
feature_sets = {
    "TF-IDF": (X_train_tfidf, X_test_tfidf),
    "Word2Vec": (X_train_w2v, X_test_w2v),
    "Doc2Vec": (X_train_d2v, X_test_d2v),
    "FastText": (X_train_ft, X_test_ft),
    "USE": (X_train_use, X_test_use),
    "SentenceTransformer": (X_train_st, X_test_st)
}

# Hyperparameters
n_embd = 64
dropout = 0.3
epochs = 100  # higher epochs since early stopping will control
batch_size = 32

results = []

for feat_name, (X_tr, X_te) in feature_sets.items():
    print(f"\nTraining on {feat_name} features...")

    # Convert sparse to dense if needed
    X_tr_dense = X_tr.toarray() if hasattr(X_tr, 'toarray') else X_tr
    X_te_dense = X_te.toarray() if hasattr(X_te, 'toarray') else X_te

    # Build model
    model = Sequential([
        Dense(4 * n_embd, input_dim=X_tr_dense.shape[1]),
        ReLU(),
        Dense(n_embd),
        Dropout(dropout),
        Dense(1)  # regression output
    ])

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss='mse')

    # Early stopping callback
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # Train with validation split for early stopping
    model.fit(X_tr_dense, y_train, epochs=epochs, batch_size=batch_size,
              validation_split=0.1, callbacks=[early_stop], verbose=1)

    # Predictions
    y_train_pred = model.predict(X_tr_dense).flatten()
    y_test_pred = model.predict(X_te_dense).flatten()

    # Calculate metrics
    train_mse = mean_squared_error(y_train, y_train_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    train_mae = mean_absolute_error(y_train, y_train_pred)

    test_mse = mean_squared_error(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)

    results.append({
        'Features': feat_name,
        'Model': 'Dense NN with EarlyStopping',
        'Train MSE': train_mse,
        'Test MSE': test_mse,
        'Train R2': train_r2,
        'Test R2': test_r2,
        'Train MAE': train_mae,
        'Test MAE': test_mae
    })

# Display results sorted by best test R2 score
results_df_sequential_early_stopping = pd.DataFrame(results).sort_values('Test R2', ascending=False)
print("\n=== Model Performance ===")
results_df_sequential_early_stopping


Training on TF-IDF features...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 36ms/step - loss: 3.1160 - val_loss: 1.1332
Epoch 2/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 33ms/step - loss: 0.8684 - val_loss: 1.2061
Epoch 3/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 32ms/step - loss: 0.6049 - val_loss: 1.2347
Epoch 4/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 32ms/step - loss: 0.3789 - val_loss: 1.3095
Epoch 5/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 32ms/step - loss: 0.2837 - val_loss: 1.3413
Epoch 6/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 32ms/step - loss: 0.2586 - val_loss: 1.3693
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Training on Word2Vec features...
Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 2.5309 - val_loss: 1.9096
Epoch 2/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.9281 - val_loss: 1.8331
Epoch 3/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.8961 - val_loss: 1.8418
Epoch 4/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.8511 - val_loss: 2.0108
Epoch 5/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.8469 - val_loss: 1.7910
Epoch 6/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.8187 - val_loss: 1.8293
Epoch 7/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.8041 - val_loss: 1.7747
Epoch 8/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.8423 - val_loss: 1.8711
Epoch 9/100
[1m282/282[0m [32m━━━━━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 3.2238 - val_loss: 1.8141
Epoch 2/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.6836 - val_loss: 1.6931
Epoch 3/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.7016 - val_loss: 1.6507
Epoch 4/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.6356 - val_loss: 1.6328
Epoch 5/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.5384 - val_loss: 1.6653
Epoch 6/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.5209 - val_loss: 1.7023
Epoch 7/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.4494 - val_loss: 1.7438
Epoch 8/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.4178 - val_loss: 1.7221
Epoch 9/100
[1m282/282[0m [32m━━━━━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 2.4015 - val_loss: 1.6733
Epoch 2/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.7432 - val_loss: 1.7770
Epoch 3/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.7124 - val_loss: 1.6652
Epoch 4/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.6894 - val_loss: 1.6400
Epoch 5/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.6614 - val_loss: 1.6678
Epoch 6/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.6486 - val_loss: 1.7251
Epoch 7/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.6412 - val_loss: 1.6804
Epoch 8/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.6326 - val_loss: 1.7207
Epoch 9/100
[1m282/282[0m [32m━━━━━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 2.7589 - val_loss: 1.1982
Epoch 2/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.1284 - val_loss: 1.1592
Epoch 3/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.0405 - val_loss: 1.1667
Epoch 4/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.9412 - val_loss: 1.1799
Epoch 5/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.8078 - val_loss: 1.2087
Epoch 6/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.6742 - val_loss: 1.2391
Epoch 7/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.5645 - val_loss: 1.2642
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  

Training on Se

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 3.0529 - val_loss: 1.2932
Epoch 2/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.3050 - val_loss: 1.3001
Epoch 3/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.2042 - val_loss: 1.2849
Epoch 4/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.0883 - val_loss: 1.2791
Epoch 5/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.9088 - val_loss: 1.2994
Epoch 6/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.7584 - val_loss: 1.3247
Epoch 7/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.6426 - val_loss: 1.3594
Epoch 8/100
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.5348 - val_loss: 1.4651
Epoch 9/100
[1m282/282[0m [32m━━━━━━━━━━━

Unnamed: 0,Features,Model,Train MSE,Test MSE,Train R2,Test R2,Train MAE,Test MAE
4,USE,Dense NN with EarlyStopping,0.960748,1.09517,0.521393,0.427904,0.789907,0.838643
0,TF-IDF,Dense NN with EarlyStopping,0.762215,1.202477,0.620294,0.371848,0.688467,0.878325
5,SentenceTransformer,Dense NN with EarlyStopping,0.875313,1.304502,0.563953,0.318552,0.755559,0.924507
3,FastText,Dense NN with EarlyStopping,1.599186,1.619828,0.203348,0.153832,1.068698,1.069536
2,Doc2Vec,Dense NN with EarlyStopping,1.494827,1.672526,0.255336,0.126303,1.021734,1.074354
1,Word2Vec,Dense NN with EarlyStopping,1.677124,1.689525,0.164523,0.117423,1.099025,1.091903


In [76]:
results_df_sequential_early_stopping[results_df_sequential_early_stopping['Test R2']==results_df_sequential_early_stopping['Test R2'].max()]

Unnamed: 0,Features,Model,Train MSE,Test MSE,Train R2,Test R2,Train MAE,Test MAE
4,USE,Dense NN with EarlyStopping,0.960748,1.09517,0.521393,0.427904,0.789907,0.838643


In [77]:
results_df_sequential_early_stopping[results_df_sequential_early_stopping['Test MAE']==results_df_sequential_early_stopping['Test MAE'].min()]

Unnamed: 0,Features,Model,Train MSE,Test MSE,Train R2,Test R2,Train MAE,Test MAE
4,USE,Dense NN with EarlyStopping,0.960748,1.09517,0.521393,0.427904,0.789907,0.838643


In [78]:
results_df_sequential_early_stopping[results_df_sequential_early_stopping['Test MSE']==results_df_sequential_early_stopping['Test MSE'].min()]

Unnamed: 0,Features,Model,Train MSE,Test MSE,Train R2,Test R2,Train MAE,Test MAE
4,USE,Dense NN with EarlyStopping,0.960748,1.09517,0.521393,0.427904,0.789907,0.838643


## For Sequential Model with Early Stopping, USE works better and the number of epochs are also increased to 100


## From the above models, we see that the R2 for XGBoost model with USE is highest, then SVM with USE and then we have Dense NN with early stopping with USE.