# The Basics of Supervised NLP Tasks - School Solution

## 0 - Importing the Data

In [1]:
import pandas as pd

In [2]:
# import io
# import requests
# url = "https://github.com/shaypal5/tau_text_mining_24_5/raw/refs/heads/main/lecture_4/kindle_reviews_tau.csv"

In [3]:
# s = requests.get(url).content
# df = pd.read_csv(io.StringIO(s.decode('utf-8')))

In [4]:
df = pd.read_csv('kindle_reviews_tau.csv')  # use this line instead if you've downloaded the dataset directly

In [5]:
df.head(5)

Unnamed: 0,asin,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,total_votes_on_helpfulness,is_helpful_votes,is_helpful_ratio,age,gender,state,yearly_income,is_married,favorite_genre
0,B000FA64PA,5,I think I have this one in both book and audio...,"01 27, 2014",A1ZT7WV0ZUA0OJ,Mike,Audio and book,1390780800,0,0,0.0,23,Male,Utah,43953,False,Horror
1,B000FA64PK,5,This one promises to be another good book. I h...,"01 27, 2014",A1ZT7WV0ZUA0OJ,Mike,my collection,1390780800,1,0,0.0,23,Male,Utah,43953,False,Horror
2,B000FA64QO,5,I was hoping to find this one in book form. Th...,"01 27, 2014",A1ZT7WV0ZUA0OJ,Mike,my e- collection,1390780800,0,0,0.0,23,Male,Utah,43953,False,Horror
3,B000FBFMVG,5,I love the stories with Chewie in them! this e...,"01 27, 2014",A1ZT7WV0ZUA0OJ,Mike,my collection,1390780800,0,0,0.0,23,Male,Utah,43953,False,Horror
4,B000FC26RI,4,As an aspiring yogini this is required reading...,"02 13, 2013",A2Y1X56N8NPH8G,"Heather ""Houndog""",A good resource,1360713600,0,0,0.0,21,Male,Oklahoma,23498,False,Non-Fiction


In [62]:
len(df)

97969

## 1 - Basic, text-less Prediction

### 1.1 - Text-less Prediction w/ Linear Regression

In [6]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [7]:
# Selecting only non-textual features and the target column
features = df[['overall', 'unixReviewTime', 'age', 'gender', 'state', 'yearly_income', 
              'is_married', 'favorite_genre']]
target = df['is_helpful_ratio']

In [8]:
# Splitting the dataset into train, validation, and test sets
train_features, test_features, train_target, test_target = train_test_split(
    features, target, test_size=0.2, random_state=0)
train_features, val_features, train_target, val_target = train_test_split(
    train_features, train_target, test_size=0.25, random_state=0)  # 0.25 x 0.8 = 0.2

In [9]:
# Defining numerical and categorical columns
numeric_features = ['overall', 'unixReviewTime', 'age', 'yearly_income']
categorical_features = ['gender', 'state', 'is_married', 'favorite_genre']

In [10]:
# Creating a column transformer with StandardScaler for numerical features and OneHotEncoder for categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

In [11]:
# Defining the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', LinearRegression())])

In [12]:
# Hyperparameter tuning using GridSearchCV (although Linear Regression doesn't have hyperparameters, this is just to show the process)
param_grid = {
    # No hyperparameters for linear regression to tune
}

In [13]:
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(train_features, train_target)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['overall',
                                                                          'unixReviewTime',
                                                                          'age',
                                                                          'yearly_income']),
                                                                        ('cat',
                                                                         OneHotEncoder(),
                                                                         ['gender',
                                                                          'state',
                                                                  

In [14]:
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {-grid_search.best_score_}")

Best parameters: {}
Best cross-validation score: 0.19540877344190127


In [15]:
# Evaluating the model on validation data
val_predictions = grid_search.predict(val_features)
val_mse = mean_squared_error(val_target, val_predictions)
print(f"Validation MSE: {val_mse}")

Validation MSE: 0.19655359360785832


In [16]:
# Final evaluation on the test set
test_predictions = grid_search.predict(test_features)
test_mse = mean_squared_error(test_target, test_predictions)
print(f"Test MSE: {test_mse}")

Test MSE: 0.19544328271719746


### 1.2 - Now w/ SVR (Support Vector Regression)

In [17]:
from sklearn.svm import SVR

In [18]:
# Update the pipeline with SVR model
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', SVR())])

In [19]:
# # Define hyperparameter search space for SVR
# param_grid = {
#     'model__C': [1],
#     'model__epsilon': [0.1, 0.5],
#     'model__kernel': ['linear', 'rbf']
# }

In [20]:
# # Perform hyperparameter tuning with GridSearchCV
# grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
# grid_search.fit(train_features, train_target)

In [21]:
# print(f"Best parameters: {grid_search.best_params_}")
# print(f"Best cross-validation score: {-grid_search.best_score_}")

In [22]:
# # Evaluating the model on validation data
# val_predictions = grid_search.predict(val_features)
# val_mse = mean_squared_error(val_target, val_predictions)
# print(f"Validation MSE: {val_mse}")

In [23]:
# # Final evaluation on the test set
# test_predictions = grid_search.predict(test_features)
# test_mse = mean_squared_error(test_target, test_predictions)
# print(f"Test MSE: {test_mse}")

### 1.3 - Now w/ Decision Tree Regressors

In [24]:
from sklearn.tree import DecisionTreeRegressor

In [25]:
# Update the pipeline with DecisionTreeRegressor model
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', DecisionTreeRegressor(random_state=0))])

In [26]:
# Define hyperparameter search space for DecisionTreeRegressor
param_grid = {
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 10, 20],
    'model__min_samples_leaf': [1, 5, 10]
}

In [27]:
# Perform hyperparameter tuning with GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(train_features, train_target)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         StandardScaler(),
                                                                         ['overall',
                                                                          'unixReviewTime',
                                                                          'age',
                                                                          'yearly_income']),
                                                                        ('cat',
                                                                         OneHotEncoder(),
                                                                         ['gender',
                                                                          'state',
                                                                  

In [28]:
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {-grid_search.best_score_}")

Best parameters: {'model__max_depth': 10, 'model__min_samples_leaf': 10, 'model__min_samples_split': 2}
Best cross-validation score: 0.19775186145229967


In [29]:
# Evaluating the model on validation data
val_predictions = grid_search.predict(val_features)
val_mse = mean_squared_error(val_target, val_predictions)
print(f"Validation MSE: {val_mse}")

Validation MSE: 0.19821015564723315


In [30]:
# Final evaluation on the test set
test_predictions = grid_search.predict(test_features)
test_mse = mean_squared_error(test_target, test_predictions)
print(f"Test MSE: {test_mse}")

Test MSE: 0.1970154070226681


## 2 - Review Text-only Prediciton

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [36]:
# Assuming 'reviewText' and 'is_helpful_ratio' columns exist
text_data = df['reviewText'].fillna("")
target = df['is_helpful_ratio']

In [43]:
# Splitting the dataset
train_texts, test_texts, train_target, test_target = train_test_split(
    text_data, target, test_size=0.2, random_state=0)

### 2.1 - Text-only Prediction w/ BoW

In [44]:
# CountVectorizer for Bag-of-Words model
vectorizer = CountVectorizer(max_features=300)

In [45]:
# Transform text data
train_vectors = vectorizer.fit_transform(train_texts)
val_vectors = vectorizer.transform(val_texts)
test_vectors = vectorizer.transform(test_texts)

In [46]:
# Linear regression model
model = LinearRegression()
model.fit(train_vectors, train_target)

LinearRegression()

In [47]:
# Final evaluation on test data
test_predictions = model.predict(test_vectors)
test_mse = mean_squared_error(test_target, test_predictions)
print(f"Test MSE: {test_mse}")

Test MSE: 0.20477957852470946


### 2.2 - Text-only Prediction w/ TF-IDF

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [49]:
# Replace CountVectorizer with TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=300)

In [50]:
# Transform text data
train_vectors = vectorizer.fit_transform(train_texts)
val_vectors = vectorizer.transform(val_texts)
test_vectors = vectorizer.transform(test_texts)

In [51]:
# Linear regression model
model = LinearRegression()
model.fit(train_vectors, train_target)

LinearRegression()

In [52]:
# Final evaluation on test data
test_predictions = model.predict(test_vectors)
test_mse = mean_squared_error(test_target, test_predictions)
print(f"Test MSE: {test_mse}")

Test MSE: 0.20172754368771417


### 2.2 - Text-only Prediction w/ TF-IDF

In [55]:
from sentence_transformers import SentenceTransformer
import numpy as np

In [59]:
# Replace vectorizer with SentenceTransformer and transform
# model = SentenceTransformer('all-MiniLM-L6-v2')
transformer = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')

In [60]:
# Splitting the dataset
train_texts, test_texts, train_target, test_target = train_test_split(
    text_data, target, test_size=0.2, random_state=0)

In [None]:
train_vectors = np.array(transformer.encode(train_texts.tolist()))
val_vectors = np.array(transformer.encode(val_texts.tolist()))
test_vectors = np.array(transformer.encode(test_texts.tolist()))

In [None]:
# Linear regression model
model = LinearRegression()
model.fit(train_vectors, train_target)

In [None]:
# Final evaluation on test data
test_predictions = model.predict(test_vectors)
test_mse = mean_squared_error(test_target, test_predictions)
print(f"Test MSE: {test_mse}")

## 3 - Text + Demographic Features

In [63]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
import numpy as np

In [64]:
# Fill missing reviewText values
text_data = df['reviewText'].fillna("")
numeric_features = ['overall', 'unixReviewTime', 'age', 'yearly_income']
categorical_features = ['gender', 'state', 'is_married', 'favorite_genre']

In [65]:
# Split the datasets
train_texts, test_texts, train_target, test_target = train_test_split(
    text_data, target, test_size=0.2, random_state=0)
# train_texts, val_texts, train_target, val_target = train_test_split(
#     train_texts, train_target, test_size=0.25, random_state=0)

In [66]:
train_features, test_features, _, _ = train_test_split(
    features, target, test_size=0.2, random_state=0)
# train_features, val_features, _, _ = train_test_split(
#     train_features, target, test_size=0.25, random_state=0)

In [67]:
# Preprocess non-text data using ColumnTransformer
train_non_text_features = non_text_preprocessor.fit_transform(train_features)
val_non_text_features = non_text_preprocessor.transform(val_features)
test_non_text_features = non_text_preprocessor.transform(test_features)

In [68]:
# Dimensionality reduction using TruncatedSVD
svd = TruncatedSVD(n_components=40, random_state=0)
train_reduced_text = svd.fit_transform(train_text_vectors)
val_reduced_text = svd.transform(val_text_vectors)
test_reduced_text = svd.transform(test_text_vectors)

In [None]:
# Since the text features are reduced to a NumPy array, ensure the non-text features are preprocessed separately.





# Continue with fitting the model
model = LinearRegression()
model.fit(train_combined_features, train_target)

# Evaluate
val_predictions = model.predict(val_combined_features)
val_mse = mean_squared_error(val_target, val_predictions)
print(f"Validation MSE: {val_mse}")

test_predictions = model.predict(test_combined_features)
test_mse = mean_squared_error(test_target, test_predictions)
print(f"Test MSE: {test_mse}")

In [69]:
# Combine reduced text and other features
train_combined_features = np.hstack([train_reduced_text, train_features])
val_combined_features = np.hstack([val_reduced_text, val_features])
test_combined_features = np.hstack([test_reduced_text, test_features])

In [70]:
# Define a preprocessor for non-text features
non_text_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

In [71]:
# Define pipeline
pipeline = Pipeline(steps=[('preprocessor', non_text_preprocessor),
                           ('model', LinearRegression())])

In [72]:
# Fit pipeline
pipeline.fit(train_combined_features, train_target)

ValueError: Specifying the columns using strings is only supported for pandas DataFrames

In [None]:
# Evaluate the model
val_predictions = pipeline.predict(val_combined_features)
val_mse = mean_squared_error(val_target, val_predictions)
print(f"Validation MSE: {val_mse}")

In [None]:














test_predictions = pipeline.predict(test_combined_features)
test_mse = mean_squared_error(test_target, test_predictions)
print(f"Test MSE: {test_mse}")