In [None]:
!pip install kaggle
!pip install transformers
!pip install torch
!pip install scipy

In [None]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the snap/amazon-fine-food-reviews dataset with the Kaggle CLI
# (uses the API token copied to ~/.kaggle/kaggle.json).
!kaggle datasets download -d snap/amazon-fine-food-reviews

In [None]:
!unzip amazon-fine-food-reviews.zip

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import vstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from scipy.sparse import csr_matrix
from transformers import pipeline, RobertaTokenizerFast

import nltk
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import torch
import tensorflow as tf

In [None]:
# Load the dataset
# Reads Reviews.csv from the current working directory into `df`.
df = pd.read_csv("Reviews.csv")

# Convert scores to labels
def create_label(score):
    """Map a numeric review score to a sentiment label.

    Scores of 4 and above are "positive", scores below 4 are "negative".
    A score that compares false both ways (e.g. NaN) yields None.
    """
    if score >= 4:
        return "positive"
    return "negative" if score < 4 else None

df["label"] = df["Score"].apply(create_label)

# Keep only positive and negative labels (drops rows for which create_label returned None)
df = df[df["label"].isin(["positive", "negative"])]

# Undersample the larger class so both labels have the same number of reviews
positive_df = df[df["label"] == "positive"]
negative_df = df[df["label"] == "negative"]
positive_count = positive_df.shape[0]
negative_count = negative_df.shape[0]
if positive_count > negative_count:
    df = pd.concat([
        positive_df.sample(negative_count, random_state=42),
        negative_df
    ], axis=0)
else:
    df = pd.concat([
        positive_df,
        negative_df.sample(positive_count, random_state=42)
    ], axis=0)

# Select a random subset of samples between 1000 and 10000.
# The size is drawn from a seeded generator so every run uses the same subset,
# and it is capped at the number of available rows so sampling cannot fail.
num_samples = min(np.random.RandomState(42).randint(1000, 10000), len(df))
df = df.sample(num_samples, random_state=42)

# Keep the nine most frequent labels and combine the less popular labels into an "Other" label
top_labels = df["label"].value_counts().nlargest(9).index
df.loc[~df["label"].isin(top_labels), "label"] = "Other"

# Split the dataset into training, validation, and test sets with a 60/20/20% split
train_val_test_split = [0.6, 0.2, 0.2]
num_samples = len(df)
num_train_samples = int(train_val_test_split[0] * num_samples)
num_val_samples = int(train_val_test_split[1] * num_samples)

df = df.sample(frac=1, random_state=42) # Shuffle the dataset
train_df = df[:num_train_samples]
val_df = df[num_train_samples:num_train_samples+num_val_samples]
test_df = df[num_train_samples+num_val_samples:]  # whatever remains after train and val

# The three slices must partition the shuffled frame exactly
assert len(train_df) + len(val_df) + len(test_df) == len(df)

# Table with label counts for each split of the dataset
label_counts = pd.DataFrame({
    "train": train_df["label"].value_counts(),
    "val": val_df["label"].value_counts(),
    "test": test_df["label"].value_counts()
}).transpose()

print(label_counts)
df.shape

In [None]:
# Step 0: Vectorize text (the TF-IDF vocabulary is fitted on the training split only)
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_df['Text'])
X_val = vectorizer.transform(val_df['Text'])
X_test = vectorizer.transform(test_df['Text'])
X_all = vstack([X_train, X_val, X_test])


def _nearest_centroid(X_dense, centroids):
    """Return, for every row of the dense matrix X_dense, the index of its closest centroid.

    Closeness is the Euclidean norm of the difference between the row and the centroid.
    """
    dist = np.array([np.linalg.norm(X_dense - centroid, axis=1) for centroid in centroids]).T
    return np.argmin(dist, axis=1)


# Step 1: Pick k random centroids
k = 5
np.random.seed(42)
X_train_dense = X_train.toarray()
centroids = X_train_dense[np.random.choice(X_train_dense.shape[0], k, replace=False), :]

# Densify once, outside the loop, instead of once per centroid per iteration
X_all_dense = X_all.toarray()

# Step 2 and 3: Assign vectors to their closest centroid and recalculate centroids
for i in range(10):  # repeat steps 2 and 3 for at most 10 iterations
    # assign each vector to the closest centroid
    cluster_labels = _nearest_centroid(X_all_dense, centroids)
    # start from the current centroids so that an empty cluster keeps its previous centroid
    new_centroids = centroids.copy()
    for j in range(k):
        cluster_samples = X_all_dense[cluster_labels == j]
        if cluster_samples.shape[0] > 0:
            new_centroids[j] = cluster_samples.mean(axis=0)

    # check if clusters have converged
    if np.allclose(new_centroids, centroids):
        break
    centroids = new_centroids

# Assign cluster labels to each document.
# X_all stacks train, val and test in that order, so the assignment is computed
# once on the dense matrix and then sliced back into the three splits.
all_cluster_labels = _nearest_centroid(X_all_dense, centroids)
n_train, n_val = X_train.shape[0], X_val.shape[0]
train_cluster_labels = all_cluster_labels[:n_train]
val_cluster_labels = all_cluster_labels[n_train:n_train + n_val]
test_cluster_labels = all_cluster_labels[n_train + n_val:]

# For each cluster, print out some example documents and the top 5 tokens in the corresponding centroid
feature_names = np.array(vectorizer.get_feature_names_out())
n_examples = 5
for cluster_num in range(k):  # report every cluster, not just the first two
    print(f"Cluster {cluster_num}:")
    for split_name, split_df, split_labels in (
        ("Training", train_df, train_cluster_labels),
        ("Validation", val_df, val_cluster_labels),
        ("Test", test_df, test_cluster_labels),
    ):
        members = split_df.iloc[split_labels == cluster_num]
        # a cluster may hold fewer than n_examples documents in a given split
        examples = members.sample(min(n_examples, len(members)), random_state=42)["Text"].tolist()
        print(f"{split_name} examples:\n{examples}\n")
    top_tokens = feature_names[np.argsort(-centroids[cluster_num])[:5]]
    print(f"Top 5 tokens in centroid: {top_tokens}\n\n")

In [None]:
# Step 0: Vectorize text
# The features come from the review text; vectorizing the label column would
# leak the ground truth into the clustering input.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_df['Text'])
X_val = vectorizer.transform(val_df['Text'])
X_test = vectorizer.transform(test_df['Text'])
X_all = vstack([X_train, X_val, X_test])

# Step 1: Train KMeans model
# One cluster per class keeps the confusion matrix square, with one row/column per label.
class_names = train_df['label'].unique()
kmeans = KMeans(n_clusters=len(class_names), n_init=10, random_state=42)
kmeans.fit(X_all)

# Step 2: Get predicted cluster labels for test data
y_kmeans = kmeans.predict(X_test)


# Step 3: Construct confusion matrix
# NOTE: cluster ids are arbitrary, so column i is not guaranteed to correspond to class i.
label_map = {label: i for i, label in enumerate(class_names)}
y_true = test_df['label'].map(label_map)

cm = confusion_matrix(y_true, y_kmeans, labels=np.arange(len(class_names)))
print(cm)

In [None]:
# Draw the confusion matrix `cm` as a heatmap with a colour bar.
fig, ax = plt.subplots()
heatmap = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
ax.figure.colorbar(heatmap, ax=ax)

# One tick per matrix row/column, named after the labels seen in the training split.
n_rows, n_cols = cm.shape
ax.set(
    xticks=np.arange(n_cols),
    yticks=np.arange(n_rows),
    xticklabels=train_df['label'].unique(),
    yticklabels=train_df['label'].unique(),
    xlabel='Predicted label',
    ylabel='True label',
)

# Rotate the tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# Annotate every cell with its value; use white text on cells above half the maximum.
half_max = cm.max() / 2.
for row, col in np.ndindex(n_rows, n_cols):
    cell_value = cm[row, col]
    text_color = "white" if cell_value > half_max else "black"
    ax.text(col, row, f"{cell_value:.2f}", ha="center", va="center", color=text_color)

fig.tight_layout()
plt.show()

In [None]:
# Define the feature representations
# binary=True records token presence/absence, which is what the "One-hot"
# results reported below refer to (plain CountVectorizer() would store counts).
cv = CountVectorizer(binary=True)
tfidf = TfidfVectorizer()

# Fit and transform the training set with each feature representation
X_train_cv = cv.fit_transform(train_df["Text"])
X_train_tfidf = tfidf.fit_transform(train_df["Text"])

# Transform the validation set with each feature representation (no re-fitting)
X_val_cv = cv.transform(val_df["Text"])
X_val_tfidf = tfidf.transform(val_df["Text"])

# Define the Dummy Classifiers (baselines)
dc_most_frequent = DummyClassifier(strategy="most_frequent", random_state=42)
dc_stratified = DummyClassifier(strategy="stratified", random_state=42)

# Define the Logistic Regression classifiers
lr_cv = LogisticRegression(max_iter=10000)
lr_tfidf = LogisticRegression(max_iter=10000)

# Fit the classifiers on the training set
dc_most_frequent.fit(X_train_cv, train_df["label"])
dc_stratified.fit(X_train_cv, train_df["label"])

lr_cv.fit(X_train_cv, train_df["label"])
lr_tfidf.fit(X_train_tfidf, train_df["label"])

# Evaluate the classifiers on the validation set
y_val_true = val_df["label"]

y_val_pred_dc_most_frequent = dc_most_frequent.predict(X_val_cv)
y_val_pred_dc_stratified = dc_stratified.predict(X_val_cv)

y_val_pred_lr_cv = lr_cv.predict(X_val_cv)
y_val_pred_lr_tfidf = lr_tfidf.predict(X_val_tfidf)

acc_dc_most_frequent = accuracy_score(y_val_true, y_val_pred_dc_most_frequent)
acc_dc_stratified = accuracy_score(y_val_true, y_val_pred_dc_stratified)
acc_lr_cv = accuracy_score(y_val_true, y_val_pred_lr_cv)
acc_lr_tfidf = accuracy_score(y_val_true, y_val_pred_lr_tfidf)

print("Dummy Classifier with strategy='most_frequent' accuracy:", acc_dc_most_frequent)
print("Dummy Classifier with strategy='stratified' accuracy:", acc_dc_stratified)
print("LogisticRegression with One-hot vectorization accuracy:", acc_lr_cv)
print("LogisticRegression with TF-IDF vectorization accuracy:", acc_lr_tfidf)

In [None]:
# Define the classifiers
dummy_clf_frequent = DummyClassifier(strategy="most_frequent")
# The stratified baseline draws random predictions; seed it so its metrics are reproducible.
dummy_clf_stratified = DummyClassifier(strategy="stratified", random_state=42)
logreg_clf_onehot = LogisticRegression(max_iter=1000)
logreg_clf_tfidf = LogisticRegression(max_iter=1000)
svm_clf_onehot = SVC()

# Vectorize the text data
# One-hot: binary token presence, fitted on the training split only
vectorizer_onehot = CountVectorizer(binary=True)
X_train_onehot = vectorizer_onehot.fit_transform(train_df["Text"])
X_val_onehot = vectorizer_onehot.transform(val_df["Text"])

# TF-IDF, fitted on the training split only
vectorizer_tfidf = TfidfVectorizer()
X_train_tfidf = vectorizer_tfidf.fit_transform(train_df["Text"])
X_val_tfidf = vectorizer_tfidf.transform(val_df["Text"])

# Fit the classifiers
dummy_clf_frequent.fit(X_train_onehot, train_df["label"])
dummy_clf_stratified.fit(X_train_onehot, train_df["label"])
logreg_clf_onehot.fit(X_train_onehot, train_df["label"])
logreg_clf_tfidf.fit(X_train_tfidf, train_df["label"])
svm_clf_onehot.fit(X_train_onehot, train_df["label"])

# Evaluate the classifiers
def evaluate_clf(clf, X, y_true):
    """Score a fitted classifier on one data split.

    Args:
        clf: fitted estimator exposing ``predict``.
        X: feature matrix passed to ``clf.predict``.
        y_true: ground-truth labels aligned with the rows of ``X``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; precision, recall and F1
        are macro-averaged.
    """
    y_pred = clf.predict(X)
    acc = accuracy_score(y_true, y_pred)
    # zero_division=0 scores a never-predicted class as 0 without emitting a warning;
    # it is applied to all three macro metrics so they are handled consistently.
    precision = precision_score(y_true, y_pred, average="macro", zero_division=0)
    recall = recall_score(y_true, y_pred, average="macro", zero_division=0)
    f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)
    return acc, precision, recall, f1

# Score every classifier on the validation split
dummy_frequent_acc, dummy_frequent_prec, dummy_frequent_rec, dummy_frequent_f1 = evaluate_clf(dummy_clf_frequent, X_val_onehot, val_df["label"])
dummy_stratified_acc, dummy_stratified_prec, dummy_stratified_rec, dummy_stratified_f1 = evaluate_clf(dummy_clf_stratified, X_val_onehot, val_df["label"])
logreg_onehot_acc, logreg_onehot_prec, logreg_onehot_rec, logreg_onehot_f1 = evaluate_clf(logreg_clf_onehot, X_val_onehot, val_df["label"])
logreg_tfidf_acc, logreg_tfidf_prec, logreg_tfidf_rec, logreg_tfidf_f1 = evaluate_clf(logreg_clf_tfidf, X_val_tfidf, val_df["label"])
svm_onehot_acc, svm_onehot_prec, svm_onehot_rec, svm_onehot_f1 = evaluate_clf(svm_clf_onehot, X_val_onehot, val_df["label"])

# Print the evaluation metrics: all four metrics for every classifier,
# with a blank line between classifiers.
validation_results = [
    ("Dummy Classifier with strategy='most_frequent'",
     (dummy_frequent_acc, dummy_frequent_prec, dummy_frequent_rec, dummy_frequent_f1)),
    ("Dummy Classifier with strategy='stratified'",
     (dummy_stratified_acc, dummy_stratified_prec, dummy_stratified_rec, dummy_stratified_f1)),
    ("LogisticRegression with One-hot vectorization",
     (logreg_onehot_acc, logreg_onehot_prec, logreg_onehot_rec, logreg_onehot_f1)),
    ("LogisticRegression with TF-IDF vectorization",
     (logreg_tfidf_acc, logreg_tfidf_prec, logreg_tfidf_rec, logreg_tfidf_f1)),
    ("SVM with One-hot vectorization",
     (svm_onehot_acc, svm_onehot_prec, svm_onehot_rec, svm_onehot_f1)),
]
metric_names = ("accuracy", "macro-averaged precision", "macro-averaged recall", "macro-averaged F1")
for position, (clf_name, clf_scores) in enumerate(validation_results):
    if position > 0:
        print()
    for metric_name, metric_value in zip(metric_names, clf_scores):
        print("{} {}: {:.4f}".format(clf_name, metric_name, metric_value))

In [None]:
# Pair each bar's x-axis caption with the macro-F1 score of the matching classifier
f1_by_classifier = [
    ('Dummy Classifier\n(strategy=\'most_frequent\')', dummy_frequent_f1),
    ('Dummy Classifier\n(strategy=\'stratified\')', dummy_stratified_f1),
    ('Logistic Regression\n(with one-hot encoding)', logreg_onehot_f1),
    ('Logistic Regression\n(with TF-IDF encoding)', logreg_tfidf_f1),
    ('SVM\n(with one-hot encoding)', svm_onehot_f1),
]

# Set the x labels and the F1 scores
x_labels = [caption for caption, _ in f1_by_classifier]
f1_scores = [score for _, score in f1_by_classifier]

# One colour per bar
bar_colors = ['#1f77b0', '#1f77b4', '#1f77b9', '#1f77b7', '#1f77b6']

# Create the bar chart
plt.figure(figsize=(10,6))
plt.bar(x_labels, f1_scores, color=bar_colors)
plt.ylim([0,1])  # F1 lies in [0, 1]
plt.ylabel('F1 Score', fontsize=12)
plt.xticks(fontsize=12, rotation=30, ha='right')
plt.title('Comparison of F1 scores of classifiers', fontsize=16)

plt.show()

In [None]:
#comparing classifiers part b (Naive Bayes algorithm)
# Preprocessing
nltk.download('stopwords')
nltk.download('wordnet')
# A set gives constant-time membership tests; preprocess() checks every word against it.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Normalise one review string for bag-of-words modelling.

    Steps, in order: lowercase; drop whitespace-separated words found in
    ``stop_words``; keep only alphabetic and whitespace characters; lemmatize
    each remaining word with ``lemmatizer``. Words are re-joined with single spaces.
    """
    lowered = text.lower()
    kept_words = [token for token in lowered.split() if token not in stop_words]
    without_stopwords = ' '.join(kept_words)
    # Stop words are removed before punctuation is stripped, so a word with
    # attached punctuation is compared against stop_words as-is.
    letters_only = ''.join(
        filter(lambda ch: ch.isalpha() or ch.isspace(), without_stopwords)
    )
    lemmas = (lemmatizer.lemmatize(token) for token in letters_only.split())
    return ' '.join(lemmas)

# Cleaned text goes into a new lowercase 'text' column; the raw 'Text' column is kept.
df['text'] = df['Text'].apply(preprocess)
# Convert 5-star rating to binary label: 1 for scores above 3, otherwise 0
df['rating'] = df['Score'].apply(lambda score: int(score > 3))

from sklearn.model_selection import train_test_split

# 80/20 train/test split on the cleaned text
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['rating'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: vocabulary learned from the training portion only
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Multinomial Naive Bayes on the TF-IDF features
clf = MultinomialNB()
y_pred = clf.fit(X_train_vec, y_train).predict(X_test_vec)

# Report the four metrics on the test portion
for metric_label, metric_fn in (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1 Score', f1_score),
):
    print(f'{metric_label}:', metric_fn(y_test, y_pred))

In [None]:
# Hold out 20% of the reviews as the test set, stratified by star score.
review_texts, review_scores = df['Text'], df['Score']
X_trainval, X_test, y_trainval, y_test = train_test_split(
    review_texts,
    review_scores,
    test_size=0.2,
    random_state=42,
    stratify=review_scores,
)
# A quarter of the remaining 80% becomes validation, giving 60/20/20 overall.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval,
    y_trainval,
    test_size=0.25,
    random_state=42,
    stratify=y_trainval,
)

In [None]:
# Select a random subset of samples between 1000 and 10000.
# The size comes from a seeded generator (reproducible) and is capped at len(df):
# df may already hold fewer rows than the drawn size, and sampling more rows
# than exist without replacement raises ValueError.
num_samples = min(np.random.RandomState(42).randint(1000, 10000), len(df))
df = df.sample(num_samples, random_state=42)

# Convert the 'Text' column to string type
df['Text'] = df['Text'].astype(str)

# Preprocess the text data (raw-string patterns avoid invalid-escape warnings)
df['Text'] = df['Text'].str.lower()  # Convert text to lowercase
df['Text'] = df['Text'].str.replace(r'[^\w\s]', '', regex=True)  # Remove punctuation
df['Text'] = df['Text'].str.replace(r'\d+', '', regex=True)  # Remove digits
df['Text'] = df['Text'].str.strip()  # Remove leading and trailing whitespaces

# Reduce the number of rows
df = df.head(5000)


In [None]:
# 80/20 train+val / test split, then 75/25 train / val, both stratified by score
X_trainval, X_test, y_trainval, y_test = train_test_split(df['Text'], df['Score'], test_size=0.2, random_state=42, stratify=df['Score'])
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.25, random_state=42, stratify=y_trainval)

# TF-IDF + logistic regression in one pipeline, so the vectorizer is re-fitted
# on whatever data the pipeline is fitted on.
# max_iter is raised above the library default, in line with the other
# LogisticRegression models in this notebook, so weakly regularised settings
# (large C) have room to converge.
lr_tfidf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression(max_iter=1000))
])

# Hyperparameter grid; keys are '<pipeline step>__<parameter>'
parameters = {
    'tfidf__sublinear_tf': [True, False],
    'tfidf__max_features': [None, 10000, 20000, 30000, 40000, 50000],
    'tfidf__ngram_range': [(1,1), (1,2), (1,3)],
    'lr__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}

In [None]:
# 5-fold cross-validated grid search over `parameters`, scored by accuracy,
# using all available cores.
grid_search_lr_tfidf = GridSearchCV(
    estimator=lr_tfidf,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    pre_dispatch='2*n_jobs',
)

In [None]:
# Fit the grid search on the raw review text; the pipeline's TF-IDF step
# vectorizes the text itself each time the pipeline is fitted.
grid_search_lr_tfidf.fit(X_trainval, y_trainval)

# Print the best parameters and the corresponding accuracy score
print("Best parameters: ", grid_search_lr_tfidf.best_params_)
print("Best accuracy score: ", grid_search_lr_tfidf.best_score_)

# Score the selected model on the held-out test split, which the search never saw
print("Held-out test accuracy: ", grid_search_lr_tfidf.score(X_test, y_test))

In [None]:

# Shuffle the dataset; the seed makes the row order (and so the split below) reproducible
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

df = df[['Text', 'Score']].copy()
df = df[df['Score'] != 3].reset_index(drop=True)  # remove neutral reviews
df['Sentiment'] = df['Score'].apply(lambda x: 1 if x > 3 else 0)  # convert scores to binary sentiment

df = df[:10000]  # keep at most the first 10,000 rows

X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Sentiment'], test_size=0.2, random_state=42)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import pipeline, RobertaTokenizerFast
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


# Reload the raw reviews so this cell starts from unprocessed text
df = pd.read_csv('Reviews.csv')
df = df.sample(frac=1, random_state=42).reset_index(drop=True)  # shuffle the dataset (seeded)

df = df[['Text', 'Score']]
df = df[df['Score'] != 3].copy()  # remove neutral reviews; copy so the column added below is not set on a slice
df['Sentiment'] = df['Score'].apply(lambda x: 1 if x > 3 else 0)  # convert scores to binary sentiment

df = df[:10000]  # reduce the dataset to 10,000 rows

X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Sentiment'], test_size=0.2, random_state=42)

# Extract features with Roberta
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
feature_extractor = pipeline('feature-extraction', model='roberta-base', tokenizer=tokenizer)

# Tokenizer options are passed through `tokenize_kwargs`.
# NOTE(review): bare `max_length=` / `padding=` keyword arguments appear not to be
# forwarded to the tokenizer by the feature-extraction pipeline -- confirm against
# the installed transformers version.
tokenize_kwargs = {"max_length": 128}

# [0][0] selects the first sequence's first-token vector as the document embedding.
# Failures are no longer caught and printed: an error here should stop the
# notebook rather than leave stale `clf` / `accuracy` values behind.
X_train_features = [feature_extractor(text, truncation=True, tokenize_kwargs=tokenize_kwargs)[0][0] for text in X_train]
X_test_features = [feature_extractor(text, truncation=True, tokenize_kwargs=tokenize_kwargs)[0][0] for text in X_test]

# Train logistic regression classifier (iteration cap raised above the default
# to give the solver room on the dense embedding features)
clf = LogisticRegression(random_state=42, max_iter=1000)
clf.fit(X_train_features, y_train)

# Evaluate on the held-out test split
y_pred = clf.predict(X_test_features)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# Next, we'll use the HuggingFace pipeline to extract features from the text using the 'roberta-base' model and create a logistic regression classifier:
# Extract features with Roberta
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
feature_extractor = pipeline('feature-extraction', model='roberta-base', tokenizer=tokenizer)

# Truncate every review so long texts cannot exceed the model's input limit.
# Tokenizer options are passed through `tokenize_kwargs`.
# NOTE(review): bare `max_length=` / `padding=` keyword arguments appear not to be
# forwarded to the tokenizer by the feature-extraction pipeline -- confirm against
# the installed transformers version.
tokenize_kwargs = {"max_length": 128}

# [0][0] selects the first sequence's first-token vector as the document embedding
X_train_features = [feature_extractor(text, truncation=True, tokenize_kwargs=tokenize_kwargs)[0][0] for text in X_train]
X_test_features = [feature_extractor(text, truncation=True, tokenize_kwargs=tokenize_kwargs)[0][0] for text in X_test]

# Train logistic regression classifier (iteration cap raised above the default
# to give the solver room on the dense embedding features)
clf = LogisticRegression(random_state=42, max_iter=1000)
clf.fit(X_train_features, y_train)

# Evaluate on the held-out test split
y_pred = clf.predict(X_test_features)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# For part (b), we'll train an end-to-end classifier using the HuggingFace 'trainer' function:
from transformers import RobertaForSequenceClassification, TrainingArguments, Trainer


class ReviewDataset(torch.utils.data.Dataset):
    """Tokenized review texts paired with integer labels, indexable by position.

    Trainer needs a dataset whose items are dicts of model inputs plus a
    "labels" entry; a pandas Series of raw strings does not provide that.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Tokenize everything up front; all sequences are padded/truncated to max_length
        self.encodings = tokenizer(list(texts), truncation=True, padding="max_length", max_length=max_length)
        self.labels = [int(label) for label in labels]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item


tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
train_dataset = ReviewDataset(X_train, y_train, tokenizer)
test_dataset = ReviewDataset(X_test, y_test, tokenizer)

model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm which spelling the installed version expects.
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=1e-4,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    weight_decay=0.0,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch',
    save_total_limit=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()

# Evaluate on the held-out test split: the highest logit per row is the predicted class
predictions = trainer.predict(test_dataset)
y_pred = predictions.predictions.argmax(-1)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


In [None]:
# For part (c), we'll try three different sets of hyperparameters.
# Each line assigns (learning rate, number of epochs, batch size) for one set.

# Hyperparameters for set 1
learning_rate_1, num_epochs_1, batch_size_1 = 5e-5, 2, 32

# Hyperparameters for set 2
learning_rate_2, num_epochs_2, batch_size_2 = 1e-5, 1, 64

# Hyperparameters for set 3
learning_rate_3, num_epochs_3, batch_size_3 = 2e-5, 3, 16

# Train models with different hyperparameters
model_1 = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
training_args_1 = TrainingArguments
