# Amazon Fine Food Reviews

In [1]:
# Amazon Fine Food Reviews
import pandas as pd
df = pd.read_csv("Reviews.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568438 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB


In [51]:
# 选择1000条作为样本
data = df[:1000]
# 只保留data 的Text 和 Score
data = data[['Text','Score']]
# data 的 Score 如果大于3，那么就是正面评价，否则就是负面评价
data['sentiment'] = data['Score'].apply(lambda rating : +1 if rating >= 3 else 0)
data.head(5)

Unnamed: 0,Text,Score,sentiment
0,I have bought several of the Vitality canned d...,5,1
1,Product arrived labeled as Jumbo Salted Peanut...,1,0
2,This is a confection that has been around a fe...,4,1
3,If you are looking for the secret ingredient i...,2,0
4,Great taffy at a great price. There was a wid...,5,1


In [30]:
# 划分测试集合训练集
from sklearn.model_selection import train_test_split
train_text, test_text, train_labels, test_labels = train_test_split(data['Text'], data['sentiment'], random_state=2020)
print(train_labels.value_counts())
print(test_labels.value_counts())

1    642
0    108
Name: sentiment, dtype: int64
1    213
0     37
Name: sentiment, dtype: int64


# roberta-base

In [1]:
import torch
import transformers

# 加载 RoBERTa 预训练模型和 tokenizer
model = transformers.RobertaForSequenceClassification.from_pretrained('roberta-base')
tokenizer = transformers.RobertaTokenizer.from_pretrained('roberta-base')

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [22]:
# 将 data.Text 转为数组
train_text_data = train_text.values
print(train_text_data.shape)

# 将train_text_data转成可放入 tokenizer 的格式
train_text_token = tokenizer.batch_encode_plus(train_text.values.tolist(), max_length=128, padding=True, truncation=True)

print(train_text_token['input_ids'])
print(train_text_token['attention_mask'])


(750,)
[[0, 100, 348, 3584, 4295, 9, 5565, 9, 42, 27974, 5113, 3344, 36, 5488, 16, 10, 372, 1152, 6, 30, 5, 169, 322, 1437, 152, 16, 5, 1609, 425, 38, 348, 450, 4, 1437, 14727, 8465, 5723, 42, 6880, 8, 24, 18, 57, 15, 1392, 89, 13, 68, 466, 4, 2831, 36, 6025, 18, 1563, 1932, 6, 350, 6, 25, 38, 437, 11, 896, 322, 1437, 85, 18, 190, 7246, 15, 5643, 18, 7803, 18, 308, 998, 6, 8, 51, 214, 5, 138, 2183, 42, 1152, 149, 1645, 4, 286, 10, 80, 12, 12486, 51, 197, 28, 6489, 68, 844, 50, 540, 4, 1437, 653, 2029, 116, 1437, 38, 802, 1645, 21, 147, 47, 115, 465, 2695, 850, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1], [0, 100, 21, 385, 493, 3807, 26427, 11, 5, 12117, 8, 18632, 9, 42, 3344, 4, 1437, 38, 2333, 101, 144, 9, 5, 6207, 27525, 383, 38, 33, 1381, 6, 53, 21, 385, 493, 3807, 26427, 11, 42, 2167, 65, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [23]:
# 将数据包装成 PyTorch tensor 格式
train_dataset = torch.utils.data.TensorDataset(torch.tensor(train_text_token['input_ids']),
                                               torch.tensor(train_text_token['attention_mask']),
                                               torch.tensor(train_labels.to_list()))

In [24]:
# 定义训练参数和优化器
batch_size = 32
epochs = 5
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)

# 训练
model.train()
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
for epoch in range(epochs):
  for batch in train_loader:
    input_ids, attention_mask, labels = tuple(t for t in batch)
    optimizer.zero_grad()
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    logits = outputs.logits
    loss.backward()
    optimizer.step()

In [27]:
# 验证
model.eval()
with torch.no_grad():
    val_encodings = tokenizer(test_text.values.tolist(), truncation=True, padding=True, max_length=128)
    val_dataset = torch.utils.data.TensorDataset(torch.tensor(val_encodings['input_ids']),
                                               torch.tensor(val_encodings['attention_mask']),
                                               torch.tensor(test_labels.tolist()))
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size)
    num_correct = 0
    num_total = 0
    for batch in val_loader:
        input_ids, attention_mask, labels = tuple(t for t in batch)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=1)
        num_correct += (predictions == labels).sum().item()
        num_total += labels.size(0)
    accuracy = num_correct / num_total
    print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.91


# svm with tfidf

In [31]:
# 对 train_text 计算 tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
train_tfidf = tfidf.fit_transform(train_text)
print(train_tfidf.shape)


(750, 5217)


In [52]:
# 查看 train_tfidf 数据
print(train_tfidf[0])

  (0, 38)	0.13682729464783203
  (0, 127)	0.15357844510523774
  (0, 283)	0.1605976553546986
  (0, 297)	0.06466872351269028
  (0, 378)	0.06206272062591451
  (0, 464)	0.08869159472436675
  (0, 467)	0.18747810879012877
  (0, 507)	0.06774212006009211
  (0, 535)	0.09088684534603532
  (0, 649)	0.10204993504822282
  (0, 744)	0.08507824613793524
  (0, 774)	0.18747810879012877
  (0, 775)	0.18747810879012877
  (0, 817)	0.16268335636747097
  (0, 872)	0.17650625605622064
  (0, 877)	0.12385008623566836
  (0, 1031)	0.12023673764923691
  (0, 1042)	0.17650625605622064
  (0, 1140)	0.10148023151357564
  (0, 1458)	0.1577497499205594
  (0, 1478)	0.16872160265446753
  (0, 1645)	0.09127068041125742
  (0, 1820)	0.08633707396434587
  (0, 1891)	0.10709893577826265
  (0, 1897)	0.08958234216856466
  :	:
  (0, 3585)	0.14945697578185665
  (0, 3627)	0.11511586458375395
  (0, 3691)	0.09829303218142064
  (0, 3936)	0.14996509651880627
  (0, 4036)	0.14134776454491904
  (0, 4044)	0.14996509651880627
  (0, 4123)	0.1151158

In [37]:
# 将train_tfidf带入svm模型中训练模型
from sklearn import svm
clf = svm.SVC()
model = clf.fit(train_tfidf, train_labels)


In [38]:
# 通过 model 模型对 test_text 进行预测
test_tfidf = tfidf.transform(test_text)
print(test_tfidf.shape)
predictions = model.predict(test_tfidf)


(250, 5217)


In [39]:
# 对预测结果进行性能评估
from sklearn.metrics import accuracy_score
print(accuracy_score(test_labels, predictions))


0.852


# Vader 情感词典

In [53]:
# 使用 vader 对 train_text 进行情感分析
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
train_text_data = train_text.values
train_text_data = [sid.polarity_scores(text) for text in train_text_data]
print(train_text_data[0])


{'neg': 0.0, 'neu': 0.893, 'pos': 0.107, 'compound': 0.802}


In [48]:
# 使用 vader 对 test_text 进行情感分析，并评估准确率
test_text_data = test_text.values
test_text_data = [sid.polarity_scores(text) for text in test_text_data]
predictions = [1 if text['compound'] >= 0 else 0 for text in test_text_data]
from sklearn.metrics import accuracy_score
print(accuracy_score(test_labels, predictions))

0.86


# TextBlob

In [49]:
# 使用 TextBlob 对 test_text 进行情感分析，并评估准确率
from textblob import TextBlob
test_text_data = test_text.values
test_text_data = [TextBlob(text).sentiment.polarity for text in test_text_data]
predictions = [1 if text >= 0 else 0 for text in test_text_data]
from sklearn.metrics import accuracy_score
print(accuracy_score(test_labels, predictions))


0.828
