In [1]:
import pandas as pd

# Clean the Data

## IMDB

In [2]:
df_imdb = pd.read_csv('The Raw Data/IMDB Dataset.csv')

In [3]:
df_imdb.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Amazon

In [5]:
file_path = "Data Resource/test.ft.txt"

with open(file_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

data = []
for line in lines:
    parts = line.strip().split(" ", 1)  
    label = int(parts[0].replace("__label__", "")) 
    text = parts[1] if len(parts) > 1 else "" 
    data.append([text, label])


df_amazon = pd.DataFrame(data, columns=["review", "sentiment"])

df_amazon["sentiment"] = df_amazon["sentiment"].map({1: "negative", 2: "positive"})


df_amazon.head()


Unnamed: 0,review,sentiment
0,Great CD: My lovely Pat has one of the GREAT v...,positive
1,One of the best game music soundtracks - for a...,positive
2,Batteries died within a year ...: I bought thi...,negative
3,"works fine, but Maha Energy is better: Check o...",positive
4,Great for the non-audiophile: Reviewed quite a...,positive


## X

In [6]:
file_path = "The Raw Data/training.1600000.processed.noemoticon.csv"  


columns = ["sentiment", "id", "date", "query", "username", "review"]


df_X = pd.read_csv(file_path, encoding="latin-1", header=None, names=columns)


df_X = df_X[["sentiment", "review"]] 

df_X["sentiment"] = df_X["sentiment"].map({0: "negative", 4: "positive"})

df_X = df_X[["review", "sentiment"]]

df_X.head()


Unnamed: 0,review,sentiment
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",negative
1,is upset that he can't update his Facebook by ...,negative
2,@Kenichan I dived many times for the ball. Man...,negative
3,my whole body feels itchy and like its on fire,negative
4,"@nationwideclass no, it's not behaving at all....",negative


In [7]:
from sklearn.utils import shuffle

df_X = shuffle(df_X, random_state=42)

df_X = df_X.reset_index(drop=True)

df_X.head()

Unnamed: 0,review,sentiment
0,@chrishasboobs AHHH I HOPE YOUR OK!!!,negative
1,"@misstoriblack cool , i have no tweet apps fo...",negative
2,@TiannaChaos i know just family drama. its la...,negative
3,School email won't open and I have geography ...,negative
4,upper airways problem,negative


In [8]:
df_40w = df_X.sample(n=400000, random_state=42)

# tokenization

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# 初始化处理工具
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# 预处理函数
def preprocess_text(text):
    text = text.lower()  
    text = re.sub(r'<.*?>', '', text) 
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words] 
    words = [lemmatizer.lemmatize(word, pos="v") for word in words]
    return " ".join(words)

df_imdb["review"] = df_imdb["review"].apply(preprocess_text)

df_amazon["review"] = df_amazon["review"].apply(preprocess_text)

df_40w["review"] = df_40w["review"].apply(preprocess_text)

print("IMDB")
print(df_imdb.head())

print("Amazon")
print(df_amazon.head())

print("X")
print(df_40w.head())

IMDB
                                              review sentiment
0  one reviewers mention watch oz episode youll h...  positive
1  wonderful little production film technique una...  positive
2  think wonderful way spend time hot summer week...  positive
3  basically theres family little boy jake think ...  negative
4  petter matteis love time money visually stun f...  positive
Amazon
                                              review sentiment
0  great cd lovely pat one great voice generation...  positive
1  one best game music soundtracks game didnt rea...  positive
2  batteries die within year buy charger jul work...  negative
3  work fine maha energy better check maha energy...  positive
4  great nonaudiophile review quite bite combo pl...  positive
X
                                                   review sentiment
541200              singstar nite last nitemy throat hurt  negative
750       ready work sad time mmm toast httptwitpiccomjhc  negative
766711                    

# Baseline on amazon review dataset

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [28]:
X_train = df_amazon["review"]
y_train = df_amazon["sentiment"]

X_test = df_imdb["review"]
y_test = df_imdb["sentiment"]

In [40]:
vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1,3))
X_train = vectorizer.fit_transform(df_amazon['review'])
X_test = vectorizer.transform(df_imdb['review'])

In [45]:
log_reg = LogisticRegression(max_iter=2000)
log_reg.fit(X_train, y_train)

In [46]:
y_pred_log = log_reg.predict(X_test)

In [47]:
accuracy_log = accuracy_score(y_test, y_pred_log)
report_log = classification_report(y_test, y_pred_log, output_dict=True)

In [48]:
report_log_df = pd.DataFrame(report_log).transpose()

report_log_df

Unnamed: 0,precision,recall,f1-score,support
0,0.85581,0.90264,0.878601,25000.0
1,0.897004,0.84792,0.871772,25000.0
accuracy,0.87528,0.87528,0.87528,0.87528
macro avg,0.876407,0.87528,0.875187,50000.0
weighted avg,0.876407,0.87528,0.875187,50000.0


# Base on X comments

In [19]:
X_train = df_amazon["review"]
y_train = df_amazon["sentiment"]

X_test = df_40w["review"]
y_test = df_40w["sentiment"]

In [21]:
vectorizer = CountVectorizer(ngram_range=(1,1), max_features=2000)
X_train_tf = vectorizer.fit_transform(X_train)
X_test_tf = vectorizer.transform(X_test)

In [22]:
log_reg = LogisticRegression(max_iter=500)
log_reg.fit(X_train_tf, y_train)

In [23]:
y_pred_log = log_reg.predict(X_test_tf)

In [24]:
accuracy_log = accuracy_score(y_test, y_pred_log)
report_log = classification_report(y_test, y_pred_log, output_dict=True)

In [25]:
report_log_df = pd.DataFrame(report_log).transpose()

report_log_df

Unnamed: 0,precision,recall,f1-score,support
negative,0.600194,0.668102,0.63233,200022.0
positive,0.625666,0.554861,0.58814,199978.0
accuracy,0.611487,0.611487,0.611487,0.611487
macro avg,0.61293,0.611481,0.610235,400000.0
weighted avg,0.612929,0.611487,0.610237,400000.0
