In [1]:
import pandas as pd

In [2]:
df_X = pd.read_csv('processed data/X.csv')

# Improvement to preprocessing

In [3]:
df_X

Unnamed: 0,review,sentiment
0,@chrishasboobs AHHH I HOPE YOUR OK!!!,negative
1,"@misstoriblack cool , i have no tweet apps fo...",negative
2,@TiannaChaos i know just family drama. its la...,negative
3,School email won't open and I have geography ...,negative
4,upper airways problem,negative
...,...,...
399995,@brykins Splendid! I was told I looked like a ...,negative
399996,@herbadmother I'm so sorry! that IS sad,negative
399997,@JosieStingray Sounds like Eddie Murphy is coo...,positive
399998,http://twitpic.com/4incl - The tiny Porter pla...,negative


In [4]:
import os
import re
import emoji
import nltk
import pandas as pd
import multiprocessing
from textblob import TextBlob
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

In [5]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [6]:
import textblob.download_corpora as download_corpora

In [7]:
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [8]:
ABBREVIATIONS = {
    "u": "you", "r": "are", "ur": "your", "n": "and", "luv": "love",
    "im": "I'm", "wanna": "want to", "gonna": "going to", "btw": "by the way",
    "idk": "I don't know", "smh": "shaking my head", "lol": "laugh out loud",
    "omg": "oh my god", "tbh": "to be honest", "brb": "be right back",
    "lmao": "laughing my ass off"
}

In [None]:
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

def correct_spelling(text):
    words = text.split()
    if len(words) <= 5:
        return str(TextBlob(text).correct())
    return text

def preprocess_tweet(text):
    if pd.isna(text):  
        return []

    text = emoji.demojize(text)
    
    text = re.sub(r"http\S+|www\S+", "", text)
    
    text = re.sub(r"#(\w+)", r"\1", text)
    
    words = text.split()
    words = [ABBREVIATIONS.get(word, word) for word in words]
    text = " ".join(words)

    text = correct_spelling(text)

    tokens = tokenizer.tokenize(text)

    tokens = [
        lemmatizer.lemmatize(word, get_wordnet_pos(word))
        for word in tokens if word.isalpha() and word not in stop_words
    ]
    
    return tokens

tqdm.pandas()

In [133]:
df_X["tokenized_review"] = df_X["review"].progress_apply(preprocess_tweet)

100%|██████████| 400000/400000 [1:19:06<00:00, 84.28it/s]  


In [12]:
df_amazon = pd.read_csv('processed data/amazon.csv')

In [13]:
df_amazon.head()

Unnamed: 0,review,sentiment
0,great cd lovely pat one great voices generatio...,positive
1,one best game music soundtracks game didnt rea...,positive
2,batteries died within year bought charger jul ...,negative
3,works fine maha energy better check maha energ...,positive
4,great nonaudiophile reviewed quite bit combo p...,positive


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import numpy as np
import joblib

df_amazon = df_amazon.dropna(subset=["review"])

tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_tfidf = tfidf_vectorizer.fit_transform(df_amazon["review"])

joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.pkl")


['tfidf_vectorizer.pkl']

In [None]:
from gensim.models import FastText

model_ft = FastText(sentences=df_amazon["review"].apply(str.split), vector_size=300, window=10, min_count=5, workers=4)

joblib.dump(model_ft, "fasttext_model.pkl")

['fasttext_model.pkl']

In [None]:
def get_sentence_vector(tokens):
    return np.mean([model_ft.wv[word] for word in tokens if word in model_ft.wv] or [np.zeros(300)], axis=0)

X_train_ft = np.vstack(df_amazon["review"].apply(lambda x: get_sentence_vector(str(x).split())))


In [None]:
X_train_combined = hstack([X_train_tfidf, X_train_ft])

# XGboost

In [None]:
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(
    X_train_combined, df_amazon["sentiment"].map({"positive": 1, "negative": 0}), test_size=0.2, random_state=42
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# def objective(trial):
#     params = {
#         "n_estimators": trial.suggest_int("n_estimators", 50, 500, step=50),
#         "max_depth": trial.suggest_int("max_depth", 3, 10),
#         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
#         "subsample": trial.suggest_float("subsample", 0.6, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
#         "gamma": trial.suggest_float("gamma", 0, 10),
#         "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
#         "eval_metric": "logloss"
#     }
    
#     # 训练 XGBoost
#     model = XGBClassifier(**params)
#     model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    
#     # 预测测试集
#     y_pred = model.predict(X_test)
#     accuracy = accuracy_score(y_test, y_pred)
    
#     return accuracy 

# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=10, n_jobs=4) 

# best_params = study.best_params
# print("最佳超参数:", best_params)

[I 2025-03-07 22:51:05,602] A new study created in memory with name: no-name-f2e7f445-82c1-4790-bb5c-ef018fc571f7
[I 2025-03-07 22:56:22,976] Trial 2 finished with value: 0.8406 and parameters: {'n_estimators': 50, 'max_depth': 8, 'learning_rate': 0.11264082324782776, 'subsample': 0.7343986371114865, 'colsample_bytree': 0.66566162186621, 'gamma': 5.913858458647714, 'min_child_weight': 2}. Best is trial 2 with value: 0.8406.
[I 2025-03-07 23:00:52,924] Trial 0 finished with value: 0.8109 and parameters: {'n_estimators': 100, 'max_depth': 7, 'learning_rate': 0.02699383083086185, 'subsample': 0.6970975399704095, 'colsample_bytree': 0.7146419801195466, 'gamma': 8.023962060852345, 'min_child_weight': 3}. Best is trial 2 with value: 0.8406.
[I 2025-03-07 23:18:28,505] Trial 3 finished with value: 0.8558 and parameters: {'n_estimators': 350, 'max_depth': 4, 'learning_rate': 0.051278262523794686, 'subsample': 0.9409987492379168, 'colsample_bytree': 0.8085812121246679, 'gamma': 4.22615912970077

最佳超参数: {'n_estimators': 450, 'max_depth': 8, 'learning_rate': 0.2469059744142288, 'subsample': 0.8900172778036246, 'colsample_bytree': 0.7135880444264231, 'gamma': 5.839036635890301, 'min_child_weight': 10}


In [None]:
best_xgb = XGBClassifier(
    n_estimators=450,
    max_depth=8,
    learning_rate=0.2469,
    subsample=0.8900,
    colsample_bytree=0.7136,
    gamma=5.8390,
    min_child_weight=10,
    eval_metric="logloss"
)

best_xgb.fit(X_train, y_train)

y_pred_best = best_xgb.predict(X_test)

from sklearn.metrics import classification_report
print("TF-IDF + FastText + XGBoost Performance on Amazon:")
print(classification_report(y_test, y_pred_best))

joblib.dump(best_xgb, "optimized_tfidf_fasttext_xgboost.pkl")

TF-IDF + FastText + XGBoost Performance on Amazon:
              precision    recall  f1-score   support

           0       0.89      0.89      0.89     40007
           1       0.89      0.89      0.89     39993

    accuracy                           0.89     80000
   macro avg       0.89      0.89      0.89     80000
weighted avg       0.89      0.89      0.89     80000



['optimized_tfidf_fasttext_xgboost.pkl']

In [24]:
df_X_tokenized = pd.read_csv('processed data/X_tokenized.csv')

In [25]:
df_X_tokenized

Unnamed: 0,review,sentiment,tokenized_review
0,Singstar nite last nite....my throat hurts,negative,"['singstar', 'nite', 'last', 'nite', 'throat',..."
1,Up and ready for work sad times mmm toast htt...,negative,"['ready', 'work', 'sad', 'time', 'mmm', 'toast']"
2,Watching another movie,positive,"['watch', 'another', 'movie']"
3,Listening to Ed Schultz interview of Mich. Gov...,negative,"['listen', 'ed', 'schultz', 'interview', 'mich..."
4,@timothyh2o last time his bro was making up st...,negative,"['last', 'time', 'bro', 'make', 'story', 'coul..."
...,...,...,...
399995,watching King of the Hill right now... hilario...,positive,"['watch', 'king', 'hill', 'right', 'hilarious']"
399996,is excited for the weekend and what's in store..,positive,"['excite', 'weekend', 'store']"
399997,"@i_am_TC k, and you have fun with what you're ...",positive,"['k', 'fun']"
399998,off for a bit . going to try and rest my poor ...,negative,"['bit', 'go', 'try', 'rest', 'poor', 'head']"


In [None]:
tfidf_vectorizer = joblib.load("tfidf_vectorizer.pkl")
model_ft = joblib.load("fasttext_model.pkl")
best_xgb = joblib.load("optimized_tfidf_fasttext_xgboost.pkl")

X_twitter_tfidf = tfidf_vectorizer.transform(df_X_tokenized["tokenized_review"])

def get_sentence_vector(tokens):
    return np.mean([model_ft.wv[word] for word in tokens if word in model_ft.wv] or [np.zeros(300)], axis=0)

X_twitter_ft = np.vstack(df_X_tokenized["tokenized_review"].apply(lambda x: get_sentence_vector(str(x).split())))

X_twitter_combined = hstack([X_twitter_tfidf, X_twitter_ft])

In [None]:
df_X_tokenized["prediction"] = best_xgb.predict(X_twitter_combined)

df_X_tokenized["predict_sentiment"] = df_X_tokenized["prediction"].map({0: "negative", 1: "positive"})

print(df_X_tokenized["predict_sentiment"].value_counts())


predict_sentiment
positive    299073
negative    100927
Name: count, dtype: int64


In [None]:
from sklearn.metrics import classification_report
print("XGBoost performance on  Twitter:")
print(classification_report(df_X_tokenized["sentiment"], df_X_tokenized["predict_sentiment"]))

XGBoost performance on  Twitter:
              precision    recall  f1-score   support

    negative       0.64      0.32      0.43    200022
    positive       0.55      0.82      0.65    199978

    accuracy                           0.57    400000
   macro avg       0.59      0.57      0.54    400000
weighted avg       0.59      0.57      0.54    400000



# LightGBM

In [29]:
from lightgbm import LGBMClassifier

model_lgb = LGBMClassifier(n_estimators=500, max_depth=10, learning_rate=0.05)
model_lgb.fit(X_train, y_train)


df_X_tokenized["prediction_lgb"] = model_lgb.predict(X_twitter_combined)
df_X_tokenized["predict_sentiment_lgb"] = df_X_tokenized["prediction_lgb"].map({0: "negative", 1: "positive"})

print("LightGBM performance on Twitter:")
print(classification_report(df_X_tokenized["sentiment"], df_X_tokenized["predict_sentiment_lgb"]))

[LightGBM] [Info] Number of positive: 160006, number of negative: 159993
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.613604 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 911469
[LightGBM] [Info] Number of data points in the train set: 319999, number of used features: 5300
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500020 -> initscore=0.000081
[LightGBM] [Info] Start training from score 0.000081




LightGBM performance on Twitter:
              precision    recall  f1-score   support

    negative       0.66      0.27      0.38    200022
    positive       0.54      0.86      0.67    199978

    accuracy                           0.57    400000
   macro avg       0.60      0.57      0.52    400000
weighted avg       0.60      0.57      0.52    400000



# LGR

In [30]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(max_iter=500)
log_reg.fit(X_train, y_train)

In [31]:
df_X_tokenized["prediction_lgr"] = log_reg.predict(X_twitter_combined)
df_X_tokenized["predict_sentiment_lgr"] = df_X_tokenized["prediction_lgr"].map({0: "negative", 1: "positive"})

print("LGR performance on Twitter:")
print(classification_report(df_X_tokenized["sentiment"], df_X_tokenized["predict_sentiment_lgr"]))

LGR performance on Twitter:
              precision    recall  f1-score   support

    negative       0.59      0.48      0.53    200022
    positive       0.56      0.67      0.61    199978

    accuracy                           0.58    400000
   macro avg       0.58      0.58      0.57    400000
weighted avg       0.58      0.58      0.57    400000

