In [23]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from nltk import word_tokenize, WordNetLemmatizer
from nltk.corpus import stopwords
from gensim.models import word2vec
import string
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [11]:
df = pd.read_csv('data_text/cleaned_reviews.csv')
df

Unnamed: 0,sentiments,cleaned_review,cleaned_review_length,review_score
0,positive,i wish would have gotten one earlier love it a...,19,5.0
1,neutral,i ve learned this lesson again open the packag...,88,1.0
2,neutral,it is so slow and lags find better option,9,2.0
3,neutral,roller ball stopped working within months of m...,12,1.0
4,neutral,i like the color and size but it few days out ...,21,1.0
...,...,...,...,...
17335,positive,i love this speaker and love can take it anywh...,30,5.0
17336,positive,i use it in my house easy to connect and loud ...,13,4.0
17337,positive,the bass is good and the battery is amazing mu...,41,5.0
17338,positive,love it,2,5.0


In [12]:
ps = PorterStemmer() 

def preprocessing(sentence, lemma=True, stemming=True, tokenize=True, stopword=True):

    sentence = str(sentence).lower()
    sentence = re.sub(f'[{re.escape(string.punctuation)}]', ' ', sentence)
    sentence = re.sub(' +', ' ', sentence)

    if tokenize:
        words = word_tokenize(sentence)
    else:
        words = sentence.split()

    if stopword:
        stop_words = set(stopwords.words('english'))
        words = [word for word in words if word not in stop_words]

    if stemming:
        words = [ps.stem(word) for word in words]

    if lemma:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]
    
    return ' '.join(words), words

df['processed_text'] = ""
df['words_list'] = None

for i, row in df.iterrows():
    processed_sentence, words_list = preprocessing(row['cleaned_review'])
    df.at[i, 'processed_text'] = processed_sentence
    df.at[i, 'words_list'] = words_list

In [13]:
df

Unnamed: 0,sentiments,cleaned_review,cleaned_review_length,review_score,processed_text,words_list
0,positive,i wish would have gotten one earlier love it a...,19,5.0,wish would gotten one earlier love make work l...,"[wish, would, gotten, one, earlier, love, make..."
1,neutral,i ve learned this lesson again open the packag...,88,1.0,learn lesson open packag use product right awa...,"[learn, lesson, open, packag, use, product, ri..."
2,neutral,it is so slow and lags find better option,9,2.0,slow lag find better option,"[slow, lag, find, better, option]"
3,neutral,roller ball stopped working within months of m...,12,1.0,roller ball stop work within month minim use p...,"[roller, ball, stop, work, within, month, mini..."
4,neutral,i like the color and size but it few days out ...,21,1.0,like color size day return period hold charg,"[like, color, size, day, return, period, hold,..."
...,...,...,...,...,...,...
17335,positive,i love this speaker and love can take it anywh...,30,5.0,love speaker love take anywher charg phone wor...,"[love, speaker, love, take, anywher, charg, ph..."
17336,positive,i use it in my house easy to connect and loud ...,13,4.0,use hous easi connect loud clear music,"[use, hous, easi, connect, loud, clear, music]"
17337,positive,the bass is good and the battery is amazing mu...,41,5.0,bass good batteri amaz much better charg thing...,"[bass, good, batteri, amaz, much, better, char..."
17338,positive,love it,2,5.0,love,[love]


In [14]:
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['processed_text'])

tfidf_representation = X_tfidf.toarray()

In [15]:
X = tfidf_representation

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, df.sentiments, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.5, random_state=42)

In [26]:
lg = LogisticRegression(max_iter=1500, solver='lbfgs', class_weight='balanced')

In [41]:
model = lg.fit(X_train, y_train)

In [42]:
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
f1_micro = f1_score(y_val, y_val_pred, average='micro')
f1_macro = f1_score(y_val, y_val_pred, average='macro')

y_val_proba = model.predict_proba(X_val)
auroc = roc_auc_score(y_val, y_val_proba, multi_class='ovr')

In [43]:
print(f'Validation Accuracy: {val_accuracy}')
print(f'F1 Micro: {f1_micro}')
print(f'F1 Macro: {f1_macro}')
print(f'AUROC: {auroc}')

Validation Accuracy: 0.7773933102652826
F1 Micro: 0.7773933102652826
F1 Macro: 0.7144344758232339
AUROC: 0.9124446741393308
