Without Pre-processing

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
reviews_df = pd.read_csv('../_data/Reviews.csv')
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(reviews_df['Text'])

In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import sklearn.model_selection as skms

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = skms.train_test_split(tfidf_matrix, reviews_df['Score'], test_size=0.2, random_state=42)

# Multinomial Naive Bayes
nb = MultinomialNB()
naive_bayes = nb.fit(X_train, y_train)
predicted = naive_bayes.predict(X_test)

# Metrics
print(metrics.confusion_matrix(y_test, predicted))
print(metrics.classification_report(y_test, predicted))

[[  923     0     1    13  9389]
 [   91     5     1    11  5747]
 [   44     1    11    26  8403]
 [   26     4     4   156 15933]
 [   42    15    10    34 72801]]
              precision    recall  f1-score   support

           1       0.82      0.09      0.16     10326
           2       0.20      0.00      0.00      5855
           3       0.41      0.00      0.00      8485
           4       0.65      0.01      0.02     16123
           5       0.65      1.00      0.79     72902

    accuracy                           0.65    113691
   macro avg       0.55      0.22      0.19    113691
weighted avg       0.62      0.65      0.52    113691



With Pre-processing

In [6]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Tokenize into words
    tokens = word_tokenize(text)
    
    # Remove stop words
    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    return ' '.join(filtered_tokens)  # Joining tokens into a single string

[nltk_data] Downloading package punkt to /home/lina/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/lina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Preprocess text data
text_data_preprocessed = [preprocess_text(text) for text in reviews_df['Text']]

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(text_data_preprocessed)

In [8]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = skms.train_test_split(tfidf_matrix, reviews_df['Score'], test_size=0.2, random_state=42)

# Multinomial Naive Bayes
nb = MultinomialNB()
naive_bayes = nb.fit(X_train, y_train)
predicted = naive_bayes.predict(X_test)

# Metrics
print(metrics.confusion_matrix(y_test, predicted))
print(metrics.classification_report(y_test, predicted))

[[  458     0     0     3  9865]
 [   26     8     0     3  5818]
 [   19     1    10    10  8445]
 [    5     0     2   138 15978]
 [   15     6     3    20 72858]]
              precision    recall  f1-score   support

           1       0.88      0.04      0.08     10326
           2       0.53      0.00      0.00      5855
           3       0.67      0.00      0.00      8485
           4       0.79      0.01      0.02     16123
           5       0.64      1.00      0.78     72902

    accuracy                           0.65    113691
   macro avg       0.70      0.21      0.18    113691
weighted avg       0.68      0.65      0.51    113691

