In [212]:
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [213]:
df = pd.read_csv('netflix_reviews.csv')
df = pd.concat([df['content'], df['score']], axis=1)
print('Our data set shape is:', df.shape)

Our data set shape is: (108494, 2)


In [214]:
df = df[:20000]

In [215]:
df.shape

(20000, 2)

In [216]:
df.head()

Unnamed: 0,content,score
0,I can't log in I have to pay it but I pay it s...,1
1,I love Netflix is so good I love it so much,5
2,Good,3
3,This was good when people could actually use i...,1
4,"Was working perfectly up until last month, it ...",1


In [217]:
y = df['score']
content = df['content']

### Lowering our content column

In [218]:
content = content.str.lower()

### Stopword Removal and Steamming

In [219]:
stop_words = set(stopwords.words('english'))
stop_words.add('.')
stop_words.remove('not')
pattern = r'\b[a-zA-Z]'
stemmer = PorterStemmer()
def removing_stop_words_from_content(content):
    content_list = content.split()
    filtered_content = [token for token in content_list if re.match(pattern, token)]
    removed_stop_word_content = [stemmer.stem(word) for word in filtered_content if word not in stop_words]
    return ' '.join(removed_stop_word_content)

In [220]:
content = content.apply(removing_stop_words_from_content)

In [221]:
vectorizer = CountVectorizer(max_features=600, binary=True)
binary_matrix = vectorizer.fit_transform(content).toarray()
word_columns = vectorizer.get_feature_names_out()
word_df = pd.DataFrame(binary_matrix, columns=word_columns)

In [222]:
word_df

Unnamed: 0,abil,abl,absolut,access,account,actual,ad,add,adjust,ads,...,worse,worst,worth,would,wrong,year,years,yet,you,youtub
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [223]:
X_train, X_test, y_train, y_test = train_test_split(word_df, y, test_size=0.2, random_state=42)

In [224]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

In [225]:
y_pred = classifier.predict(X_test)

from sklearn.metrics import  accuracy_score
accuracy_score(y_test, y_pred) * 100

44.6

In [226]:
from sklearn.linear_model import LogisticRegression

log_reg_model = LogisticRegression(max_iter=350)

log_reg_model.fit(X_train, y_train)

log_reg_pred = log_reg_model.predict(X_test)

accuracy_score(y_test, log_reg_pred) * 100

58.575