In [20]:
import re
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Read the review export and keep only the review text and its rating.
df = pd.read_csv('netflix_reviews.csv')[['content', 'score']]
print('Our data set shape is:', df.shape)

Our data set shape is: (108494, 2)


In [3]:
# Work with the first 35,000 rows only (positional slice).
df = df.iloc[:35000]

In [4]:
# Display (rows, columns) of the current frame.
df.shape

(35000, 2)

In [5]:
# Preview the first rows of the review text and score columns.
df.head()

Unnamed: 0,content,score
0,I can't log in I have to pay it but I pay it s...,1
1,I love Netflix is so good I love it so much,5
2,Good,3
3,This was good when people could actually use i...,1
4,"Was working perfectly up until last month, it ...",1


In [6]:
# Features are the review text; the target is the score column.
X, y = df['content'], df['score']

### Lowercasing the content column

In [7]:
# Lowercase every review via the vectorised string accessor.
X = X.str.lower()

### Stopword Removal and Stemming

In [8]:
# English stop words from NLTK, plus the bare period token.
stop_words = set(stopwords.words('english')) | {'.'}
# Regex used to keep only tokens whose first character is an ASCII letter.
pattern = r'\b[a-zA-Z]'
# Shared stemmer instance reused for every token.
stemmer = PorterStemmer()
def removing_stop_words_from_content(content):
    """Tokenize a review, drop noise tokens and stop words, and stem the rest.

    Args:
        content: Review text. Any value that is not a ``str`` (for example the
            ``NaN`` pandas uses for a missing value) is treated as an empty
            review.

    Returns:
        A single space-joined string of Porter-stemmed tokens. Only tokens
        that match ``pattern`` (start with an ASCII letter) and are not in
        ``stop_words`` are kept. ``''`` is returned for non-string input.
    """
    # word_tokenize expects a string; short-circuit missing / non-text values
    # instead of letting one bad row abort the whole Series.apply call.
    if not isinstance(content, str):
        return ''
    kept_stems = []
    for token in word_tokenize(content):
        # Skip punctuation, digits and other tokens not starting with a letter.
        if not re.match(pattern, token):
            continue
        # Stop words are checked on the raw token, before stemming.
        if token in stop_words:
            continue
        kept_stems.append(stemmer.stem(token))
    return ' '.join(kept_stems)

In [9]:
# Run the tokenize / filter / stem step on each review.
X = X.map(removing_stop_words_from_content)

In [10]:
# Binary bag-of-words: each column flags whether a term occurs in the review.
vectorizer = CountVectorizer(binary=True)
# NOTE: .toarray() materialises the full dense matrix; with a large vocabulary
# this can need a lot of memory.
binary_matrix = vectorizer.fit_transform(X).toarray()
word_columns = vectorizer.get_feature_names_out()
word_df = pd.DataFrame(data=binary_matrix, columns=word_columns)

In [11]:
# Display (documents, vocabulary terms) of the bag-of-words frame.
word_df.shape

(35000, 16869)

In [12]:
# Hold out 20% of the bag-of-words rows for testing; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    word_df,
    y,
    random_state=42,
    test_size=0.2,
)

In [13]:
# Baseline classifier on the binary bag-of-words features.
model = LogisticRegression(max_iter=1000)
model.fit(X=X_train, y=y_train)

In [14]:
# Score the baseline on the held-out rows.
y_pred = model.predict(X=X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5814285714285714


In [17]:
# List the column labels of df.
df.columns

Index(['content', 'score'], dtype='object')

In [58]:
# Cap the tokenizer's working vocabulary with num_words=40000.
tokenizer = Tokenizer(num_words=40000)
# Build the word index from the raw review text, then encode each review
# as a list of integer word ids.
tokenizer.fit_on_texts(texts=df['content'])
sequences = tokenizer.texts_to_sequences(texts=df['content'])

In [59]:
max_length = 100  # Adjust as needed
# Bring every sequence to max_length tokens, padding at the end.
padded_sequences = pad_sequences(
    sequences,
    padding='post',
    maxlen=max_length,
)

In [60]:
# One-hot encode the integer scores for use with categorical cross-entropy.
# Six slots let scores 0..5 index directly.
# NOTE(review): the previewed scores are 1, 3 and 5 - confirm whether 0 ever occurs.
num_classes = 6
labels = tf.keras.utils.to_categorical(df['score'].to_numpy(), num_classes=num_classes)

In [61]:
# Hold out 20% of the padded sequences for testing; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels,
    random_state=42,
    test_size=0.2,
)

In [62]:
# Define the neural network architecture
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=40000, output_dim=16, input_length=max_length),
    tf.keras.layers.LSTM(units=64),
    tf.keras.layers.Dense(units=num_classes, activation='softmax')
])

In [63]:
# Compile the model
# Adam optimiser with categorical cross-entropy (labels are one-hot); track accuracy.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [64]:
# Train for 20 epochs, holding back 20% of the training rows for validation.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_split=0.2,
)

Epoch 1/20
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 26ms/step - accuracy: 0.4662 - loss: 1.4695 - val_accuracy: 0.4796 - val_loss: 1.4170
Epoch 2/20
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 25ms/step - accuracy: 0.4660 - loss: 1.4296 - val_accuracy: 0.4796 - val_loss: 1.4161
Epoch 3/20
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 26ms/step - accuracy: 0.4698 - loss: 1.4104 - val_accuracy: 0.4796 - val_loss: 1.4143
Epoch 4/20
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 26ms/step - accuracy: 0.4685 - loss: 1.4284 - val_accuracy: 0.4796 - val_loss: 1.4131
Epoch 5/20
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 24ms/step - accuracy: 0.4739 - loss: 1.4203 - val_accuracy: 0.4796 - val_loss: 1.4155
Epoch 6/20
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 24ms/step - accuracy: 0.4736 - loss: 1.4202 - val_accuracy: 0.4796 - val_loss: 1.4110
Epoch 7/20
[1m7

<keras.src.callbacks.history.History at 0x2931b5996d0>

In [65]:
# Loss and accuracy on the held-out test split.
loss, accuracy = model.evaluate(x=X_test, y=y_test)

[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.4777 - loss: 1.3308


In [66]:
# Report the loss and accuracy values unpacked from model.evaluate.
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Test Loss: 1.315718173980713
Test Accuracy: 0.4831428527832031
