In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# Fetch the NLTK data packages named below so the NLTK-based cells can run on a
# fresh environment.
# NOTE(review): `stopwords` is downloaded and imported, but no visible cell uses it.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Load the review dataset; later cells read its 'text' and 'label' columns.
# NOTE(review): hard-coded absolute path (looks like a Colab upload location) —
# adjust when running elsewhere.
df=pd.read_csv('/content/movie.csv')

In [None]:
# Preview the first rows to confirm the file parsed as expected.
df.head()

In [None]:
# Preview the last rows as well, to spot truncated or malformed trailing records.
df.tail()

In [None]:
# List the column names available in the loaded frame.
df.columns

In [None]:
# (rows, columns) of the raw dataset, before duplicates are dropped.
df.shape

In [None]:
# Summary statistics as produced by pandas' default describe().
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Count rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Drop exact duplicate rows (first occurrence is kept by default).
# The index is not reset here, so it is no longer contiguous if rows were removed.
df=df.drop_duplicates()

In [None]:
# Distinct values present in the target column.
df['label'].unique()

In [None]:
# Class-balance check: bar chart of how many reviews carry each label.
fig, ax = plt.subplots()
sns.countplot(data=df, x='label', ax=ax)
ax.set_title('Distribution of Positive and Negative Reviews')
ax.set_xlabel('Label (0 = Negative, 1 = Positive)')
ax.set_ylabel('Count')
plt.show()

# Data Pre-Processing

Convert to lower case

In [None]:
# Lower-case every review so later text processing is case-insensitive.
df['lower_text']=df['text'].str.lower()

Remove special characters and emoji

In [None]:
# Step 1: strip every non-ASCII character (emoji, accented letters, ...).
# astype(str) guarantees string input for the regex passes and the tokenizer.
df['removed_text'] = df['lower_text'].astype(str).str.replace(r'[^\x00-\x7F]+', '', regex=True)
# Step 2: drop remaining punctuation/symbols. This must read the result of
# step 1 ('removed_text'); reading 'lower_text' again would silently discard it.
df['removed_text'] = df['removed_text'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

Tokenization using nltk

In [None]:
# Split each cleaned review into a list of tokens with NLTK's word_tokenize.
df['tokens']=df['removed_text'].apply(word_tokenize)

Stemming

In [None]:
stemmer = PorterStemmer()


def _stem_tokens(words):
    """Return the Porter stem of every token in `words`, preserving order."""
    return list(map(stemmer.stem, words))


# One list of stems per review.
df['stemming'] = df['tokens'].apply(_stem_tokens)

Lemmatization

In [None]:
nlp = spacy.load('en_core_web_sm')

# Each token is lemmatised in isolation (the pipeline sees one token at a
# time), so the lemma depends only on the token string. Caching per distinct
# token therefore yields the same lemmas while invoking the spaCy pipeline
# once per vocabulary entry instead of once per token occurrence.
_lemma_cache = {}


def _lemma(token):
    """Return the lemma of the first spaCy token in `token` (memoised).

    Falls back to the input string if spaCy produces an empty document,
    so indexing [0] can never raise.
    """
    if token not in _lemma_cache:
        doc = nlp(token)
        _lemma_cache[token] = doc[0].lemma_ if len(doc) else token
    return _lemma_cache[token]


df['lametization'] = df['tokens'].apply(lambda x: [_lemma(y) for y in x])

Tokenization using Tensorflow

In [None]:
# Build the Keras word index from the cleaned reviews, then encode each
# review as a list of integer word ids.
tokenizer = Tokenizer()
cleaned_reviews = df['removed_text']
tokenizer.fit_on_texts(cleaned_reviews)
df['sequences'] = tokenizer.texts_to_sequences(cleaned_reviews)

Joining tokens

In [None]:
# Re-assemble each review's stems into one space-separated string,
# the input format the scikit-learn vectorizers expect.
df['joined_stemming_tokens'] = df['stemming'].map(' '.join)

In [None]:
# Same re-assembly for the lemmatised tokens.
df['joined_lammetization_tokens'] = df['lametization'].map(' '.join)

Test/Train Split

In [None]:
# Features are the stemmed review strings; the target is the sentiment label.
X, y = df['joined_stemming_tokens'], df['label']

# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Feature Extraction


One Hot Encoding

In [None]:
# Binary (presence/absence) bag-of-words over the stemmed reviews, fitted on the full frame.
vectorizer1 = CountVectorizer(binary=True)
# NOTE(review): this rebinds `X`, which the split cell defined as the stemmed-text
# Series; re-running the split cell after this one would split this sparse matrix
# rather than the text. Consider a distinct name.
X = vectorizer1.fit_transform(df['joined_stemming_tokens'])
# Dense DataFrame view of the matrix, left disabled by the author.
##one_hot_df_s = pd.DataFrame(X.toarray(), columns=vectorizer1.get_feature_names_out())

In [None]:
# Binary (presence/absence) bag-of-words over the lemmatised reviews.
vectorizer2 = CountVectorizer(binary=True)
Y = vectorizer2.fit_transform(df['joined_lammetization_tokens'])
# Keep the frame sparse-backed: Y.toarray() would allocate a dense
# (n_reviews x vocabulary_size) array, which can exhaust memory on a review
# corpus with a large vocabulary. Column names and values are unchanged.
one_hot_df_l = pd.DataFrame.sparse.from_spmatrix(Y, columns=vectorizer2.get_feature_names_out())

Bag of Words

In [None]:
# Term-count bag-of-words over the stemmed reviews, fitted on the full frame.
vectorizer3 = CountVectorizer()
X_bow = vectorizer3.fit_transform(df['joined_stemming_tokens'])
# Dense DataFrame view of the matrix, left disabled by the author.
##bow_df_s = pd.DataFrame(X_bow.toarray(), columns=vectorizer3.get_feature_names_out())

In [None]:
# Term-count bag-of-words over the lemmatised reviews.
vectorizer4 = CountVectorizer()
Y_bow = vectorizer4.fit_transform(df['joined_lammetization_tokens'])
# Sparse-backed frame instead of Y_bow.toarray(): a dense
# (n_reviews x vocabulary_size) array can exhaust memory on a large corpus.
bow_df_l = pd.DataFrame.sparse.from_spmatrix(Y_bow, columns=vectorizer4.get_feature_names_out())

Tf-Idf

In [None]:
# TF-IDF features for modelling: the vectorizer is fitted on the training
# reviews only and then applied to the test reviews, so the test set does not
# influence the fitted vocabulary or weights.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
# TF-IDF weights over the lemmatised reviews (full frame).
tfidf_vectorizer1 = TfidfVectorizer()
Y_tfidf = tfidf_vectorizer1.fit_transform(df['joined_lammetization_tokens'])
# Sparse-backed frame instead of Y_tfidf.toarray(): a dense float
# (n_reviews x vocabulary_size) array can exhaust memory on a large corpus.
tfidf_df_l = pd.DataFrame.sparse.from_spmatrix(Y_tfidf, columns=tfidf_vectorizer1.get_feature_names_out())

Count Vectorizer

In [None]:
# Binary presence/absence features for the Bernoulli Naive Bayes model,
# fitted on the training reviews only and then applied to the test reviews.
# NOTE(review): this rebinds `vectorizer`, which previously held the fitted
# TF-IDF vectorizer; after this cell that object is no longer reachable by name.
vectorizer = CountVectorizer(binary=True)
X_train_bin = vectorizer.fit_transform(X_train)
X_test_bin = vectorizer.transform(X_test)

# Logistic Regression

In [None]:
# Fit logistic regression on the TF-IDF training features (fit returns the estimator).
model = LogisticRegression().fit(X_train_vec, y_train)

# Hard class predictions, plus the score in column 1 of predict_proba
# (presumably the probability of label 1 — confirm via model.classes_).
y_pred = model.predict(X_test_vec)
y_pred_prob = model.predict_proba(X_test_vec)[:, 1]

Accuracy Score

In [None]:
# Accuracy of the logistic-regression predictions on the held-out reviews.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy Score:", acc)

F1 Score

In [None]:
# F1 score of the logistic-regression predictions.
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print("F1 Score:", f1)

ROC-AUC Score

In [None]:
# ROC-AUC computed from the predicted scores, not the hard labels.
roc = roc_auc_score(y_true=y_test, y_score=y_pred_prob)
print("ROC-AUC Score:", roc)

Confusion Matrix

In [None]:
# Confusion matrix of true labels versus logistic-regression predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:\n", cm)

# Bernoulli Naive Bayes Classifier

In [None]:
# Bernoulli Naive Bayes on the binary presence/absence features.
bnb = BernoulliNB().fit(X_train_bin, y_train)

# Hard predictions, plus column 1 of predict_proba for ROC-AUC.
y_pred0 = bnb.predict(X_test_bin)
y_pred_prob0 = bnb.predict_proba(X_test_bin)[:, 1]

Accuracy Score

In [None]:
# Accuracy of the Bernoulli Naive Bayes predictions.
# Print `ac` (this model's score): `acc` still holds the logistic-regression
# accuracy from an earlier cell, so printing it would report the wrong model.
ac = accuracy_score(y_test, y_pred0)
print("Accuracy Score:", ac)

F1 Score

In [None]:
# F1 score of the Bernoulli Naive Bayes predictions.
f1 = f1_score(y_true=y_test, y_pred=y_pred0)
print("F1 Score:", f1)

ROC-AUC Score

In [None]:
# ROC-AUC from the Naive Bayes predicted scores.
roc = roc_auc_score(y_true=y_test, y_score=y_pred_prob0)
print("ROC-AUC Score:", roc)

Confusion Matrix

In [None]:
# Confusion matrix of true labels versus Naive Bayes predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred0)
print("Confusion Matrix:\n", cm)

# SVM (Support Vector Machine)

In [None]:
# Linear-kernel SVM on the TF-IDF features.
# probability=True is required for predict_proba: without it SVC exposes no
# probability estimates and the predict_proba call below raises. It enables an
# extra internal cross-validated calibration step, so fitting is slower;
# random_state pins that step so the probabilities are reproducible.
svm = SVC(kernel='linear', probability=True, random_state=42)
svm.fit(X_train_vec, y_train)
y_pred1 = svm.predict(X_test_vec)
# Column 1 of predict_proba is used as the score for ROC-AUC.
y_pred_prob1 = svm.predict_proba(X_test_vec)[:, 1]

Accuracy Score

In [None]:
# Accuracy of the SVM predictions on the held-out reviews.
acc = accuracy_score(y_true=y_test, y_pred=y_pred1)
print("Accuracy Score:", acc)

F1 Score

In [None]:
# F1 score of the SVM predictions.
f1 = f1_score(y_true=y_test, y_pred=y_pred1)
print("F1 Score:", f1)

ROC-AUC Score

In [None]:
# ROC-AUC from the SVM predicted scores.
roc = roc_auc_score(y_true=y_test, y_score=y_pred_prob1)
print("ROC-AUC Score:", roc)

Confusion Matrix

In [None]:
# Confusion matrix of true labels versus SVM predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred1)
print("Confusion Matrix:\n", cm)

# Random Forest

In [None]:
# 100-tree random forest on the TF-IDF features; the seed fixes the
# bootstrap/feature sampling so results are reproducible.
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
rf_model.fit(X_train_vec, y_train)

In [None]:
# Hard class predictions from the random forest on the test features.
y_pred7 = rf_model.predict(X_test_vec)
# Column 1 of predict_proba, used as the score for ROC-AUC
# (presumably the probability of label 1 — confirm via rf_model.classes_).
y_pred_prob7 = rf_model.predict_proba(X_test_vec)[:, 1]

Accuracy Score

In [None]:
# Accuracy of the random-forest predictions on the held-out reviews.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred7)
print("Accuracy Score: ", accuracy)

F1 Score

In [None]:
# F1 score of the random-forest predictions.
f1 = f1_score(y_true=y_test, y_pred=y_pred7)
print("F1 Score: ", f1)

ROC-AUC Score

In [None]:
# ROC-AUC from the random-forest predicted scores.
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_prob7)
print(f"ROC-AUC Score: {roc_auc}")

Confusion matrix

In [None]:
# Confusion matrix of true labels versus random-forest predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred7)
print("Confusion Matrix:")
print(cm)