In [17]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
import nltk
nltk.download('stopwords')

import nltk
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import pickle

# Load dataset
dataset = pd.read_csv("/content/drive/MyDrive/ML/all_kindle_review.csv")
df = dataset[['reviewText', 'rating']]
df.dropna(inplace=True)

# Preprocessing
df['rating'] = df['rating'].apply(lambda x: 0 if x < 3 else 1)
df['reviewText'] = df['reviewText'].str.lower()
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub('[^a-z A-Z 0-9-]+', '', x))
df['reviewText'] = df['reviewText'].apply(lambda x: " ".join([y for y in x.split() if y not in stopwords.words('english')]))
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://[\w_-]+(?:\.[\w_-]+)+[\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-]?', '', x))
df['reviewText'] = df['reviewText'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text())
df['reviewText'] = df['reviewText'].apply(lambda x: " ".join(x.split()))

lemmatizer = WordNetLemmatizer()
df['reviewText'] = df['reviewText'].apply(lambda x: " ".join([lemmatizer.lemmatize(word) for word in x.split()]))

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(df['reviewText'], df['rating'], test_size=0.20, random_state=42,stratify=df['rating'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['rating'] = df['rating'].apply(lambda x: 0 if x < 3 else 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['reviewText'] = df['reviewText'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See

In [25]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

In [19]:
# Pipeline with BoW
pipeline_bow = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB())
])

# Cross-validation
cv = StratifiedKFold(n_splits=5)
cv_scores = cross_val_score(pipeline_bow, X_train, y_train, cv=cv, scoring='accuracy')
print(f"Cross-validation Accuracy: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")

# Fit and save the pipeline
pipeline_bow.fit(X_train, y_train)
with open('/content/drive/MyDrive/ML/pipeline_bow.pkl', 'wb') as file:
    pickle.dump(pipeline_bow, file)

Cross-validation Accuracy: 0.83 ± 0.01


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Pipeline with TF-IDF
pipeline_tfidf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MultinomialNB())
])

cv = StratifiedKFold(n_splits=5)
cv_scores = cross_val_score(pipeline_tfidf, X_train, y_train, cv=cv, scoring='accuracy')
print(f"Cross-validation Accuracy: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")

pipeline_tfidf.fit(X_train, y_train)

# Save the pipeline
with open('/content/drive/MyDrive/ML/pipeline_tfidf.pkl', 'wb') as file:
    pickle.dump(pipeline_tfidf, file)


Cross-validation Accuracy: 0.69 ± 0.00


In [20]:
import pickle
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report

# Load the BoW pipeline
with open('/content/drive/MyDrive/ML/pipeline_bow.pkl', 'rb') as file:
    pipeline_bow = pickle.load(file)

# Predict on the test set
y_pred_bow = pipeline_bow.predict(X_test)

# Calculate accuracy
accuracy_bow = accuracy_score(y_test, y_pred_bow)
print(f"BoW Model Accuracy: {accuracy_bow:.2f}")

# Confusion matrix
conf_matrix_bow = confusion_matrix(y_test, y_pred_bow)
print("BoW Confusion Matrix:")
print(conf_matrix_bow)

# Classification report
report_bow = classification_report(y_test, y_pred_bow)
print("BoW Classification Report:")
print(report_bow)

BoW Model Accuracy: 0.85
BoW Confusion Matrix:
[[ 577  223]
 [ 142 1458]]
BoW Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.72      0.76       800
           1       0.87      0.91      0.89      1600

    accuracy                           0.85      2400
   macro avg       0.83      0.82      0.82      2400
weighted avg       0.85      0.85      0.85      2400



In [22]:
# Load the TF-IDF pipeline
with open('/content/drive/MyDrive/ML/pipeline_tfidf.pkl', 'rb') as file:
    pipeline_tfidf = pickle.load(file)

# Predict on the test set
y_pred_tfidf = pipeline_tfidf.predict(X_test)

# Calculate accuracy
accuracy_tfidf = accuracy_score(y_test, y_pred_tfidf)
print(f"TF-IDF Model Accuracy: {accuracy_tfidf:.2f}")

# Confusion matrix
conf_matrix_tfidf = confusion_matrix(y_test, y_pred_tfidf)
print("TF-IDF Confusion Matrix:")
print(conf_matrix_tfidf)

# Classification report
report_tfidf = classification_report(y_test, y_pred_tfidf)
print("TF-IDF Classification Report:")
print(report_tfidf)

TF-IDF Model Accuracy: 0.70
TF-IDF Confusion Matrix:
[[  71  729]
 [   2 1598]]
TF-IDF Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.09      0.16       800
           1       0.69      1.00      0.81      1600

    accuracy                           0.70      2400
   macro avg       0.83      0.54      0.49      2400
weighted avg       0.78      0.70      0.60      2400



In [23]:
import pandas as pd
import re
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from gensim.models import Word2Vec

# Load dataset
dataset = pd.read_csv("/content/drive/MyDrive/ML/all_kindle_review.csv")
df = dataset[['reviewText', 'rating']]
df.dropna(inplace=True)

# Preprocessing
df['rating'] = df['rating'].apply(lambda x: 0 if x < 3 else 1)
df['reviewText'] = df['reviewText'].str.lower()
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub('[^a-z A-Z 0-9-]+', '', x))
df['reviewText'] = df['reviewText'].apply(lambda x: " ".join([y for y in x.split() if y not in stopwords.words('english')]))
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://[\w_-]+(?:\.[\w_-]+)+[\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-]?', '', x))
df['reviewText'] = df['reviewText'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text())
df['reviewText'] = df['reviewText'].apply(lambda x: " ".join(x.split()))

lemmatizer = WordNetLemmatizer()
df['reviewText'] = df['reviewText'].apply(lambda x: " ".join([lemmatizer.lemmatize(word) for word in x.split()]))

# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(df['reviewText'], df['rating'], test_size=0.20, random_state=42, stratify=df['rating'])

# Tokenize the text for Word2Vec
X_train_tokens = [review.split() for review in X_train]
X_test_tokens = [review.split() for review in X_test]

# Train Word2Vec model
w2v_model = Word2Vec(sentences=X_train_tokens, vector_size=100, window=5, min_count=1, workers=4)

# Function to compute average Word2Vec for each review
def compute_avg_w2v(tokens, model):
    avg_vector = []
    for review in tokens:
        vectors = [model.wv[word] for word in review if word in model.wv]
        if vectors:
            avg_vector.append(sum(vectors) / len(vectors))
        else:
            avg_vector.append([0] * model.vector_size)
    return avg_vector

# Compute AvgWord2Vec for train and test data
X_train_avg_w2v = compute_avg_w2v(X_train_tokens, w2v_model)
X_test_avg_w2v = compute_avg_w2v(X_test_tokens, w2v_model)

# Train a Logistic Regression classifier
classifier = LogisticRegression()
classifier.fit(X_train_avg_w2v, y_train)

# Save the classifier and Word2Vec model
with open('/content/drive/MyDrive/ML/classifier_avg_w2v.pkl', 'wb') as file:
    pickle.dump(classifier, file)
with open('/content/drive/MyDrive/ML/w2v_model.pkl', 'wb') as file:
    pickle.dump(w2v_model, file)



In [24]:
# Load the classifier and Word2Vec model
with open('/content/drive/MyDrive/ML/classifier_avg_w2v.pkl', 'rb') as file:
    classifier = pickle.load(file)
with open('/content/drive/MyDrive/ML/w2v_model.pkl', 'rb') as file:
    w2v_model = pickle.load(file)

# Predict on the test set
y_pred_avg_w2v = classifier.predict(X_test_avg_w2v)

# Calculate accuracy
accuracy_avg_w2v = accuracy_score(y_test, y_pred_avg_w2v)
print(f"AvgWord2Vec Model Accuracy: {accuracy_avg_w2v:.2f}")

# Confusion matrix
conf_matrix_avg_w2v = confusion_matrix(y_test, y_pred_avg_w2v)
print("AvgWord2Vec Confusion Matrix:")
print(conf_matrix_avg_w2v)

# Classification report
report_avg_w2v = classification_report(y_test, y_pred_avg_w2v)
print("AvgWord2Vec Classification Report:")
print(report_avg_w2v)

AvgWord2Vec Model Accuracy: 0.75
AvgWord2Vec Confusion Matrix:
[[ 372  428]
 [ 179 1421]]
AvgWord2Vec Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.47      0.55       800
           1       0.77      0.89      0.82      1600

    accuracy                           0.75      2400
   macro avg       0.72      0.68      0.69      2400
weighted avg       0.74      0.75      0.73      2400




- **BOW : 85 %**
- **TF-IDF : 70%**
- **AvgWord2Vec : 75%**
