In [1]:
# Importing the libraries
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

# Importing the dataset
# NOTE(review): absolute, machine-specific path -- the notebook only runs where this
# exact file location exists. Consider a path relative to a configurable data directory.
DATA_PATH = r"C:\Users\rejas\Prakash Senapati sir\Prakash Senapati lab\4.CUSTOMERS REVIEW DATASET\Restaurant_Reviews.tsv"
# Tab-separated file; quoting=3 is csv.QUOTE_NONE, so quote characters are kept as text.
dataset = pd.read_csv(DATA_PATH, delimiter='\t', quoting=3)

# Duplicate the data
# NOTE(review): every review now appears twice, so the random train/test split done
# later in this notebook puts copies of test rows into the training set. Accuracies
# measured that way are optimistic -- confirm the duplication is intentional.
dataset_2 = pd.concat([dataset, dataset], ignore_index=True)

# Cleaning the texts
# Build the stop-word set and the stemmer once; they do not change between reviews.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    # The stop-word corpus is not installed in this environment: fetch it and retry.
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []
# Walk every row that is actually present (instead of a fixed count of 2000) so the
# corpus always has exactly one entry per label in y.
for raw_review in dataset_2['Review']:
    # Keep letters only, lowercase, tokenize on whitespace.
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    review = review.lower()
    review = review.split()
    # Drop stop words and reduce the remaining tokens to their Porter stems.
    review = [ps.stem(word) for word in review if word not in stop_words]
    review = ' '.join(review)
    corpus.append(review)

# Labels
# Second column of the (duplicated) frame, as a NumPy array.
y = dataset_2.iloc[:, 1].values

In [2]:
# Bag-of-words features: fit the vocabulary on the cleaned corpus and count tokens,
# then convert the sparse result to a dense array.
cv = CountVectorizer()
count_matrix = cv.fit_transform(corpus)
X_cv = count_matrix.toarray()

# TF-IDF features: same corpus, weighted by the TfidfVectorizer defaults,
# likewise converted to a dense array.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(corpus)
X_tfidf = tfidf_matrix.toarray()

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Define a function to train and evaluate models

def evaluate_model(X, y, test_size=0.20, random_state=0):
    """Train several classifiers on one train/test split and report test accuracy.

    Parameters
    ----------
    X : array-like
        Feature matrix; passed to ``train_test_split`` and to each model's ``fit``.
    y : array-like
        Labels aligned row-for-row with ``X``.
    test_size : float, optional
        Fraction of rows held out for testing. Defaults to 0.20, the value that was
        previously hard-coded.
    random_state : int, optional
        Seed used for the split and for the tree-ensemble models so that repeated
        runs give the same numbers. Defaults to 0, the split seed that was
        previously hard-coded.

    Returns
    -------
    dict
        Maps each model's display name to its ``accuracy_score`` on the held-out rows.
    """
    models = {
        "Logistic Regression": LogisticRegression(),
        "Naive Bayes": MultinomialNB(),
        # Seeded: an unseeded forest gives a different accuracy on every run.
        "Random Forest": RandomForestClassifier(random_state=random_state),
        "XGBoost": XGBClassifier(random_state=random_state),
    }

    # The split does not depend on the model, so compute it once; every model is
    # then trained and scored on exactly the same rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    results = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        results[name] = accuracy_score(y_test, y_pred)
    return results
# Score every classifier on the CountVectorizer features and show the accuracies
results_cv = evaluate_model(X_cv, y)
print("Results with CountVectorizer:", results_cv, sep="\n")

# Same comparison on the TfidfVectorizer features
results_tfidf = evaluate_model(X_tfidf, y)
print("Results with TfidfVectorizer:", results_tfidf, sep="\n")

Results with CountVectorizer:
{'Logistic Regression': 0.925, 'Naive Bayes': 0.875, 'Random Forest': 0.9525, 'XGBoost': 0.8425}
Results with TfidfVectorizer:
{'Logistic Regression': 0.8825, 'Naive Bayes': 0.8875, 'Random Forest': 0.95, 'XGBoost': 0.845}
