In [1]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer


In [2]:
# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# Initialize stop words and lemmatizer

lemmatizer = WordNetLemmatizer()
default_stop_words = set(stopwords.words('english'))

# Define set of negation words that we don't want to remove
negation_words = {"not", "no", "nor", "neither", "never", "n't"}
#Create a custom stopwords list exclding the negation words
custom_stopwords = default_stop_words - negation_words

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rasna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rasna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\rasna\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rasna\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# 
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

# Text cleaning function
def clean_text(text):
    text = text.lower()
    # removing all the number and special characters
    text = re.sub(r'[^a-z\s]', '', text)
    text = text.split()
    text = [word for word in text if word not in custom_stopwords]
    text = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in text]
    
    text = ' '.join(text)
    return text


# Improved text cleaning function with lemmatization and negation handling
def clean_text_advanced(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in words if word not in custom_stopwords]
    
    # Negation handling
    clean_words = []
    negate = False
    for word in lemmatized_words:
        if word in ["not", "no", "never", "n't"]:
            negate = True
            continue
        if negate:
            word = "not_" + word
            negate = False
        clean_words.append(word)
    
    return ' '.join(clean_words)

# testing clean_text
print(clean_text('This is not the 2nd comment! Better than before#'))

not nd comment well


In [4]:
# Load the data set

data = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t')

In [5]:
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [6]:
# Preproces the reviews
data['Review'] = data['Review'].apply(clean_text)
data.head()

Unnamed: 0,Review,Liked
0,wow love place,1
1,crust not good,0
2,not tasty texture nasty,0
3,stop late may bank holiday rick steve recommen...,1
4,selection menu great price,1


In [7]:
# Split the data
X = data['Review']
y = data['Liked']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [8]:
# TF_IDF vectorizer
tfidf = TfidfVectorizer(ngram_range=(1, 2))

In [9]:
# Model pipeline

In [10]:
pipeline = Pipeline([
    ('tfidf', tfidf),
    ('clf', RandomForestClassifier(random_state=42))
])

In [11]:
# Hyperparameter tuning
param_grid = {
    'clf__n_estimators': [150, 200, 250],
    'clf__max_depth': [None, 10, 20, 30],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4, 8]
}

In [12]:
grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


In [13]:
# Best model evaluation
best_model = grid_search.best_estimator_
print(best_model)
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print('Accuracy: ', accuracy)

Pipeline(steps=[('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
                ('clf',
                 RandomForestClassifier(min_samples_split=10, n_estimators=250,
                                        random_state=42))])
Accuracy:  0.78


## Gradient Boosting Classifier

In [14]:
from sklearn.ensemble import GradientBoostingClassifier

# Model pipeline with Gradient Boosting
pipeline_gb = Pipeline([
    ('tfidf', tfidf),
    ('clf', GradientBoostingClassifier(random_state=42))
])

# Hyperparameter tuning for Gradient Boosting
param_grid_gb = {
    'clf__n_estimators': [150, 200, 250],
    'clf__learning_rate': [0.01, 0.1, 0.2],
    'clf__max_depth': [3, 5, 7],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4]
}

grid_search_gb = GridSearchCV(pipeline_gb, param_grid_gb, cv=3, n_jobs=-1, verbose=2)
grid_search_gb.fit(X_train, y_train)

# Best model evaluation
best_model_gb = grid_search_gb.best_estimator_

y_pred_gb = best_model_gb.predict(X_test)
accuracy_gb = accuracy_score(y_test, y_pred_gb)

print("Accuracy with Gradient Boosting:", accuracy_gb)


Fitting 3 folds for each of 243 candidates, totalling 729 fits
Accuracy with Gradient Boosting: 0.76


In [15]:
print(best_model_gb)

Pipeline(steps=[('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
                ('clf',
                 GradientBoostingClassifier(learning_rate=0.2, max_depth=5,
                                            min_samples_split=5,
                                            n_estimators=150,
                                            random_state=42))])


## SVM model:

In [16]:
from sklearn.svm import SVC

# Model pipeline with SVM
pipeline_svm = Pipeline([
    ('tfidf', tfidf),
    ('clf', SVC(random_state=42))
])

# Hyperparameter tuning for SVM
param_grid_svm = {
    'clf__C': [0.1, 1, 10, 100],
    'clf__gamma': [1, 0.1, 0.01, 0.001],
    'clf__kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}

grid_search_svm = GridSearchCV(pipeline_svm, param_grid_svm, cv=3, n_jobs=-1, verbose=2)
grid_search_svm.fit(X_train, y_train)

# Best model evaluation
best_model_svm = grid_search_svm.best_estimator_
y_pred_svm = best_model_svm.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)

print(best_model_svm)
print("Accuracy with SVM:", accuracy_svm)


Fitting 3 folds for each of 64 candidates, totalling 192 fits
Pipeline(steps=[('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
                ('clf', SVC(C=1, gamma=1, kernel='linear', random_state=42))])
Accuracy with SVM: 0.8233333333333334


## Implementing Word Embeddings with SVM
### Load Pre-trained GloVe Embeddings

In [17]:
import numpy as np

# Load GloVe embeddings
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    return embeddings_index

glove_path = 'glove.6B.100d.txt'
embeddings_index = load_glove_embeddings(glove_path)


### Transform Text Data to Embedding Averages

In [18]:
# Function to convert text to average of word embeddings
def text_to_embedding(text, embeddings_index, dim=100):
    words = text.split()
    embedding = np.zeros(dim)
    count = 0
    for word in words:
        if word in embeddings_index:
            embedding += embeddings_index[word]
            count += 1
    if count != 0:
        embedding /= count
    return embedding

# Transform the reviews
X_train_embeddings = np.array([text_to_embedding(text, embeddings_index) for text in X_train])
X_test_embeddings = np.array([text_to_embedding(text, embeddings_index) for text in X_test])


### Train the SVM Model

In [19]:
# # Define the function to convert text to the average of word embeddings
# def text_to_embedding(text, embeddings_index, dim=100):
#     words = text.split()
#     embedding = np.zeros(dim)
#     count = 0
#     for word in words:
#         if word in embeddings_index:
#             embedding += embeddings_index[word]
#             count += 1
#     if count != 0:
#         embedding /= count
#     return embedding

# # Assuming you have X_train, X_test, y_train, y_test, and embeddings_index defined
# # Transform the reviews
# X_train_embeddings = np.array([text_to_embedding(text, embeddings_index) for text in X_train])
# X_test_embeddings = np.array([text_to_embedding(text, embeddings_index) for text in X_test])

# Define the parameter grid
param_grid_svm = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}

# Model with SVM
svm = SVC(random_state=42)

# Hyperparameter tuning with GridSearchCV
grid_search_svm = GridSearchCV(svm, param_grid_svm, cv=3, n_jobs=-1, verbose=2)
grid_search_svm.fit(X_train_embeddings, y_train)

# Best model evaluation
best_model_svm = grid_search_svm.best_estimator_
y_pred_svm = best_model_svm.predict(X_test_embeddings)
accuracy_svm = accuracy_score(y_test, y_pred_svm)

print("Best parameters found: ", grid_search_svm.best_params_)
print("Accuracy with tuned SVM:", accuracy_svm)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best parameters found:  {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
Accuracy with tuned SVM: 0.7666666666666667
