In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/rahul96rajan/sample_datasets/master/restaurant_review.tsv", sep="\t")
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [3]:
df.shape

(1000, 2)

In [4]:
df.isnull().sum()

Review    0
Liked     0
dtype: int64

In [5]:
label = df.Liked
features = df.Review

In [6]:
# Cleaning the text
def clean_text(data):
    """
    Summary
    -------
    Cleans the provided text by removing any non-alphabetic characters, removing
    stopwords and then lemmatizing each word.
     
    Params
    ------
    data (ndarray): list of unclean text reviews.
    
    Returns
    -------
    corpus (list): list of cleaned text reviews.
    """
    corpus = []
    ps = PorterStemmer()
    ws = WordNetLemmatizer()
    for msg in data:
        msg = re.sub('[^a-zA-Z]', ' ', msg)
        msg = msg.lower()
        msg = [ws.lemmatize(w) for w in nltk.word_tokenize(msg)
               if w not in stopwords.words('english')]
        msg = ' '.join(msg)
        corpus.append(msg)
    return corpus

In [7]:
preproc_data = clean_text(features.to_numpy())

In [8]:
tfv = TfidfVectorizer(max_features=2000)
vectorized_data = tfv.fit_transform(preproc_data)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(vectorized_data, label,
                                                    test_size=0.2, random_state=42)

In [10]:
X_train.shape, y_train.shape

((800, 1765), (800,))

In [11]:
mnb = MultinomialNB()
mnb.fit(X_train, y_train)

MultinomialNB()

In [12]:
y_pred = mnb.predict(X_test)

In [13]:
print("Accuracy Score :: ", accuracy_score(y_test, y_pred))

Accuracy Score ::  0.775
