# Text Classification

## problem definition
predict if a review will be liked or not

### import required packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

### load the data

In [3]:
# read the tsv (tab separated values) file
df = pd.read_csv('Restaurant_Reviews.tsv', sep='\t')
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,1
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


### EDA

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [6]:
df.isna().sum()

Review    0
Liked     0
dtype: int64

In [7]:
df['Liked'].value_counts()

Liked
1    501
0    499
Name: count, dtype: int64

In [8]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,1
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


### pre-processing

#### build vocabulary

In [10]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# define the words to be ignored
words_to_remove = [',', '.', '//', '?', '!', '@', '#', '$', '%', '^', '*', "'", '"']
words_to_remove.extend(stopwords.words('english'))

# create an empty vocabulary
vocabulary = []

for review in df['Review']:
    # get all the words from every review
    words = word_tokenize(review.lower())

    # add the required words
    vocabulary.extend([word.strip() for word in words if word.strip() not in words_to_remove])

vocabulary = list(set(vocabulary))
len(vocabulary)

1957

In [12]:
# get all reviews
reviews = [review.lower() for review in df['Review']]

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# create a vectorizer
vectorizer = CountVectorizer(vocabulary=vocabulary)

# convert the input reviews to numeric format
vectorizer.fit(reviews)

In [17]:
vectorizer.get_feature_names_out()

array(['wait', 'putting', 'kept', ..., 'onion', 'freezing', '45'],
      dtype=object)

In [20]:
# split the data into x and y
x = vectorizer.transform(reviews).toarray()
y = df['Liked']

In [28]:
x[0]

array([0, 0, 0, ..., 0, 0, 0])

In [26]:
# split the data into train and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7, random_state=123456)

### train the model

In [22]:
from sklearn.naive_bayes import GaussianNB

# create a model
model = GaussianNB()

# train the model
model.fit(x_train, y_train)

### model evaluation

In [23]:
y_pred = model.predict(x_test)
y_true = y_test

In [24]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"accuray = {accuracy}")
print(f"precision = {precision}")
print(f"recall = {recall}")
print(f"f1 = {f1}")

accuray = 0.6666666666666666
precision = 0.6313131313131313
recall = 0.8223684210526315
f1 = 0.7142857142857143


In [25]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.51      0.60       148
           1       0.63      0.82      0.71       152

    accuracy                           0.67       300
   macro avg       0.68      0.66      0.66       300
weighted avg       0.68      0.67      0.66       300

