In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In this notebook, I'm using Natural Language Processing (NLP) strategies to analyze Yelp reviews data.

In [None]:
import numpy as np # linear algebra
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# load dataset
yelp_df = pd.read_csv("/kaggle/input/yelp-reviews/yelp.csv")

In [None]:
# examine the data
yelp_df.head()

In [None]:
# print the summary statistics of the numerical columns
yelp_df.describe()

In [None]:
# Verify the datatypes and check for any nulls
yelp_df.info()

In [None]:
# print the example text review
yelp_df.text[0]

In [None]:
# calculate the length (number of characters) of each review
# The vectorised .str.len() avoids one Python-level call per row and yields NaN
# instead of raising TypeError if a review text is missing.
yelp_df['length'] = yelp_df.text.str.len()

In [None]:
# plot the histogram for the length values (length = number of characters)
# A title and an x-axis label let the figure stand on its own; the trailing
# semicolon suppresses the text repr of the returned object.
ax = yelp_df.length.plot(bins=20, kind='hist', title='Distribution of review length')
ax.set_xlabel('Review length (characters)');

In [None]:
yelp_df.length.describe()

In [None]:
# print the longest review
# The maximum is looked up instead of hard-coded, so the cell keeps working
# when the dataset (and therefore the maximum length) changes.
yelp_df[yelp_df.length == yelp_df.length.max()].text.iloc[0]

In [None]:
# print the shortest review
# The minimum is looked up instead of hard-coded, so the cell keeps working
# when the dataset (and therefore the minimum length) changes.
yelp_df[yelp_df.length == yelp_df.length.min()].text.iloc[0]

In [None]:
# plot the number of reviews per star rating
# The column is named through x=/data=: passing the data vector as the first
# positional argument is deprecated in seaborn and no longer maps to x in
# recent releases.
sns.countplot(x='stars', data=yelp_df, palette='GnBu_d')

Most reviews are rated 4 or 5 stars.

In [None]:
# plot one histogram of review length per star rating
# (col='stars' gives each rating its own panel; col_wrap=3 starts a new row
# after every three panels)
g =sns.FacetGrid(data =yelp_df,col='stars',col_wrap=3)
g.map(plt.hist,'length',bins=20,color='orange')

By examining the histograms we can conclude that people who give 1-3 stars mostly write reviews shorter than 400 characters (`length` counts characters, not words).

In [None]:
# preview the class balance once the neutral 3-star reviews are left out
# The column is named through x=/data=: passing the data vector as the first
# positional argument is deprecated in seaborn and no longer maps to x in
# recent releases.
sns.countplot(x='stars', data=yelp_df[yelp_df.stars != 3])

We will map stars 4 & 5 to 1 (happy customer) and stars 1 & 2 to 0 (unhappy customer), and exclude 3-star reviews because they are neutral.

In [None]:
# exclude all 3-star (neutral) reviews
# An explicit copy is taken so that adding columns to the filtered frame later
# does not trigger pandas' SettingWithCopyWarning.
yelp_df = yelp_df[yelp_df.stars != 3].copy()

In [None]:
# create function to calculate the target value
def create_target(stars):
    """Map a star rating to the binary sentiment label.

    Returns 0 (unhappy customer) when ``stars`` is below 3 and 1 (happy
    customer) otherwise.
    """
    return 0 if stars < 3 else 1

In [None]:
# store the value into the target column
# assign() returns a new frame, so the label is never written into a filtered
# view of the original data (which would raise SettingWithCopyWarning), and the
# cell can be re-run safely.
yelp_df = yelp_df.assign(target=yelp_df.stars.apply(create_target))

In [None]:
# examine the values
yelp_df[['stars','target']]

In [None]:
# inspect the punctuation characters that text_cleaning strips from the reviews
import string
string.punctuation

In [None]:
# inspect the English stop words that text_cleaning filters out of the reviews
from nltk.corpus import stopwords
stopwords.words('english')

In [None]:
# defining the function to remove punctuation & stop words
# Both lookups are built once, up front: calling stopwords.words('english')
# inside the comprehension would rebuild the list, and scan it linearly, for
# every single word of every review.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def text_cleaning(text):
    """Tokenise a review, dropping punctuation and English stop words.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        The whitespace-separated words of ``text`` with every character in
        ``string.punctuation`` removed and every word whose lower-cased form
        is an NLTK English stop word filtered out. The original casing of the
        remaining words is kept.
    """
    without_punctuation = text.translate(_PUNCTUATION_TABLE)
    return [
        word
        for word in without_punctuation.split()
        if word.lower() not in _ENGLISH_STOPWORDS
    ]

In [None]:
# count vectorization (2D matrix containing word frequency)
from sklearn.feature_extraction.text import CountVectorizer

# The instance gets its own name: rebinding `CountVectorizer` to an instance
# shadows the class, so any later `CountVectorizer(...)` call would fail.
count_vectorizer = CountVectorizer(analyzer=text_cleaning)
yelp_vectorizer = count_vectorizer.fit_transform(yelp_df.text)

In [None]:
yelp_vectorizer.shape

In [None]:
# feature matrix and label vector
# y is kept 1-D: scikit-learn estimators expect labels of shape (n_samples,),
# and a column vector makes fit() emit a DataConversionWarning.
X = yelp_vectorizer
y = yelp_df.target.values
print(X.shape)
print(y.shape)

In [None]:
# split the data into train and test
from sklearn.model_selection import train_test_split

# A fixed seed makes the split (and every metric computed from it) reproducible;
# stratifying keeps the same happy/unhappy ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# train model
# A multinomial naive Bayes classifier is fitted on the word-count features of
# the training split.
from sklearn.naive_bayes import MultinomialNB
NB_classifier = MultinomialNB()
NB_classifier.fit(X_train,y_train)

In [None]:
# print the confusion matrix for the training data
from sklearn.metrics import classification_report,confusion_matrix
predict_train = NB_classifier.predict(X_train)
cm = confusion_matrix(y_train, predict_train)
# fmt='d' prints the cell counts as whole numbers; seaborn's default format
# abbreviates large counts in scientific notation (e.g. 5.2e+03).
sns.heatmap(cm, annot=True, fmt='d', cmap="Blues")
plt.title("Confusion matrix - training data")
plt.ylabel("Actual")
plt.xlabel("Predicted")
print(cm)


In [None]:
print(classification_report(y_train,predict_train))

In [None]:
# print the confusion matrix for the test data
predict_test = NB_classifier.predict(X_test)
cm = confusion_matrix(y_test, predict_test)
# fmt='d' prints the cell counts as whole numbers; seaborn's default format
# abbreviates large counts in scientific notation (e.g. 1.3e+03).
sns.heatmap(cm, annot=True, fmt='d', cmap="Blues")
plt.title("Confusion matrix - test data")
plt.ylabel("Actual")
plt.xlabel("Predicted")
print(cm)


In [None]:
print(classification_report(y_test,predict_test))