### Import Libraries

In [None]:
import os
import json
from tqdm import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from langdetect import detect
import regex as re
import wordcloud
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
import nltk
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize
import re


import spacy
myspacy = spacy.load('en_core_web_sm')
nltk.download('stopwords')
nltk.download('punkt')

import warnings
warnings.filterwarnings('ignore')

### Import Dataset

#### Converting JSON to DataFrame

In [None]:
# Column templates for the three Yelp JSON-lines files: every key is a field
# to extract, and every value is a fresh list that will collect that field.
reviews = {column: [] for column in (
    'review_id', 'business_id', 'user_id',
    'stars', 'text', 'date',
    'useful', 'funny', 'cool',
)}

business = {column: [] for column in (
    'business_id', 'name', 'address',
    'city', 'state', 'postal_code',
    'stars', 'review_count', 'is_open', 'categories',
)}

users = {column: [] for column in (
    'user_id', 'name', 'review_count',
    'yelping_since', 'useful', 'funny',
    'cool', 'elite', 'fans', 'friends',
)}

def convert2df(filename,df_structure):
    """Load a JSON-lines file into a column-oriented dict of lists.

    Each non-blank line of `filename` is parsed as one JSON object. For every
    key already present in `df_structure`, the matching value of the record
    is appended to that key's list.

    Args:
        filename: Path to a file holding one JSON object per line.
        df_structure: Dict mapping column names to lists. It is filled in
            place, so calling this twice with the same dict accumulates rows.

    Returns:
        The same `df_structure` object, now populated.

    Raises:
        KeyError: If a record lacks one of the keys of `df_structure`.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(filename, encoding="utf-8") as f:
        for line in tqdm(f):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            row = json.loads(line)
            for key, column in df_structure.items():
                column.append(row[key])
    return df_structure

In [None]:
# Parse the business records into columns, then preview the resulting frame.
business_dict = convert2df("yelp_academic_dataset_business.json", business)
business_df = pd.DataFrame(business_dict)
business_df.head()

In [None]:
# Parse the review records into columns, then preview the resulting frame.
reviews_dict = convert2df("yelp_academic_dataset_reviews.json", reviews)
reviews_df = pd.DataFrame(reviews_dict)
reviews_df.head()

In [None]:
# Parse the user records into columns, then preview the resulting frame.
users_dict = convert2df("yelp_academic_dataset_users.json", users)
users_df = pd.DataFrame(users_dict)
users_df.head()

### Data Filtering

#### Filter 1 (tip: save the dataframes to .csv after each step so that an out-of-memory crash does not force you to repeat earlier steps)
- Keep only restaurant businesses that are open and have at least 50 reviews.
- Retain the reviews of the filtered restaurants.
- Retain the users who have written more than 50 reviews.

In [None]:
# Keep only open restaurants that have at least 50 reviews.
# `na=False` counts businesses without a category string as non-restaurants.
is_restaurant = business_df['categories'].str.contains('Restaurants', na=False)
is_open = business_df["is_open"] == 1
has_enough_reviews = business_df["review_count"] >= 50
business_df = business_df[is_restaurant & is_open & has_enough_reviews]

In [None]:
# Restrict the reviews to those written about the retained restaurants.
restaurant_ids = business_df["business_id"].unique()
reviews_df = reviews_df.loc[reviews_df["business_id"].isin(restaurant_ids)]

In [None]:
# Keep users with more than 50 reviews, then show the remaining size.
users_df = users_df.loc[users_df["review_count"] > 50]
users_df.shape

#### Filter 2 (tip: save the dataframes to .csv after each step so that an out-of-memory crash does not force you to repeat earlier steps)
- Group the restaurants by state and city and sum their review counts to find a city–state pair with roughly 100,000 reviews
- Retain the reviews of filtered restaurants
- Retain the users whose reviews are retained in reviews_df

In [None]:
# Total review count per (state, city), largest first.
(
    business_df
    .groupby(['state', 'city'])[['review_count']]
    .sum()
    .sort_values(by='review_count', ascending=False)
)

In [None]:
# Narrow the businesses down to Cambridge, MA.
in_state = business_df['state'] == 'MA'
in_city = business_df['city'] == 'Cambridge'
business_df = business_df[in_state & in_city]

In [None]:
# Re-apply the restaurant filter to the reviews after narrowing the businesses.
restaurant_ids = business_df["business_id"].unique()
reviews_df = reviews_df.loc[reviews_df["business_id"].isin(restaurant_ids)]

In [None]:
# One row per user who wrote at least one of the retained reviews.
# (`reviews_df` already holds only the filtered restaurants' reviews.)
unique_users = pd.DataFrame(reviews_df["user_id"].unique(),columns=['user_id'])

In [None]:
# Keep only the users whose ids appear among the retained reviewers.
filterd_user_ids = unique_users['user_id'].tolist()
users_df = users_df.loc[users_df["user_id"].isin(filterd_user_ids)]

#### Saving the dataframes into .csv to avoid repeating the steps again

In [None]:
# Write each filtered frame to its own CSV file, without the index column.
for frame, csv_path in (
    (business_df, 'filtered_business.csv'),
    (reviews_df, 'filtered_reviews.csv'),
    (users_df, 'filtered_users.csv'),
):
    frame.to_csv(csv_path, index=False)

In [None]:
# Reload the saved snapshots, each into its own dataframe.
business_df = pd.read_csv('filtered_business.csv')
reviews_df = pd.read_csv('filtered_reviews.csv')
users_df = pd.read_csv('filtered_users.csv')

### Data Cleaning

#### Remove non-English reviews from the dataset (and update reviews_df and users_df)

In [None]:
from langdetect import DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# langdetect is randomised; a fixed seed makes repeated runs agree.
DetectorFactory.seed = 0


def _detect_language(text):
    """Return the detected language code of `text`, or 'unknown' on failure."""
    try:
        return detect(text)
    except (LangDetectException, TypeError):
        # Raised for text with no usable features (e.g. only digits or
        # emoji) and for non-string values such as NaN.
        return 'unknown'


non_eng_review_ids = []
# Compute the language separately so that `reviews_df` itself is not modified.
review_languages = reviews_df['text'].apply(_detect_language)
# `reviews_df_1` holds only the non-English reviews, with their detected language.
reviews_df_1 = reviews_df[review_languages != 'en'].assign(lang=review_languages)

In [None]:
# Collect the ids of the non-English reviews into one flat list.
non_eng_review_ids.append(reviews_df_1.review_id)
non_enlish_review_id_alone = []
for id_series in non_eng_review_ids:
    non_enlish_review_id_alone.extend(id_series)

total_non_english_review = len(non_enlish_review_id_alone)
print("total non english reviews :",total_non_english_review)

# Save the ids; the index is written too, so the ids end up in the second CSV column.
df = pd.DataFrame(non_enlish_review_id_alone)
df.to_csv('non_eng_review.csv')

In [None]:
# Reload the saved ids as a 2-D array whose rows are [saved index, review_id].
df = pd.read_csv('non_eng_review.csv')
non_enlish_review_id_alone = df.to_numpy()

In [None]:
# Drop every review whose id was flagged as non-English.
# Each row of `non_enlish_review_id_alone` is [saved index, review_id], because
# the ids were written to CSV together with their index column.
non_english_ids = {row[1] for row in non_enlish_review_id_alone}

# One vectorised membership test replaces a full scan of the frame per id, and
# an id that is no longer present in `reviews_df` is simply ignored.
is_non_english = reviews_df['review_id'].isin(non_english_ids)
index_reviews_to_be_dropped = reviews_df.index[is_non_english].tolist()
reviews_df = reviews_df[~is_non_english]

In [None]:
# Rebuild the reviewer list from the remaining reviews and re-filter the users.
remaining_reviewers = reviews_df["user_id"].unique()
unique_users = pd.DataFrame(remaining_reviewers, columns=['user_id'])
filterd_user_ids = unique_users['user_id'].tolist()
users_df = users_df.loc[users_df["user_id"].isin(filterd_user_ids)]

### Exploratory Data Analysis

In [None]:
def basic_data_report(df_report,drop_duplicate):
    """Print a quick profile of a dataframe and optionally de-duplicate it.

    The report shows the head, shape, dtypes, null counts per column, a random
    sample, the number of duplicated rows, `describe()` output, and the value
    counts of every object-typed column.

    Args:
        df_report: The pandas DataFrame to profile. It is not modified.
        drop_duplicate: When truthy, duplicate rows are removed from the
            returned frame and the remaining row count is printed.

    Returns:
        The de-duplicated frame when `drop_duplicate` is truthy, otherwise
        `df_report` itself. For backward compatibility the de-duplicated frame
        is also stored in the module-level name `df`.
    """
    global df
    BOLD = '\033[1m'
    END = '\033[0m'
    categorical_variables = df_report.select_dtypes(include = 'object')
    print(BOLD + "head"+ END, df_report.head(),"\n")
    print(BOLD + "rows,columns :"+END,df_report.shape,"\n")
    print(BOLD +"Column & Data Type \n"+END,df_report.dtypes,"\n")
    print(BOLD +"Columns and Corresponding nullValues\n"+END,df_report.isnull().sum(),"\n")
    # `sample` raises when asked for more rows than the frame holds.
    sample_size = min(5, len(df_report))
    print(BOLD + "Sample \n"+END,df_report.sample(sample_size),"\n")
    print(BOLD + "Number of Duplicated rows \n"+END,df_report.duplicated().sum(),"\n")
    print(BOLD + "Describing the numeric features \n"+END,df_report.describe(),"\n")
    print(BOLD+"Describing Categorical variables"+END)
    for variable in categorical_variables.columns:
        print(BOLD+variable+END,"\n",categorical_variables[variable].value_counts(),"\n")
    if not drop_duplicate:
        return df_report
    df = df_report.drop_duplicates()
    print("rows after removing duplicates",df.shape[0])
    return df

#### business_df

In [None]:
# Profile the business frame, asking for duplicate rows to be removed.
basic_data_report(business_df, drop_duplicate=True)

**Insights**
1.   We are only considering the city of Cambridge, which has 331 businesses
2.   The most popular restaurant chains are Clovers food lab and Darvins
3.   Pizza and Mexican are the most common categories
4.   We don't have any null values in the business dataframe.

In [None]:
# Number of restaurants per star rating, ordered by rating.
x = business_df['stars'].value_counts().sort_index()

plt.figure(figsize=(8,4))
# seaborn >= 0.12 accepts x and y only as keyword arguments.
ax = sns.barplot(x=x.index, y=x.values, alpha=0.8)
plt.title('Star rating Distribution')
plt.ylabel('No of restaurant',fontsize = 12)
plt.xlabel('Star ratings',fontsize = 12)

**Insights**
1. 4 is the most common rating, and few restaurants have a 5-star rating. Let's learn more about the restaurants with the top ratings and review counts.

In [None]:
## Restaurants ordered by star rating, then by review count (both descending)

rating_columns = ['name', 'stars', 'review_count', 'city', 'state']
restaurant_with_rating = business_df[rating_columns]
restaurant_with_rating.sort_values(by=['stars', 'review_count'], ascending=[False, False])

In [None]:
## Count the Cambridge restaurants with at least 4 stars and at least 100 reviews

in_cambridge = restaurant_with_rating.city == 'Cambridge'
well_rated = restaurant_with_rating.stars >= 4
well_reviewed = restaurant_with_rating.review_count >= 100
Cambridge_top_df = restaurant_with_rating[in_cambridge & well_rated & well_reviewed]
print("Top restaurants in current city ",len(Cambridge_top_df))

In [None]:
## Swarm plot of review count against star rating
import seaborn as sns
swarm_axes = sns.swarmplot(x="stars", y="review_count", data=business_df)
swarm_axes.set_title("Relationship of stars and number of reviews")

In [None]:
## Analyzing the Tags given for Restaurants

# Collect every distinct category tag in first-seen order. Tags are stripped so
# that " Pizza" and "Pizza" are treated as the same category; the set keeps the
# "already seen" check cheap.
restaurant_categories = []
seen_categories = set()
for each_business in business_df.categories.dropna():
    for raw_category in each_business.split(','):
        unique_category = raw_category.strip()
        if unique_category and unique_category not in seen_categories:
            seen_categories.add(unique_category)
            restaurant_categories.append(unique_category)
print(restaurant_categories)

In [None]:
## Plot the number of restaurants for different categories

Category=restaurant_categories
cat=[]
for i in business_df['categories'].values:
    # Label each business with the first known category that occurs in its
    # tag string (a substring match); fall back to 'Others' when none does.
    cat.append(next((j for j in Category if j in i), 'Others'))
business_df['Simple_Category']=cat
category_number=business_df.groupby('Simple_Category').count()
plot_x=category_number.index.values
plot_y=category_number['business_id'].values

#chart
plt.figure(figsize=(30,4))
# seaborn >= 0.12 accepts x and y only as keyword arguments.
ax = sns.barplot(x=plot_x, y=plot_y, alpha=0.8)
plt.title("Number of restaurants for different categories",fontsize=12)
plt.ylabel('Number', fontsize=12)
plt.xlabel('Restaurant Category', fontsize=12)

#adding the text labels: write each bar's count just above the bar
rects = ax.patches
labels = plot_y
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')

plt.show()

In [None]:
## Word Cloud Generation for Tags of Restaurants

def wc(data,bgcolor,title):
    """Draw a word cloud of `data` on a new matplotlib figure.

    Args:
        data: Iterable of strings; they are joined with spaces before the
            cloud is generated.
        bgcolor: Background colour passed to `wordcloud.WordCloud`.
        title: Title shown above the cloud.
    """
    plt.figure(figsize = (50,50))
    cloud = wordcloud.WordCloud(background_color = bgcolor, max_words = 100,  max_font_size = 25)
    cloud.generate(' '.join(data))
    plt.imshow(cloud)
    # `title` used to be accepted but never shown.
    plt.title(title)
    plt.axis('off')
wc(restaurant_categories,'white','most common categories')

#### reviews_df

In [None]:
# Profile the review frame, asking for duplicate rows to be removed.
basic_data_report(reviews_df, drop_duplicate=True)

In [None]:
# Headline counts: reviews with at least one vote of each kind, and reviews
# split into negative (3 stars or fewer) and positive (more than 3 stars).
total_reviews = len(reviews_df)
useful_reviews = len(reviews_df[reviews_df["useful"].gt(0)])
funny_reviews = len(reviews_df[reviews_df["funny"].gt(0)])
cool_reviews = len(reviews_df[reviews_df["cool"].gt(0)])
negative_reviws = len(reviews_df[reviews_df["stars"].le(3)])
positive_reviews = len(reviews_df[reviews_df["stars"].gt(3)])

print("Total reviews: {}".format(total_reviews))
print("Useful reviews: {}".format(useful_reviews))
print("Funny reviews: {}".format(funny_reviews))
print("Cool reviews: {}".format(cool_reviews))
print("Total negative reviews: {}".format(negative_reviws))
print("Total positive reviews: {}".format(positive_reviews))

In [None]:
# Concatenate all review texts, lowercased, into one string with no newlines.
bag_of_words = reviews_df.text.str.lower().str.cat(sep = ' ')
bag_of_words = bag_of_words.replace('\n',' ')

# Stop words: the `stop_words` package list, the NLTK list, and custom extras.
stop_words = list(get_stop_words('en'))
nltk_words = list(stopwords.words('english'))
stop_words.extend(nltk_words)
# Every entry must be lowercase, because the text above is lowercased before
# it is compared against this list.
extra_words = ['ve', 'like', 'got', 'cleveland', 'just', 'don', 'really', 'said', 'told', 'ok',
               'came', 'went', 'did', 'didn', 'good','also','even','restaurant','ordered','order',
               'much','well','back','menu','little','still','think','take','looking','look','although',
               'enough','probably','give','everything','want','usually','always']
stop_words.extend(extra_words)

In [None]:
word_tokens = word_tokenize(bag_of_words)

# Test membership against a set: one hash lookup per token instead of a scan
# of the whole stop-word list.
stop_word_set = set(stop_words)
filtered_sentence = [w for w in word_tokens if w not in stop_word_set]

# Keep only tokens longer than three characters
without_single_chr = [word for word in filtered_sentence if len(word) > 3]

# Remove numbers
cleaned_data_title = [word for word in without_single_chr if not word.isnumeric()]

In [None]:
# Frequency table of the 100 most common cleaned words.
top_N = 100
word_dist = nltk.FreqDist(cleaned_data_title)
most_common_words = word_dist.most_common(top_N)
rslt = pd.DataFrame(most_common_words, columns=['Word', 'Frequency'])

# Bar chart of the 20 most frequent words.
plt.figure(figsize=(15, 5))
sns.set_style("whitegrid")
ax = sns.barplot(x="Word", y="Frequency", data=rslt.iloc[:20])

In [None]:
## Word cloud of the most used words in the reviews
wc(cleaned_data_title, bgcolor='black', title='Most Used Words')

##### GetBigramsForRestaurant 

In [None]:
def GetBiGramForReview(txt1):
    """Print the ten bigrams with the highest summed TF-IDF weight in `txt1`.

    Args:
        txt1: Iterable of review texts (strings). Stop words are taken from
            the module-level `stop_words` list.

    Returns:
        None. The ranking is printed.

    Raises:
        ValueError: Propagated from scikit-learn when no bigram survives
            stop-word removal (e.g. `txt1` is empty).
    """
    # Imported here because the notebook's import cell does not provide it.
    from sklearn.feature_extraction.text import TfidfVectorizer

    # One TF-IDF fit supplies both the scores and the vocabulary, so the term
    # names are aligned with the score columns.
    vectorizer = TfidfVectorizer(stop_words = stop_words,ngram_range = (2, 2))
    X2 = vectorizer.fit_transform(txt1)

    # `get_feature_names` was removed in scikit-learn 1.2.
    if hasattr(vectorizer, 'get_feature_names_out'):
        features = vectorizer.get_feature_names_out()
    else:
        features = vectorizer.get_feature_names()

    # Getting top ranking features: sum each bigram's weight over all reviews
    sums = X2.sum(axis = 0)
    data1 = [(term, sums[0, col]) for col, term in enumerate(features)]
    ranking = pd.DataFrame(data1, columns = ['term', 'rank'])
    words = ranking.sort_values('rank', ascending = False)
    print ("\n\nWords : \n", words.head(10))

In [None]:
# Select the reviews of one restaurant by its business id, then split them
# into positive (more than 3 stars) and negative (3 stars or fewer) texts.
for_this_restaurant = reviews_df['business_id'] == 'bul_5Ahk_QYLUAJ4Od27jg'
reviews_particular_id = reviews_df[for_this_restaurant]
pos_reviews = reviews_particular_id.loc[reviews_particular_id['stars'].gt(3), 'text']
neg_reviews = reviews_particular_id.loc[reviews_particular_id['stars'].le(3), 'text']

print("Bigrams for Positive Reviews")
GetBiGramForReview(pos_reviews)
print("Bigrams for Negative Reviews")
GetBiGramForReview(neg_reviews)

#### users_df

In [None]:
# Profile the user frame, asking for duplicate rows to be removed.
basic_data_report(users_df, drop_duplicate=True)

In [None]:
# Count the users whose friends field is the literal string "None".
users_df['friends'].eq("None").sum()

In [None]:
# Drop the users whose friends field is the literal string 'None'.
users_df = users_df.loc[users_df['friends'].ne('None')]

In [None]:
## Count each user's friends: one entry per comma-separated id
no_of_friends_of_each_user = [
    len(friend.split(',')) for friend in users_df.friends
]
users_df['Number of friends'] = no_of_friends_of_each_user
users_df

##### Removing the friends from other cities


In [None]:
# Keep, for every user, only the friends that are themselves retained users.
# The friend list is rebuilt rather than edited in place: removing items from
# a list while iterating over it skips the element after each removal.
retained_user_ids = set(filterd_user_ids)

user_friends = []
number_of_friends = []

for friend in users_df.friends:
    # Ids are comma-separated and may carry surrounding whitespace.
    candidate_ids = (each_friend.strip() for each_friend in friend.split(','))
    eachuser = [friend_id for friend_id in candidate_ids if friend_id in retained_user_ids]
    user_friends.append(eachuser)
    number_of_friends.append(len(eachuser))

users_df['friends'] = user_friends
users_df['Number of friends'] = number_of_friends

In [None]:
# Friend-count distribution before removing the friends from other cities
fig, ax = plt.subplots()

# Label the axes; the y axis is logarithmic
ax.set_xlabel('NumberOffriends')
ax.set_ylabel('Frequency')
ax.set_yscale('log')

# Histogram of the original friend counts, in 50 bins
ax.hist(no_of_friends_of_each_user, bins=50)
plt.show()

In [None]:
# Friend-count distribution (considering only friends from cambridge)
fig, ax = plt.subplots()

# Label the axes; the y axis is logarithmic
ax.set_xlabel('NumberOffriends')
ax.set_ylabel('Frequency')
ax.set_yscale('log')

# Histogram of the filtered friend counts, in 50 bins
ax.hist(number_of_friends, bins=50)
plt.show()

### Save Final Snapshot of Dataframes

In [None]:
# Write each cleaned frame to its own CSV file, without the index column.
for frame, csv_path in (
    (business_df, 'cleaned_business.csv'),
    (reviews_df, 'cleaned_reviews.csv'),
    (users_df, 'cleaned_users.csv'),
):
    frame.to_csv(csv_path, index=False)