In [301]:
import pandas as pd

# Machine Learning packages
from sklearn.feature_extraction.text import CountVectorizer # to create Bag of words
from sklearn.model_selection import train_test_split  # for splitting data
from sklearn.linear_model import LogisticRegression # to bulid classifier model
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder # to convert classes to number 
from sklearn.metrics import accuracy_score, classification_report, plot_confusion_matrix # to calculate accuracy and classification report

# NLP libraries
import re # for preprocessing text
import string # for preprocessing text
import nltk # for processing texts
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords # list of stop words
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/ssenapati/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ssenapati/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/ssenapati/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [302]:
# to change text style 
class style:
   BOLD = '\033[1m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

### **Load dataset**

In [303]:
df = pd.read_csv('data/coffee_shops_reviews.csv')

In [304]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,shop_name,rating,numbers_of_rating,price,shop_type,key_words,address,reviews
0,0,Elixir Bunn Coffee,4.3,218,0,cafe,none,"King Abdullah Rd, حي الحمراء، Riyadh 13215",['Amazing new branch for my favorite coffee ho...
1,1,Chamonix Cafe,4.0,563,3,coffee shop,"Late-night food, Breakfast, Outdoor seating","9259 Wadi Al Awsat, Al nbsp;2430, Riyadh",['Its really romantic and lovely place with go...
2,2,dr.CAFE COFFEE,4.1,1571,2,coffee shop,"Cosy, Casual, Vegetarian options","As Sulimaniyah, Khurais Road Abi Al Arab Stree...","[""Sandwich wasn't tasty and it was expensive. ..."


In [305]:
df.shape

(198, 9)

### **Explore data**

#### **One hot encoding after spliting key_words**

In [306]:
# find unique key words from dataset
keys = [i.split(', ') for i in df.key_words]
keys = list({x for l in keys for x in l if x != 'none'}) # also remove 'none' key word

In [307]:
# split key_words to many columns

dic_keys = dict((key, []) for key in keys) # add unique key words as key for dict

# itrate through key_words column 
for i in df.key_words:
  for k in keys:
    # store 1 if key existing in the current key_word row, otherwise, store 0
    if k in i.split(', '):
        dic_keys[k].append(1)
    else:
        dic_keys[k].append(0)

In [308]:
# join new columns in the data
df = df.join(pd.DataFrame(dic_keys))
df.head()

Unnamed: 0.1,Unnamed: 0,shop_name,rating,numbers_of_rating,price,shop_type,key_words,address,reviews,Cosy,Groups,Good for kids,Breakfast,Late-night food,Happy hour food,Casual,Vegetarian options,Outdoor seating,Cash only
0,0,Elixir Bunn Coffee,4.3,218,0,cafe,none,"King Abdullah Rd, حي الحمراء، Riyadh 13215",['Amazing new branch for my favorite coffee ho...,0,0,0,0,0,0,0,0,0,0
1,1,Chamonix Cafe,4.0,563,3,coffee shop,"Late-night food, Breakfast, Outdoor seating","9259 Wadi Al Awsat, Al nbsp;2430, Riyadh",['Its really romantic and lovely place with go...,0,0,0,1,1,0,0,0,1,0
2,2,dr.CAFE COFFEE,4.1,1571,2,coffee shop,"Cosy, Casual, Vegetarian options","As Sulimaniyah, Khurais Road Abi Al Arab Stree...","[""Sandwich wasn't tasty and it was expensive. ...",1,0,0,0,0,0,1,1,0,0
3,3,The Shaky,3.9,52,0,coffee shop,"Cosy, Casual, Groups",لوكاليزر مول بوابة رقم 7 طريق الأمير محمد بن ع...,"['Nice place', ""It's delicious you can build y...",1,1,0,0,0,0,1,0,0,0
4,4,قرمز كافيه - قهوة مختصة,4.1,956,2,coffee shop,"Cosy, Casual, Groups",2659 Dammam Branch Road Al Yarmuk Riyadh 13243...,['This coffee shop is a two story shop with a ...,1,1,0,0,0,0,1,0,0,0


#### **One hot encoding for shop_type**

In [309]:
# convert shop_type to numeric 
df = pd.get_dummies(df, columns=['shop_type'])
df.head(3)

Unnamed: 0.1,Unnamed: 0,shop_name,rating,numbers_of_rating,price,key_words,address,reviews,Cosy,Groups,Good for kids,Breakfast,Late-night food,Happy hour food,Casual,Vegetarian options,Outdoor seating,Cash only,shop_type_cafe,shop_type_coffee shop
0,0,Elixir Bunn Coffee,4.3,218,0,none,"King Abdullah Rd, حي الحمراء، Riyadh 13215",['Amazing new branch for my favorite coffee ho...,0,0,0,0,0,0,0,0,0,0,1,0
1,1,Chamonix Cafe,4.0,563,3,"Late-night food, Breakfast, Outdoor seating","9259 Wadi Al Awsat, Al nbsp;2430, Riyadh",['Its really romantic and lovely place with go...,0,0,0,1,1,0,0,0,1,0,0,1
2,2,dr.CAFE COFFEE,4.1,1571,2,"Cosy, Casual, Vegetarian options","As Sulimaniyah, Khurais Road Abi Al Arab Stree...","[""Sandwich wasn't tasty and it was expensive. ...",1,0,0,0,0,0,1,1,0,0,0,1


#### **Create Target** 

In [310]:
## Add target column
df['quality'] = ['good' if i > 4 else 'bad' for i in df.rating]

In [311]:
# split reviews for the same coffee shop
reviews = []
for i in range(len(df)):
  reviews_per_coffee_shop = str(df.loc[i].reviews).replace('"',"'").split("', '")
  for j in range(len(reviews_per_coffee_shop)):
    # add review and quality to the reviews list
    reviews.append([reviews_per_coffee_shop[j],df.loc[i].quality]) 

In [312]:
# create new dataframe for reviews 
reviews_df = pd.DataFrame.from_records(reviews, columns=['Review','Quality'])
reviews_df.sample(3)

Unnamed: 0,Review,Quality
324,ONE WORD REVIEW = BRILLIANT ...,bad
176,To be honest the coffee here is way better tha...,good
402,Salted caramel ❤️'],good


In [313]:
reviews_df.shape

(562, 2)

#### **Clean data**

In [314]:
def clean_text(text):
  '''
  DESCRIPTION:
  This function to preproccesing text 

  INPUT: 
  text: string

  OUTPUT: 
  text: string after clean it

  ''' 
  text = text.lower() # convert letters to lower case
  text = re.sub("[^a-zA-Z]", " ", text) # remove non-letters
  text = re.sub(r'\d+', '', text) # remove number
  text = re.sub(r'http\S+', '', text) # remove links
  text = text.translate(str.maketrans('','', string.punctuation)) # remove punctuation
  text = re.sub(' +', ' ',text) # remove extra space
  text = text.strip() # remove whitespaces

  text = ' '.join([word for word in text.split() if word not in stopwords.words("english")]) # remove stop words

  # lemmatization returns an actual word of the language
  lemma = nltk.WordNetLemmatizer() # define lemmatizer
  text = ' '.join([lemma.lemmatize(word) for word in text.split()]) 
  
  return text

In [315]:
# The cleaning function applied in all reviews
reviews_df['Cleaned_Review'] = reviews_df['Review'].apply(clean_text)

In [316]:
# get the empty reviews after cleaning procces 
empty_reviews_index = reviews_df[reviews_df.Cleaned_Review == ''].index
print("There are ",len(empty_reviews_index),' empty reviews in the data.')

# siince the empty reviews are few, we drop these rows 
reviews_df.drop(empty_reviews_index, inplace= True)
reviews_df.sample(3)

There are  7  empty reviews in the data.


Unnamed: 0,Review,Quality,Cleaned_Review
51,['Awful coffee and tea,bad,awful coffee tea
435,['One of the best speciality coffee at Riyadh ...,bad,one best speciality coffee riyadh tried latte ...
243,Great coffee and very helpful staff,good,great coffee helpful staff


## **Baseline**

In [317]:
# baseline model
reviews_df.Quality.value_counts(normalize=True)

bad     0.556757
good    0.443243
Name: Quality, dtype: float64

## **Reviews Classifiction model**

### **Data prepration**

#### **Create a bag of words (CountVectorizer)**

[CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) use to convert a collection of text documents to a matrix of token counts.


In [318]:
max_features = 150 # maximum number of features 
count_vector = CountVectorizer(max_features = max_features, stop_words='english')  # create Count Vectorizer
X = count_vector.fit_transform(reviews_df['Cleaned_Review']).toarray() # fit the CountVectorizer using reviews data
X

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [319]:
print("Most using {} words: {} ".format(max_features, count_vector.get_feature_names()))

Most using 150 words: ['amazing', 'area', 'ask', 'atmosphere', 'available', 'average', 'bad', 'bean', 'beautiful', 'best', 'better', 'bit', 'branch', 'bun', 'buy', 'cafe', 'cake', 'came', 'caramel', 'cashier', 'chair', 'chocolate', 'clean', 'coffee', 'cold', 'come', 'comfortable', 'cozy', 'crowded', 'cup', 'customer', 'day', 'deal', 'decoration', 'definitely', 'delicious', 'dessert', 'different', 'drink', 'drive', 'enjoy', 'environment', 'especially', 'espresso', 'excellent', 'expensive', 'experience', 'family', 'fast', 'favorite', 'flat', 'floor', 'food', 'free', 'friend', 'friendly', 'good', 'google', 'great', 'guy', 'helpful', 'high', 'honey', 'hot', 'ice', 'iced', 'inside', 'know', 'latte', 'le', 'like', 'liked', 'little', 'location', 'long', 'lot', 'love', 'loved', 'lovely', 'make', 'meeting', 'menu', 'milk', 'mocha', 'morei', 'morning', 'multiple', 'music', 'nan', 'need', 'new', 'nice', 'offer', 'open', 'option', 'order', 'ordered', 'original', 'outdoor', 'overall', 'parking', 'p



In [320]:
# counts for each word in count_vector
print(count_vector.vocabulary_)

{'amazing': 0, 'new': 90, 'branch': 12, 'favorite': 49, 'coffee': 23, 'recommended': 111, 'like': 70, 'different': 37, 'cafe': 15, 'drink': 38, 'cold': 24, 'good': 56, 'nice': 91, 'staff': 125, 'work': 149, 'excellent': 44, 'cup': 29, 'morning': 85, 'sure': 130, 'really': 109, 'lovely': 78, 'place': 103, 'food': 52, 'enjoy': 40, 'little': 72, 'friendly': 55, 'price': 104, 'come': 25, 'great': 58, 'service': 117, 'sandwich': 113, 'tasty': 134, 'expensive': 45, 'friend': 54, 'multiple': 86, 'type': 141, 'comfortable': 26, 'decoration': 33, 'shop': 118, 'riyadh': 112, 'variety': 143, 'hot': 63, 'cake': 16, 'serve': 116, 'delicious': 35, 'translated': 138, 'google': 57, 'small': 121, 'original': 97, 'spacious': 122, 'area': 1, 'family': 47, 'single': 119, 'section': 115, 'ordered': 96, 'time': 137, 'experience': 46, 'atmosphere': 3, 'best': 9, 'bad': 6, 'beautiful': 8, 'latte': 68, 'dessert': 36, 'flat': 50, 'white': 148, 'option': 94, 'spanish': 123, 'try': 140, 'sweet': 131, 'crowded': 2

In [321]:
d = pd.DataFrame(X,columns=count_vector.get_feature_names())
d

Unnamed: 0,amazing,area,ask,atmosphere,available,average,bad,bean,beautiful,best,...,try,type,usually,variety,visit,visited,waiter,way,white,work
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
550,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
551,0,0,0,0,0,0,1,0,0,2,...,0,0,0,0,0,0,0,0,0,0
552,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
553,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### **Split data to train and test**
Pretty obvious that the random_state is 42, which is the [Answer to the Ultimate Question of Life, the Universe, and Everything](https://en.wikipedia.org/wiki/42_(number)#Popular_culture). :D :D

In [322]:
X_train, X_test, y_train, y_test = train_test_split(X, reviews_df['Quality'], test_size =0.2, random_state=42)

### **Logistic Regression**

In [323]:
# Define Logistic Regression
model = LogisticRegression()

# train model
model.fit(X_train, y_train) 

#### **Results**

In [324]:
# Predicting the Train set results 
print('Train model accuracy: ', accuracy_score(y_train, model.predict(X_train)))

Train model accuracy:  0.7657657657657657


In [325]:
# Predicting the Test set results 
y_pred = model.predict(X_test) 
print('Test model accuracy: ', accuracy_score(y_test, y_pred))

Test model accuracy:  0.6756756756756757


In [326]:
print(style.BOLD,"Predict 'Overall Quality' using reviews", style.END)
print("The classification report for Logistic Regression model")
print('-------------------------------------------------------\n')
print(classification_report(y_test, y_pred))

[1m Predict 'Overall Quality' using reviews [0m
The classification report for Logistic Regression model
-------------------------------------------------------

              precision    recall  f1-score   support

         bad       0.72      0.76      0.74        67
        good       0.60      0.55      0.57        44

    accuracy                           0.68       111
   macro avg       0.66      0.65      0.66       111
weighted avg       0.67      0.68      0.67       111



## **Overall shop quality prediction using features**

#### **Split data to train and test**

In [327]:
df.sample(3).T

Unnamed: 0,40,100,192
Unnamed: 0,40,100,192
shop_name,Capio Diem,STARBUCKS,Wayne's Coffee
rating,4.2,4.0,4.7
numbers_of_rating,629,286,3
price,2,2,0
key_words,"Cosy, Casual, Groups",none,"Cosy, Casual"
address,7343 ابي بكر الصديق، 2284، Riyadh 13313 2284,برج الفيصلية -طريق العليا - حي الاميرية، Riyad...,"Takhassusi St, المعذر الشمالي،, Al، Riyadh 12332"
reviews,['Enjoyable place with a lot of board games. N...,"[""I really love this place. It's comfy and cro...",['(Translated by Google) Great place to drink ...
Cosy,1,0,1
Groups,1,0,0


In [330]:
X_ = df[['numbers_of_rating', 'price','shop_type_cafe', 'shop_type_coffee shop','Breakfast', 
       'Cash only', 'Vegetarian options', 'Cosy', 'Late-night food', 'Casual',
       'Outdoor seating', 'Groups', 'Happy hour food', 'Good for kids']]

y_ = df['quality']

In [331]:
X_train_, X_test_, y_train_, y_test_ = train_test_split(X_,y_, test_size =0.2, random_state=42)


### **Logistic Regression**

In [332]:
# Define Logistic Regression
#model_ = LogisticRegression()

model_ = RandomForestClassifier()

# train model
model_.fit(X_train_, y_train_) 



#### **Results**

In [333]:
# Predicting the Train set results 
print('Train model accuracy: ', accuracy_score(y_train_, model_.predict(X_train_)))

Train model accuracy:  0.9810126582278481


In [334]:
# Predicting the Test set results 
y_pred_ = model_.predict(X_test_) 
print('Test model accuracy: ', accuracy_score(y_test_, y_pred_))

Test model accuracy:  0.5


In [335]:
print(style.BOLD,"Predict 'Overall Quality' using features",style.END)
print("The classification report for Logistic Regression model")
print('-------------------------------------------------------\n')
print(classification_report(y_test_, y_pred_))

[1m Predict 'Overall Quality' using features [0m
The classification report for Logistic Regression model
-------------------------------------------------------

              precision    recall  f1-score   support

         bad       0.56      0.61      0.58        23
        good       0.40      0.35      0.38        17

    accuracy                           0.50        40
   macro avg       0.48      0.48      0.48        40
weighted avg       0.49      0.50      0.49        40



## **Remarks**

Here are the scope for improvements:
- Overfitting can be seen and hence, increase dataset.
- Add more features.
- Apply differant techniques for texts vectoriztion.