In [10]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import re


### Load the 'business' dataset

In [11]:
df2 = pd.read_json("yelp_academic_dataset_business.json", chunksize = 1000, lines = True)
drop_cols = ['address', 'state', 'postal_code', 'latitude', 'longitude', 'stars', 'review_count', 'is_open','attributes', 'hours']

* Deleting non useful columns

In [12]:
import time
start = time.time()
chunks = []
a = 0
for chunk in df2:
    a += 1
    chunk_b = chunk.drop(drop_cols, axis = 1)
    restas = chunk_b[chunk_b['categories'].str.contains('restaurant', case = False, na = False)]
    chunks.append(restas)
restaurants = pd.concat(chunks, ignore_index= True, join='outer')
end = time.time()
elapsed = end-start

In [13]:
restaurants.shape

(63961, 4)

## Load the reviews dataset
* Remember we made the merge to use ONLY restaurants data, because there were data from other things

In [14]:
reviews_raw = pd.read_json("yelp_academic_dataset_review.json", chunksize=100000, lines = True)
drop_cols = ['review_id', 'user_id','useful', 'funny', 'cool', 'date']

* Using merge instead of join because we want to join in another column other than the index

In [212]:
a = 0
for chunk in reviews_raw:
    a += 1
    reviews = chunk.drop(drop_cols, axis = 1)
    data = restaurants.merge(reviews, left_on = 'business_id', right_on = 'business_id',how = 'inner')
    if a == 1:
        break

### Finally the data to be preprocessed (the "text" column, to be exact) 
TBD:
* Delete all number 3 i.e neutral 
* Same number of positive as negatives
* Shuffle the data

In [213]:
data.head()

Unnamed: 0,business_id,name,city,categories,stars,text
0,pQeaRpvuhoEqudo3uymHIQ,The Empanadas House,Champaign,"Ethnic Food, Food Trucks, Specialty Food, Impo...",5,I ordered feta cheese and spinach empanadas an...
1,CsLQLiRoafpJPJSkNX2h5Q,Middle East Deli,Charlotte,"Food, Restaurants, Grocery, Middle Eastern",2,"I've eaten at this location since 1997/98, so ..."
2,vjTVxnsQEZ34XjYNS-XUpA,Wetzel's Pretzels,Phoenix,"Food, Pretzels, Bakeries, Fast Food, Restaurants",5,Wetzels's Pretzels is definitely amazing and v...
3,fnZrZlqW1Z8iWgTVDfv_MA,Carl's Jr,Las Vegas,"Mexican, Restaurants, Fast Food",1,WORST experience EVER!!!!! never have i ate an...
4,fnZrZlqW1Z8iWgTVDfv_MA,Carl's Jr,Las Vegas,"Mexican, Restaurants, Fast Food",4,Hot fresh food usually. Staff seems to turn o...


## Get a toyset to work in trials from here

In [214]:
data = data.loc[:, ['name', 'stars', 'text']]
data.shape

(66729, 3)

* Some graphs to know the number of reviews by ranking

* Delete the number 3's

In [230]:
data.drop(data[data['stars'] == 3].index, inplace = True)
data = data.reset_index(drop = True)

* We want just the reviews with 4-5 to be positive and the 1-2 to be negative, we do that on the following

In [233]:
data['Sentiment'] = data['stars'].apply(lambda x: 1 if x > 3 else 0)
data['Sentiment'].value_counts()

1    44431
0    13695
Name: Sentiment, dtype: int64

### -------------------------------------------------------------------------------------

### Use "3" as negative to augment the data and  balance the classes? (optional)
One could argue that the reviews that have not unconditionally positive

In [20]:
data3 = data2.copy()
neg_half = list((data3[data3['stars'] <=3]).index)
pos_half = list(set(range(data3.shape[0])) - set(neg_half))
pos_half = pos_half[0:len(neg_half)]
pos_half.extend(neg_half)
dropper = list(set(range(data3.shape[0])) - set(pos_half))
data3.drop(index = dropper, inplace = True)
data3['Sentiment'] = data3['stars'].apply(lambda x: 1 if x > 3 else 0)

In [21]:
data3['Sentiment'].value_counts()
data2 = data3.copy()

### -------------------------------------------------------------------------------------

## From here start the NLP pipeline

The goal here is to make a bag of words, it can be done manually, but also with sklearn.

**Steps:** I am trying to extract the nest tokens with the tokenizer from Potts and then feed that already "clean tokens" to the vectorizer

### PoC for NLTK

In [22]:
raw = data2['text'][1:3]
raw
raw2 = data2['text'][1]

In [23]:
from nltk.probability import FreqDist
counter = FreqDist()

In [24]:
tokenizer = []
a = 0
for i in raw2:
    words = i.lower()
    words = words.split()
    for word in words:
        a += 1
        counter[word] += 1
        if word not in tokenizer:
            tokenizer.append(word)
len(tokenizer)
tokenizer
words = raw2.split()
for word in words:
    a += 1
len(words)

94

### For the negation tagging, put the negation until ^[.:;!?]$ (until the punctuation mark)

In [25]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [26]:
#out of the box tokenizer and counter
from nltk.probability import FreqDist
counter = FreqDist()
tokens_nltk = word_tokenize(raw2)

tokenizer = []
for word in tokens_nltk:
    counter[word.lower()] += 1

len(tokens_nltk)
len(counter)
counter['the']

3

In [235]:
#counter.B() #is the number of unique words (?)
#x = counter.N #is the number of words
#counter.most_common(20)

In [28]:
stop = set(stopwords.words("english"))

# -------------------------------------------------------------

### Trials for regex

TBD:
### Check the paper that is mentioned in notion by UCLondon

* Delete all the reviews that are not in english
* Same number of negative that as positive reviews for the sets
* Tokenization and BoW creation (**BoW with frequency or with presence?**)
* unigrams and bigrams
* lower the case
* POS tagging?
* EDA como en potts con las palabras mas frecuentes en positives and negatives reviews
* Handling negation? _NOT or with sentiment negative scoring?
* See the book by Bing Lui for how to identify fake news, resonates with anomaly detection and identifying if a review is fake

In [29]:
string = """Hello my Number is 123456789 and  
             my friend's number is 987654321, and my number is also 987654321"""
    
# A sample regular expression to find digits.  
regex = '\d+'             
    
match = re.findall(regex, string)  
print(match)
raw2

['123456789', '987654321', '987654321']


"Been coming here since I was in grade 9 so about 10 years now (wow!) staff are very friendly and prices are ridiculously cheap. I remember back in my younger days being short on change and they never cared! Super nice family owned businesses. I always get the California roll, either I grab one out of the fridge or have them make it fresh if there's none. The tofu is also really good! They also sell so many different kinds of pop in a can for a buck and different Asian treats like pocky!"

# ---------------------------------------------------

## Negation tagging Function

In [30]:
def pesimist(text):
    x = text
    x = x.split()
    hasta = 0
    desde = 0
    c = 0
    j = 0
    passer = False
    for k in range(len(x)):
        #print(j)
        #print(k)
        #if not passer:
            #continue
        passer = True

        i = x[k]
        negation_string = r"""
        ^(?:never|no|nothing|nowhere|noone|none|not|
            havent|hasnt|hadnt|cant|couldnt|shouldnt|
            wont|wouldnt|but|doesnt|didnt|isnt|arent|aint
        )$|n't
        """
        #print("first")
        c +=1
        #match = re.search(r'\bthe\b',i)
        neg = re.compile(negation_string, re.VERBOSE | re.I | re.UNICODE)
        match = neg.findall(i)
        #print(i + "i")
        #print('desde', desde)
        #print(f'hasta menos desde es: {hasta - desde} y c es {c}')
        if c < (hasta - desde):
            continue
        if match:
            c = 0
            desde = k + 1
            #print(c - 1)
            #j = i
            jump = k
            for j in range(jump, 100):

                #print(j)
                try:
                    comma = re.search(r'[.:;!?]', x[j])
                    #print("second")
                    if comma:
                        hasta = j + 1
                        c += 1
                        
                        #this try to avoid error if there is no punctuation error before the phrase ends
                        try:
                            for i in range(desde, hasta):
                                repl = re.match(r'\w+', x[i])
                                x[i] = repl.group() + "_NOT"
                            
                            c = 0
                            break
                        except:
                            #print(' '.join(x))
                            c = 0
                            break
                except:
                    pass
        if match:
            continue



                #continue


            #else:
                #continue
            #break
    xx = ' '.join(x)
    return xx

## Finally the negation tagging is working, next steps:
    1. Identify all the negation words on the regular expresion, can be taken from one paper.
    2. Implement it i conjuction with the tokenizer and the stop words removal
    3. run it for all the dataset
    4. Balance the positive and negative classes on the data set that we are going to take to make all the trials.
    5. Finish the identification of features

## Current status

order of the pipeline (in **bold** what is already done)

(balance the sample 50/50 in reviews)
1. **Negator**
2. **Tokenizer**
3. **Stop words removal (kind of done, have to figure it out)**
3. **BoW**

*Follows: Select the features and extend the functionality for all the reviews*
1. bigram
2. Positive Tokens
3. Negative Tokens

## Negation tagging

In [238]:
trial = data.copy()

In [239]:
#Apply the function of negation taggin to each row
trial["sample"] = trial.loc[:, "text"].apply(pesimist)

In [242]:
trial.head()

Unnamed: 0,name,stars,text,Sentiment,sample
0,The Empanadas House,5,I ordered feta cheese and spinach empanadas an...,1,I ordered feta cheese and spinach empanadas an...
1,Middle East Deli,2,"I've eaten at this location since 1997/98, so ...",0,"I've eaten at this location since 1997/98, so ..."
2,Wetzel's Pretzels,5,Wetzels's Pretzels is definitely amazing and v...,1,Wetzels's Pretzels is definitely amazing and v...
3,Carl's Jr,1,WORST experience EVER!!!!! never have i ate an...,0,WORST experience EVER!!!!! never have_NOT i_NO...
4,Carl's Jr,4,Hot fresh food usually. Staff seems to turn o...,1,Hot fresh food usually. Staff seems to turn ov...


## Tokenizer


In [243]:
from Utils.happyfuntokenizing import Tokenizer

In [244]:
tok = Tokenizer()

In [245]:
tok = Tokenizer()
tok.__dict__

{'preserve_case': False, 'all_in': False}

In [246]:
trial["tokens"] = trial.loc[:, "sample"].apply(tok.tokenize)
trial.reset_index(inplace = True)

In [247]:
trial["Sentiment"].value_counts()

1    44431
0    13695
Name: Sentiment, dtype: int64

## Removing 'filler' words
Counting the most common words over all the corpora we can tell that the most common ones, and thus, tne ones that may play a bid role on the classification, are not meaningful for us to discover which words are really expressing a positive or negative emotion, thus, the first 15 words are going to be removed from the corpus.

In the following cells, the most common words are shown

In [184]:
reviews = trial['tokens'].to_numpy()
review1 = reviews[0]

In [177]:
def most_common_words(raw_reviews, w_number):
    """
    Creates a list with the most 'w_number' (number) of words on the whole corpus
    ---------
    raw_reviews : all the column of the raw reviews
    w_number = number of most common words that wish to be extracted
    """
    counter = FreqDist()
    for review in raw_reviews:
        for word in review:
            counter[word.lower()] += 1
    counting = counter.most_common(w_number)
    commons = []
    for word in counting:
        commons.append(word[0])
        if len(commons) == w_number:
            break
    return commons
cc = most_common_words(reviews, 9)

## Bag of words
Using the tokens, of course
* Make the corpora
* Make the vectors with word presence/frecuency. I think presenc may be better for vector-space representation

TBD: check why is not working when the number of ommited words is omre than 1

In [192]:
def corpus_builder(raw_reviews, most_common):
    """
    Create the bag of words of all the words present on the reviews, ommiting the 'most_common' words as they are 
    conseidered as fillers with low influence on the classification
    -----------
    raw_reviews : all the column of the raw reviews
    most_common : list of most common words that wish to be ommited
    """
    main_corpus = []
    for review in raw_reviews:
        for word in review:
            if word in most_common:
                continue
            if word not in main_corpus:
                main_corpus.append(word)
    return main_corpus
bow = corpus_builder(reviews, cc)
len(bow)

1700

In [198]:
def vectorizer(bow, ind_review, ommitted_words):
    """
    Vectorize the review based on the counting of the words present on the review, 
    the appereances are counted on a dictionary, the value of the words (keys) that are not present remain as '0'
    ----------
    bow : bag of all the words in list (unique values)
    vector: individual review tokenized on the form of a list
    """
    counter = dict.fromkeys(bow, 0)
    for word in ind_review:
        if word in ommitted_words:
            continue
        else:
            counter[word] += 1
    return list(counter.values())
vv = vectorizer(bow, review1, cc)

1700

In [202]:
def matrix_builder(bow, review_series, ommited_words):
    """
    Creates the matrix of features based on the term frecuency vectors created by the function vectorizer
    ----------
    review_series : pandas object (series)  i.e dataframe[name_of_column]
    """
    X = []
    i = 0
    
    for review in review_series:
        word_vector = vectorizer(bow, review, ommited_words)
        if i < 1:
            X = np.array([word_vector])
            i += 1
            continue
        X = np.append(X, [word_vector], axis = 0)
    return X

### Pipeline for the creation of the feature matrix
* features thus far: 
    * word frecuency

In [203]:
reviews = trial['tokens'].to_numpy()
common = most_common_words(reviews, 9)
bow = corpus_builder(reviews, most_common_in_corpus)
raw_reviews = trial['tokens']
X = matrix_builder(bow, raw_reviews, common)
X.shape

(58, 1700)

In [157]:
most_common_words(reviews, 10)

['the', 'and', 'i', 'a', 'to', 'was', 'of', 'in', 'is', 'it']

1694

[('the', 323),
 ('and', 201),
 ('i', 179),
 ('a', 131),
 ('to', 130),
 ('was', 106),
 ('of', 90),
 ('in', 81),
 ('is', 71),
 ('it', 66),
 ('this', 62),
 ('for', 61),
 ('we', 58),
 ('my', 54),
 ('but', 49),
 ('that', 48),
 ('place', 46),
 ('food', 43),
 ('they', 42),
 ('on', 42),
 ('with', 36),
 ('not', 36),
 ('at', 35),
 ('have', 34),
 ('are', 33)]

In [94]:
help(FreqDist)

Help on class FreqDist in module nltk.probability:

class FreqDist(collections.Counter)
 |  FreqDist(samples=None)
 |  
 |  A frequency distribution for the outcomes of an experiment.  A
 |  frequency distribution records the number of times each outcome of
 |  an experiment has occurred.  For example, a frequency distribution
 |  could be used to record the frequency of each word type in a
 |  document.  Formally, a frequency distribution can be defined as a
 |  function mapping from each sample to the number of times that
 |  sample occurred as an outcome.
 |  
 |  Frequency distributions are generally constructed by running a
 |  number of experiments, and incrementing the count for a sample
 |  every time it is an outcome of an experiment.  For example, the
 |  following code will produce a frequency distribution that encodes
 |  how often each word occurs in a text:
 |  
 |      >>> from nltk.tokenize import word_tokenize
 |      >>> from nltk.probability import FreqDist
 |      >

In [99]:
counter.plot(samples=counter.most_common(20).keys())

AttributeError: 'list' object has no attribute 'keys'