In [10]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import re


### Load the 'business' dataset

In [11]:
df2 = pd.read_json("yelp_academic_dataset_business.json", chunksize = 1000, lines = True)
drop_cols = ['address', 'state', 'postal_code', 'latitude', 'longitude', 'stars', 'review_count', 'is_open','attributes', 'hours']

* Deleting non useful columns

In [12]:
import time
start = time.time()
chunks = []
a = 0
for chunk in df2:
    a += 1
    chunk_b = chunk.drop(drop_cols, axis = 1)
    restas = chunk_b[chunk_b['categories'].str.contains('restaurant', case = False, na = False)]
    chunks.append(restas)
restaurants = pd.concat(chunks, ignore_index= True, join='outer')
end = time.time()
elapsed = end-start

In [13]:
restaurants.shape

(63961, 4)

## Load the reviews dataset
* Remember we made the merge to use ONLY restaurants data, because there were data from other things

In [14]:
reviews_raw = pd.read_json("yelp_academic_dataset_review.json", chunksize=100000, lines = True)
drop_cols = ['review_id', 'user_id','useful', 'funny', 'cool', 'date']

* Using merge instead of join because we want to join in another column other than the index

In [253]:
a = 0
for chunk in reviews_raw:
    a += 1
    reviews = chunk.drop(drop_cols, axis = 1)
    data = restaurants.merge(reviews, left_on = 'business_id', right_on = 'business_id',how = 'inner')
    if a == 1:
        break

### Finally the data to be preprocessed (the "text" column, to be exact) 
TBD:
* Delete all number 3 i.e neutral 
* Same number of positive as negatives
* Shuffle the data

In [254]:
data.head()

Unnamed: 0,business_id,name,city,categories,stars,text
0,lu7vtrp_bE9PnxWfA8g4Pg,Banzai Sushi,Thornhill,"Japanese, Fast Food, Food Court, Restaurants",5,"Great Sushi, and unbeatable prices! Only downf..."
1,lu7vtrp_bE9PnxWfA8g4Pg,Banzai Sushi,Thornhill,"Japanese, Fast Food, Food Court, Restaurants",3,I don't listen to my father often when it come...
2,vjTVxnsQEZ34XjYNS-XUpA,Wetzel's Pretzels,Phoenix,"Food, Pretzels, Bakeries, Fast Food, Restaurants",4,"Never heard of the cheese meltdown pretzel, bu..."
3,vjTVxnsQEZ34XjYNS-XUpA,Wetzel's Pretzels,Phoenix,"Food, Pretzels, Bakeries, Fast Food, Restaurants",4,"PV Mall's food court needs updating, but that ..."
4,fnZrZlqW1Z8iWgTVDfv_MA,Carl's Jr,Las Vegas,"Mexican, Restaurants, Fast Food",3,I haven't tried much on their menu but their c...


## Get a toyset to work in trials from here

In [258]:
data = data.loc[:, ['name', 'stars', 'text']]
data.shape

(66748, 3)

* Some graphs to know the number of reviews by ranking

* Delete the number 3's

In [259]:
data.drop(data[data['stars'] == 3].index, inplace = True)
data = data.reset_index(drop = True)

* We want just the reviews with 4-5 to be positive and the 1-2 to be negative, we do that on the following

In [261]:
data['Sentiment'] = data['stars'].apply(lambda x: 1 if x > 3 else 0)
data['Sentiment'].value_counts()

1    44050
0    13951
Name: Sentiment, dtype: int64

### -------------------------------------------------------------------------------------

### Use "3" as negative to augment the data and  balance the classes? (optional)
One could argue that the reviews that have not unconditionally positive

In [262]:
neg_half = list((data[data['stars'] <=3]).index)
pos_half = list(set(range(data.shape[0])) - set(neg_half))
pos_half = pos_half[0:len(neg_half)]
pos_half.extend(neg_half)
dropper = list(set(range(data.shape[0])) - set(pos_half))
data.drop(index = dropper, inplace = True)
data['Sentiment'] = data['stars'].apply(lambda x: 1 if x > 3 else 0)

In [263]:
data['Sentiment'].value_counts()

1    13951
0    13951
Name: Sentiment, dtype: int64

### -------------------------------------------------------------------------------------

## From here start the NLP pipeline

The goal here is to make a bag of words, it can be done manually, but also with sklearn.

**Steps:** I am trying to extract the nest tokens with the tokenizer from Potts and then feed that already "clean tokens" to the vectorizer

### PoC for NLTK

In [22]:
raw = data2['text'][1:3]
raw
raw2 = data2['text'][1]

In [23]:
from nltk.probability import FreqDist
counter = FreqDist()

In [24]:
tokenizer = []
a = 0
for i in raw2:
    words = i.lower()
    words = words.split()
    for word in words:
        a += 1
        counter[word] += 1
        if word not in tokenizer:
            tokenizer.append(word)
len(tokenizer)
tokenizer
words = raw2.split()
for word in words:
    a += 1
len(words)

94

### For the negation tagging, put the negation until ^[.:;!?]$ (until the punctuation mark)

In [25]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [26]:
#out of the box tokenizer and counter
from nltk.probability import FreqDist
counter = FreqDist()
tokens_nltk = word_tokenize(raw2)

tokenizer = []
for word in tokens_nltk:
    counter[word.lower()] += 1

len(tokens_nltk)
len(counter)
counter['the']

3

In [235]:
#counter.B() #is the number of unique words (?)
#x = counter.N #is the number of words
#counter.most_common(20)

In [28]:
stop = set(stopwords.words("english"))

# -------------------------------------------------------------

### Trials for regex

TBD:
### Check the paper that is mentioned in notion by UCLondon

* Delete all the reviews that are not in english
* Same number of negative that as positive reviews for the sets
* Tokenization and BoW creation (**BoW with frequency or with presence?**)
* unigrams and bigrams
* lower the case
* POS tagging?
* EDA como en potts con las palabras mas frecuentes en positives and negatives reviews
* Handling negation? _NOT or with sentiment negative scoring?
* See the book by Bing Lui for how to identify fake news, resonates with anomaly detection and identifying if a review is fake

In [29]:
string = """Hello my Number is 123456789 and  
             my friend's number is 987654321, and my number is also 987654321"""
    
# A sample regular expression to find digits.  
regex = '\d+'             
    
match = re.findall(regex, string)  
print(match)
raw2

['123456789', '987654321', '987654321']


"Been coming here since I was in grade 9 so about 10 years now (wow!) staff are very friendly and prices are ridiculously cheap. I remember back in my younger days being short on change and they never cared! Super nice family owned businesses. I always get the California roll, either I grab one out of the fridge or have them make it fresh if there's none. The tofu is also really good! They also sell so many different kinds of pop in a can for a buck and different Asian treats like pocky!"

# ---------------------------------------------------

## Negation tagging Function

In [30]:
def pesimist(text):
    x = text
    x = x.split()
    hasta = 0
    desde = 0
    c = 0
    j = 0
    passer = False
    for k in range(len(x)):
        #print(j)
        #print(k)
        #if not passer:
            #continue
        passer = True

        i = x[k]
        negation_string = r"""
        ^(?:never|no|nothing|nowhere|noone|none|not|
            havent|hasnt|hadnt|cant|couldnt|shouldnt|
            wont|wouldnt|but|doesnt|didnt|isnt|arent|aint
        )$|n't
        """
        #print("first")
        c +=1
        #match = re.search(r'\bthe\b',i)
        neg = re.compile(negation_string, re.VERBOSE | re.I | re.UNICODE)
        match = neg.findall(i)
        #print(i + "i")
        #print('desde', desde)
        #print(f'hasta menos desde es: {hasta - desde} y c es {c}')
        if c < (hasta - desde):
            continue
        if match:
            c = 0
            desde = k + 1
            #print(c - 1)
            #j = i
            jump = k
            for j in range(jump, 100):

                #print(j)
                try:
                    comma = re.search(r'[.:;!?]', x[j])
                    #print("second")
                    if comma:
                        hasta = j + 1
                        c += 1
                        
                        #this try to avoid error if there is no punctuation error before the phrase ends
                        try:
                            for i in range(desde, hasta):
                                repl = re.match(r'\w+', x[i])
                                x[i] = repl.group() + "_NOT"
                            
                            c = 0
                            break
                        except:
                            #print(' '.join(x))
                            c = 0
                            break
                except:
                    pass
        if match:
            continue



                #continue


            #else:
                #continue
            #break
    xx = ' '.join(x)
    return xx

## *Already done*
    1. Identify all the negation words on the regular expresion, can be taken from one paper.
    2. Implement it i conjuction with the tokenizer and the stop words removal
    3. run it for all the dataset
    4. Balance the positive and negative classes on the data set that we are going to take to make all the trials.
    5. Finish the identification of features
    


## Current status

order of the pipeline (in **bold** what is already done)

(balance the sample 50/50 in reviews)
1. **Negator**
2. **Tokenizer**
3. **Stop words removal (kind of done, have to figure it out)**
3. **BoW**

*Follows: Select the features and extend the functionality for all the reviews*
1. bigram
2. Positive Tokens
3. Negative Tokens

## Negation tagging

In [264]:
trial = data.copy()

In [265]:
#Apply the function of negation taggin to each row
trial["sample"] = trial.loc[:, "text"].apply(pesimist)

In [266]:
trial.head()

Unnamed: 0,name,stars,text,Sentiment,sample
0,Banzai Sushi,5,"Great Sushi, and unbeatable prices! Only downf...",1,"Great Sushi, and unbeatable prices! Only downf..."
1,Wetzel's Pretzels,4,"Never heard of the cheese meltdown pretzel, bu...",1,Never heard_NOT of_NOT the_NOT cheese_NOT melt...
2,Wetzel's Pretzels,4,"PV Mall's food court needs updating, but that ...",1,"PV Mall's food court needs updating, but that_..."
3,Carl's Jr,4,I'm a Carl's Jr fan for their fried zucchini a...,1,I'm a Carl's Jr fan for their fried zucchini a...
4,Carl's Jr,4,Customer service has been top notch on every v...,1,Customer service has been top notch on every v...


## Tokenizer


In [267]:
from Utils.happyfuntokenizing import Tokenizer

In [268]:
tok = Tokenizer()

In [269]:
tok = Tokenizer()
tok.__dict__

{'preserve_case': False, 'all_in': False}

In [270]:
trial["tokens"] = trial.loc[:, "sample"].apply(tok.tokenize)
trial.reset_index(inplace = True)

In [271]:
trial["Sentiment"].value_counts()

1    13951
0    13951
Name: Sentiment, dtype: int64

## Removing 'filler' words
Counting the most common words over all the corpora we can tell that the most common ones, and thus, tne ones that may play a bid role on the classification, are not meaningful for us to discover which words are really expressing a positive or negative emotion, thus, the first 15 words are going to be removed from the corpus.

In the following cells, the most common words are shown

In [337]:
reviews = trial['tokens'].to_numpy()
review1 = reviews[0]

In [341]:
reviews[0]

['great',
 'sushi',
 'and',
 'unbeatable',
 'prices',
 'only',
 'downfall',
 'is',
 'that',
 'they',
 'are',
 'cash',
 'only',
 'and',
 'close',
 'by',
 '7pm']

In [296]:
def most_common_words(raw_reviews, w_number):
    """
    Creates a list with the most 'w_number' (number) of words on the whole corpus
    ---------
    raw_reviews : all the column of the raw reviews
    w_number = number of most common words that wish to be extracted
    """
    
    commons = []
    for word in counting:
        commons.append(word[0])
        if len(commons) == w_number:
            break
    return commons, counter
cc, count = most_common_words(reviews, 9)


In [334]:
a= count.most_common()
print(a[0:1000])

[('the', 144056), ('and', 97062), ('i', 70720), ('a', 65661), ('to', 61725), ('was', 57025), ('of', 39427), ('it', 35423), ('for', 32693), ('is', 31724), ('we', 29998), ('in', 27953), ('my', 23342), ('this', 23075), ('with', 22284), ('that', 22265), ('but', 21756), ('they', 20238), ('food', 19773), ('not', 19668), ('on', 18536), ('were', 17890), ('had', 17876), ('you', 16784), ('have', 15084), ('so', 14639), ('the_not', 13974), ('at', 13970), ('place', 13797), ('good', 13268), ('our', 11915), ('are', 11839), ('be', 11175), ('service', 10710), ('as', 10688), ('there', 10300), ('very', 10113), ('out', 9576), ('great', 9481), ('just', 9248), ('if', 9199), ('here', 9143), ('like', 8918), ('all', 8813), ('to_not', 8510), ('me', 8475), ('time', 8220), ('one', 8210), ('when', 7755), ('back', 7569), ('their', 7394), ('would', 7348), ('ordered', 7247), ('get', 7073), ('no', 7064), ('from', 7054), ('a_not', 6823), ('up', 6804), ('which', 6649), ('...', 6630), ('and_not', 6627), ('us', 6584), ('w

In [351]:
def restricted_corpus_builder(corpus_size, raw_reviews):
    """
    Create the bag of words with the first 'corpus_size' most common words present on the reviews, 
    n
    -----------
    corpus_size : number of words to be included on de bow
    raw_reviews : all the column of the raw reviews
    """
    bow = []
    bow_counter = []
    counter = FreqDist()
    for review in raw_reviews:
        for word in review:
            counter[word] += 1
    counting = counter.most_common(corpus_size)
    for word in counting:
        bow.append(word[0])
        bow_counter.append(word[1])
    return bow, bow_counter  
bow, bow_counted = restricted_corpus_builder(1500, reviews)

## Bag of words
Using the tokens, of course
* Make the corpora
* Make the vectors with word presence/frecuency. I think presenc may be better for vector-space representation

TBD: check why is not working when the number of ommited words is omre than 1

In [288]:
def corpus_builder(raw_reviews, most_common):
    """
    Create the bag of words of all the words present on the reviews, ommiting the 'most_common' words as they are 
    conseidered as fillers with low influence on the classification
    -----------
    raw_reviews : all the column of the raw reviews
    most_common : list of most common words that wish to be ommited
    """
    main_corpus = []
    for review in raw_reviews:
        for word in review:
            if word in most_common:
                continue
            if word not in main_corpus:
                main_corpus.append(word)
    return main_corpus
#bow = corpus_builder(reviews, cc)
#len(bow)

In [355]:
def vectorizer(bow, ind_review, ommitted_words):
    """
    Vectorize the review based on the counting of the words present on the review, 
    the appereances are counted on a dictionary, the value of the words (keys) that are not present remain as '0'
    ----------
    bow : bag of all the words in list (unique values)
    vector: individual review tokenized on the form of a list
    """
    index = 0
    counter = dict.fromkeys(bow, 0)
    for word in ind_review:
        try:
            index += 1
            if word in ommitted_words:
                continue
            else:
                counter[word] += 1
        except:
            pass
    return list(counter.values()), index
#vv = vectorizer(bow, review1, cc)

In [356]:
def matrix_builder(bow, review_series, ommited_words):
    """
    Creates the matrix of features based on the term frecuency vectors created by the function vectorizer
    ----------
    review_series : pandas object (series)  i.e dataframe[name_of_column]
    """
    X = []
    i = 0
    
    for review in review_series:
        word_vector, index = vectorizer(bow, review, ommited_words)
        if i < 1:
            X = np.array([word_vector])
            i += 1
            continue
        X = np.append(X, [word_vector], axis = 0)
            
    return X

### Pipeline for the creation of the feature matrix
* features thus far: 
    * unigram i.e. word frecuency 

In [273]:
reviews = trial['tokens'].to_numpy()
common = most_common_words(reviews, 9)
bow = corpus_builder(reviews, common)
raw_reviews = trial['tokens']
X = matrix_builder(bow, raw_reviews, common)
X.shape

KeyError: 'is'

In [364]:
raw_reviews = trial['tokens']
raw_reviews = raw_reviews[0:1000]

In [365]:
common = []
X = matrix_builder(bow, raw_reviews, common)
X.shape

(1000, 1500)

array([[0, 2, 0, ..., 0, 0, 0],
       [2, 1, 0, ..., 0, 0, 0],
       [5, 2, 3, ..., 0, 0, 0],
       ...,
       [8, 3, 5, ..., 0, 0, 0],
       [2, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [381]:
target_vector = trial['Sentiment'].to_numpy()
target = np.asarray([target_vector[0:1000]])
target.shape[1]

1000

In [385]:
target = np.reshape(target, ([target.shape[1],-1]))
target.shape

(1000, 1)

AttributeError: 'list' object has no attribute 'keys'