# NLP: Analyzing Review Text


### The objective is to be able to extract the positive or negative sentiment and gain insight from review text using Yelp review data.

## Download and parse the data


To start, let's download the data set from Amazon S3:

In [3]:
!aws s3 sync s3://mldata/ . --exclude '*' --include 'yelp_train_academic_dataset_review_reduced.json.gz'

In [4]:
import gzip
import ujson as json

with gzip.open('yelp_train_academic_dataset_review_reduced.json.gz') as f:
    data = [json.loads(line) for line in f]

In [5]:
stars = [row['stars'] for row in data]

### First, I build a linear model predicting the star rating based on the text reviews (Ridge Regression)


In [6]:
from sklearn.base import BaseEstimator
import pandas as pd
from sklearn.base import TransformerMixin

class ToText(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return [row['text'] for row in X]

In [66]:
#testing

In [7]:
to_text = ToText()


In [None]:
new = to_text.fit_transform(data)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english', min_df=20, max_df=0.4)

cv.fit_transform(new)

<253272x20168 sparse matrix of type '<class 'numpy.int64'>'
	with 11993261 stored elements in Compressed Sparse Row format>

In [10]:
text_t = cv.fit_transform(new)

In [11]:
print(cv.get_feature_names_out())

['00' '000' '00am' ... 'zupas' 'zuzu' 'über']


In [95]:
# grid search

In [109]:
from sklearn.model_selection import GridSearchCV

cv = CountVectorizer(stop_words='english')
cv_text = GridSearchCV(
    cv,
    {'min_df':[1,2,4,8,10,12,16,20,22,24,26], 'max_df':[0.4,0.5,0.6,0.7,0.8,0.9]}
)

In [121]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV


rg = Ridge()
ridge = GridSearchCV(
    rg,
    {'alpha': [1,0.1,0.01,0.001,0.0001,0]},
     n_jobs=-1
)

ridge.fit(text_t, stars) 

GridSearchCV(estimator=Ridge(), n_jobs=-1,
             param_grid={'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0]})

In [122]:
ridge.best_params_

{'alpha': 1}

In [19]:
# creating the pipeline


In [160]:
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.utils import shuffle

nlp_model = Pipeline([
    ('to_text', to_text),
    ('count_vectorizer', CountVectorizer(stop_words='english', min_df=20, max_df=0.4)),
    ('ridge', Ridge(alpha=1))
])

In [124]:
nlp_model.fit(data, stars)

Pipeline(steps=[('to_text', ToText()),
                ('count_vectorizer',
                 CountVectorizer(max_df=0.8, min_df=2, stop_words='english')),
                ('ridge', Ridge(alpha=1))])

In [125]:
nlp_model.predict(data[100:110])

array([5.32622255, 4.19839414, 4.93299049, 3.52243758, 1.73235375,
       5.06337603, 3.81810765, 3.98456226, 3.39986864, 4.22608942])

In [161]:
bag_of_words_model = nlp_model

bag_of_words_model.fit(data, stars)

Pipeline(steps=[('to_text', ToText()),
                ('count_vectorizer',
                 CountVectorizer(max_df=0.4, min_df=20, stop_words='english')),
                ('ridge', Ridge(alpha=1))])

### I'll now consider both single words and pairs of consecutive words that appear. I use a vectorizer that applies some sort of normalization, e.g., the TfidfVectorizer a word count vectorizer combined with TfidfTransformer.

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=300, ngram_range=(2,2), stop_words='english', min_df=20, max_df=0.6)

text_t = cv.fit_transform(new)

In [None]:
print(cv.get_feature_names_out())

In [9]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidfT = TfidfTransformer()

In [18]:
tfidfT.fit_transform(text_t)

<253272x300 sparse matrix of type '<class 'numpy.float64'>'
	with 546620 stored elements in Compressed Sparse Row format>

In [72]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import Ridge


bigram_model = Pipeline([
    ('to_text', to_text),
    ('count_vectorizer', CountVectorizer(max_features=6000, stop_words='english', min_df=10, max_df=0.8)),
    ('tfidf', TfidfTransformer()),
    ('ridge', Ridge(alpha=1))
])

In [73]:
bigram_model.fit(data, stars)

Pipeline(steps=[('to_text', ToText()),
                ('count_vectorizer',
                 CountVectorizer(max_df=0.8, max_features=6000, min_df=10,
                                 stop_words='english')),
                ('tfidf', TfidfTransformer()), ('ridge', Ridge(alpha=1))])

In [74]:
bigram_model.score(data, stars)

0.6157500905145326

### I now want to determine the most "polarizing words" in the corpus of reviews.  In other words, I'd like to identify words that strongly signal whether a review is either positive or negative.  For example, we understand that a word like "terrible" will most likely appear in negative rather than positive reviews. I'll use Naive Bayes model to achieve this.

In [23]:
import pandas as pd


In [24]:
data = pd.DataFrame(data);data
data2 = data[data['stars'].isin([1,5])]
stars = data2['stars']
text = data2['text']


In [7]:
len(stars) == len(text)

True

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
ng_tfidf = TfidfVectorizer(stop_words='english')

In [9]:
tdi_text = ng_tfidf.fit(text)

In [13]:
import numpy as np

In [18]:
from sklearn.naive_bayes import MultinomialNB
multinom = MultinomialNB()

In [None]:
bayes_model = multinom.fit(text, stars)

In [None]:
## with pipeline

In [27]:
from sklearn.base import BaseEstimator
import pandas as pd
from sklearn.base import TransformerMixin

class ToText(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = pd.DataFrame(X)
        X2 = X[X['stars'].isin([1,5])]
        return X2['text']

In [36]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer


vect_text = Pipeline([
    ('to_text', ToText()),
    ('tfidfvectorizer', TfidfVectorizer(stop_words='english')),
])

In [43]:
vectorized = vect_text.fit_transform(data)

In [114]:
from sklearn.naive_bayes import MultinomialNB
bayes_model = MultinomialNB(alpha=0.1).fit(vectorized, stars)

In [115]:
bayes_model.feature_log_prob_

array([[ -7.23160812,  -9.09214586, -14.28867132, ..., -13.23571879,
        -13.23571879, -13.23571879],
       [ -8.32069496,  -9.78023682, -14.91158699, ..., -15.433933  ,
        -15.433933  , -15.433933  ]])

In [116]:
log_prob = bayes_model.feature_log_prob_

In [117]:
polarity1 = log_prob[1]/log_prob[0]

In [118]:
words = ng_tfidf.get_feature_names_out()

In [119]:
words

array(['00', '000', '0000', ..., '髒桌面', '鹹豆漿除了一两个米粒大的肉鬆几乎只是油條和豆漿',
       '麻辣3拼新价格'], dtype=object)

In [None]:
words_probs = list(zip(words, polarity1))


In [None]:
words_probs = sorted(words_probs, key=lambda x: x[1])


In [124]:
pos_words = words_probs[:25]

In [125]:
polarity2 = log_prob[0]/log_prob[1]

In [126]:
words_probs2 = list(zip(words, polarity2))

In [None]:
words_probs2 = sorted(words_probs2, key=lambda x: x[1])

In [129]:
neg_words = words_probs2[:25]

In [None]:
polars =  pos_words + neg_words


In [133]:
polar_words = []
for i in range(len(polars)):
    polar_words.append(polars[i][0])

Look over all reviews of restaurants.  You can determine which businesses are restaurants by looking in the `yelp_train_academic_dataset_business.json.gz` file from the ml project or downloaded below.

In [5]:
!aws s3 sync s3://dataincubator-course/mldata/ . --exclude '*' --include 'yelp_train_academic_dataset_business.json.gz'

In [6]:
with gzip.open('yelp_train_academic_dataset_business.json.gz') as f:
    business_data = [json.loads(line) for line in f]

In [7]:
import pandas as pd
business_data = pd.DataFrame(business_data)
valid_rows = business_data[business_data['categories'].apply(lambda x: 'Restaurants' in x)]

Each row of this file corresponds to a single business.  The category key gives a list of categories for each; take all where "Restaurants" appears.

In [9]:
restaurant_ids = valid_rows['business_id']

In [10]:
# Look at the categories to check for spelling and capitalization
grader.check(len(restaurant_ids) == 12876)

True

The "business_id" here is the same as in the review data.  Use this to extract the review text for all reviews of restaurants.

In [11]:
data = pd.DataFrame(data)
data2 = valid_rows.merge(data, how='left', on='business_id')
data2 = data2.dropna(subset=['text'])
restaurant_reviews = data2['text']
restaurant_reviews

0         If you like lot lizards, you'll love the Pine ...
1         Only went here once about a year and a half ag...
2         Ate a Saturday morning breakfast at the Pine C...
3         This is definitely not your usual truck stop. ...
4         I like this location better than the one near ...
                                ...                        
144941    Barely open less than a week and I've been her...
144942    Healthy Food that Keeps this Realtor on the Go...
144943    So happy to have this healthy eatery option ri...
144945    My new favorite restaurant.  They have 22 diff...
144946    GreAt food awesome service . The best fish in ...
Name: text, Length: 143361, dtype: object

In [79]:
# Just reviews of restaurants
# restaurant_ids is helpful here
grader.check(len(restaurant_reviews) == 143361)

True

### I'll now find collocations --- that is, bigrams that are "special" and appear more often than we'd expect from chance. We can think of the corpus as defining an empirical distribution over all *n*-grams.  We can find word pairs that are unlikely to occur consecutively based on the underlying probability of their words. Mathematically, if $p(w)$ be the probability of a word $w$ and $p(w_1 w_2)$ is the probability of the bigram $w_1 w_2$, then we want to look at word pairs $w_1 w_2$ where the statistic

  $$ \frac{p(w_1 w_2)}{p(w_1) p(w_2)} $$

### I'll also use smoothing parameter. 


In [13]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(ngram_range=(1,2), stop_words='english', min_df=10)


In [14]:
reviews_t = cv.fit_transform(restaurant_reviews); reviews_t

<143361x121409 sparse matrix of type '<class 'numpy.int64'>'
	with 10425911 stored elements in Compressed Sparse Row format>

In [15]:
word_list = cv.get_feature_names_out()

In [16]:
word_count = reviews_t.sum(axis=0)
word_count = word_count.tolist()
word_count = [val for sublist in word_count for val in sublist]

In [17]:
grams = dict(zip(word_list, word_count))
grams = {k: v for k, v in sorted(grams.items(), key=lambda item: item[1], reverse=True)}

In [19]:
sum(grams.values())

11827661

In [None]:
grams_prob = [i/11827661 for i in grams.values()]
grams_with_probs = list(zip(grams, grams_prob)); grams_with_probs

### Laplace Smoothing

In [107]:
grams_smooth = [ (i + 1) / (11827661 + i*1) for i in grams.values()]


### Looking at bigrams only 

In [22]:
cv2 = CountVectorizer(stop_words='english',ngram_range=(2,2), min_df=10)

In [23]:
reviews_t2 = cv2.fit_transform(restaurant_reviews)
word_list2 = cv2.get_feature_names_out()

In [26]:
word_count2 = reviews_t2.sum(axis=0)
word_count2 = word_count2.tolist()
word_count2 = [val for sublist in word_count2 for val in sublist]

In [None]:
bigrams = dict(zip(word_list2, word_count2))
bigrams = {k: v for k, v in sorted(bigrams.items(), key=lambda item: item[1], reverse=True)}
bigrams_prob = [i/11827661 for i in bigrams.values()]
bigrams_with_probs = list(zip(bigrams, bigrams_prob)); bigrams_with_probs

### Looking at each word of the bigrams

In [None]:
my_list = []

for i in range(len(bigrams_with_probs)):
    my_list.append(bigrams_with_probs[i][0].split())
my_list

In [84]:
my_list2 = []

for i in range(len(my_list)):
    my_list2.append(my_list[i][0])

In [None]:
first_part = []
for i in my_list2:
    first_part.append(grams[i]/11827661)
first_part    
  

In [None]:
## smoothing
first_part = [(grams[i] + 10) / (11827661 + grams[i]*10) for i in my_list2]; first_part


In [86]:
my_list4 = []

for i in range(len(my_list)):
    my_list4.append(my_list[i][1])

In [None]:
second_part = []
for i in my_list4:
    second_part.append(grams[i]/11827661)
second_part

In [None]:
## smoothing
second_part = [(grams[i] + 10) / (11827661 + grams[i]*10) for i in my_list4]; second_part

In [None]:
full = list(zip(first_part, second_part))
full2 = []
for elem in full:
    temp = elem[0]*elem[1]
    full2.append(temp)
full2
## negative values here . needs smoothing

In [None]:
final = list(zip(bigrams_prob, full2)); final

In [123]:
ratios = []
for elem in final:
    temp = elem[0] / elem[1]
    ratios.append(temp)
bigrams_ratios = list(zip(bigrams, ratios))
the_list = sorted(bigrams_ratios, key=lambda x: x[1], reverse=True)
top100 = the_list[:100]


In [None]:
top = []
for elem in top100:
    temp = elem[0]
    top.append(temp)
top

In [None]:

top100 = ['haricot vert'] * 100