In [1]:
import pandas as pd
import numpy as np
from nltk import RegexpTokenizer
from nltk.corpus import stopwords

import nltk
nltk.download('stopwords')

import gensim

from sklearn.cluster import KMeans
from sklearn import metrics
import pylab as pl
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings('ignore')

import emoji
import regex as re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics import pairwise_distances


import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/natalie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the cleaned reviews. The path is relative, so it resolves against the
# kernel's current working directory.
reviews_csv_path = 'cleaning/cleaned_reviewer_reviews.csv'
reviewer_reviews_df = pd.read_csv(reviews_csv_path, index_col=0)
# Replace whatever index was stored in the file with a fresh 0..n-1 range.
reviewer_reviews_df = reviewer_reviews_df.reset_index(drop=True)
reviewer_reviews_df.head(2)

FileNotFoundError: [Errno 2] No such file or directory: 'cleaning/cleaned_reviewer_reviews.csv'

## Word2Vec

In [None]:
# Shared tokenizer: keeps every run of word characters (regex \w+) as a token,
# so punctuation and whitespace are dropped.
tokenizer = RegexpTokenizer(r'\w+')

def nlp_clean(data, tokenize=None):
    '''
    Lower-case and tokenize every document in `data`.

    data - iterable of documents (strings); entries that are not strings
           (e.g. NaN/None left behind by missing text) produce an empty
           token list, so the output stays aligned with the input rows
    tokenize - optional callable mapping a string to a list of tokens;
               defaults to the module-level `tokenizer.tokenize`

    returns a list with one token list per input document, in input order
    '''
    if tokenize is None:
        # Resolved at call time so the notebook-level tokenizer is used by default.
        tokenize = tokenizer.tokenize

    new_data = []
    for d in data:
        if not isinstance(d, str):
            # Missing text has no .lower(); keep the row, with no tokens.
            new_data.append([])
            continue
        new_data.append(tokenize(d.lower()))
    return new_data

class LabeledLineSentence:
    '''
    Iterable that pairs each document with its label.

    Iterating yields one gensim TaggedDocument per entry of doc_list, tagged
    with the entry of labels_list found at the same position.
    '''

    def __init__(self, doc_list, labels_list):
        self.doc_list = doc_list
        self.labels_list = labels_list

    def __iter__(self):
        for position, document in enumerate(self.doc_list):
            tag = self.labels_list[position]
            yield gensim.models.doc2vec.TaggedDocument(document, [tag])

In [None]:
# Tokenize the 'cleaned_text' column with nlp_clean: the result holds one
# token list per review, in the DataFrame's row order.
tokenized_reviews = nlp_clean(reviewer_reviews_df['cleaned_text'])

In [None]:
# Detect frequent two-word collocations in the tokenized reviews.
bigram = gensim.models.Phrases(tokenized_reviews) # higher threshold fewer phrases.
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
sentences = bigram_mod[tokenized_reviews]

# Learning-rate schedule: start at `alpha` and decay linearly to `min_alpha`
# over the whole train() call. The earlier settings (min_alpha equal to alpha,
# then start_alpha=0.002 / end_alpha=-0.016 passed to train()) drove the
# learning rate below zero for most of training.
# NOTE(review): `seed` alone may not make runs reproducible when several
# worker threads are used — see the gensim Word2Vec docs and confirm.
model = gensim.models.Word2Vec(vector_size=1000, min_count=5, alpha=0.025, min_alpha=0.0001, seed=123) #### TO TUNE
model.build_vocab(sentences)
# No start_alpha/end_alpha overrides: train() uses the constructor's schedule.
model.train(sentences, total_examples=model.corpus_count, epochs=10)


## Kmeans

In [None]:
# Split the word vectors into two clusters (intended as positive vs negative).
# random_state is an explicit integer seed; the previous boolean `True` was
# presumably being interpreted as the integer 1, so results should be unchanged.
kmeans = KMeans(n_clusters=2, max_iter=1000, random_state=1, n_init=50).fit(X=model.wv.vectors.astype('double'))

In [None]:
# Ten vocabulary words closest to the centre of cluster 1 (originally labelled "positive words").
# NOTE(review): positive_cluster_index is set to 0 further down, which contradicts
# this label — check the output and confirm which cluster is actually positive.
model.wv.similar_by_vector(kmeans.cluster_centers_[1], topn=10, restrict_vocab=None)

In [None]:
# Ten vocabulary words closest to the centre of cluster 0 (originally labelled "negative words").
# NOTE(review): positive_cluster_index is set to 0 further down, which contradicts
# this label — check the output and confirm which cluster is actually negative.
model.wv.similar_by_vector(kmeans.cluster_centers_[0], topn=10, restrict_vocab=None)

In [None]:
# Index of the K-means cluster treated as "positive"; the other of the two
# clusters (1 - index) is treated as "negative".
# NOTE(review): the inspection cells above label cluster 1 as positive and
# cluster 0 as negative, which contradicts the value chosen here. Cluster
# numbering is arbitrary, so set this from the inspected nearest words — confirm.
positive_cluster_index = 0
positive_cluster_center = kmeans.cluster_centers_[positive_cluster_index]
negative_cluster_center = kmeans.cluster_centers_[1-positive_cluster_index]

In [None]:
# One row per vocabulary word: the token, its embedding, and the index of the
# K-means cluster it is assigned to.
words = pd.DataFrame({'words': model.wv.index_to_key})
words['vectors'] = [model.wv[word] for word in model.wv.index_to_key]
# Assign clusters in a single batched call, on a matrix with the same
# double-precision dtype the model was fitted on, instead of one
# kmeans.predict call per word.
words['cluster'] = kmeans.predict(np.vstack(words['vectors'].to_numpy()).astype('double'))

In [None]:
# +1 for words in the positive cluster, -1 for words in the other cluster.
words['cluster_value'] = np.where(words.cluster == positive_cluster_index, 1, -1)
# Closeness = inverse of the distance to the nearest cluster centre, computed
# for all words in one kmeans.transform call rather than one call per row.
word_vector_matrix = np.vstack(words.vectors.to_numpy()).astype('double')
words['closeness_score'] = 1 / kmeans.transform(word_vector_matrix).min(axis=1)
# Signed sentiment weight: magnitude from closeness, sign from the cluster.
words['sentiment_coeff'] = words.closeness_score * words.cluster_value

In [None]:
# Persist the word -> sentiment coefficient table (without the index column)
# so the scoring section below can reload it from disk.
words[['words', 'sentiment_coeff']].to_csv('sentiment_dictionary.csv', index=False)

## Document Sentiments

In [None]:
# Reload the word -> sentiment coefficient table written earlier.
# Words are read strictly as strings with default NA detection turned off;
# otherwise vocabulary tokens such as "nan" or "null" would be parsed as
# missing values and could never be matched during lookup.
sentiment_map = pd.read_csv('sentiment_dictionary.csv', dtype={'words': str}, keep_default_na=False)
sentiment_dict = dict(zip(sentiment_map.words.values, sentiment_map.sentiment_coeff.values))

In [None]:
# Work on a copy so the columns added below do not modify reviewer_reviews_df.
file_weighting = reviewer_reviews_df.copy()
# Preview only the first rows instead of rendering the whole frame.
file_weighting.head()

In [None]:
# Keep the raw token lists next to each review, then store the phrase-merged
# tokens as a single space-separated string per review.
file_weighting['tokenized'] = tokenized_reviews
file_weighting['bigram'] = file_weighting['tokenized'].apply(
    lambda tokens: ' '.join(bigram_mod[tokens])
)

In [None]:
# The 'bigram' strings are already tokenized, so split on whitespace only;
# norm=None keeps raw (un-normalized) tf-idf weights.
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
tfidf.fit(file_weighting.bigram)
# get_feature_names() has been removed from scikit-learn;
# get_feature_names_out() is its replacement.
features = pd.Series(tfidf.get_feature_names_out())
transformed = tfidf.transform(file_weighting.bigram)

In [None]:
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    create dictionary for each input sentence x, where each word has assigned its tfidf score

    inspired  by function from this wonderful article: 
    https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34

    x - row of dataframe, containing sentences, and their indexes;
        x.name is used to select the matching row of transformed_file
    transformed_file - all sentences transformed with TfidfVectorizer (sparse matrix)
    features - pandas Series with the names of all words in corpus used in
               TfidfVectorizer, positionally aligned with transformed_file's columns

    returns a dict mapping each word with a stored entry in that row to its tfidf score
    '''
    vector_coo = transformed_file[x.name].tocoo()
    # Look the column indices up in `features` without writing the labels back
    # into the COO matrix: its `col` attribute is an index array, and SciPy
    # versions that coerce it to an integer dtype reject string labels.
    row_terms = features.iloc[vector_coo.col].values
    return dict(zip(row_terms, vector_coo.data))

def replace_tfidf_words(x, transformed_file, features):
    '''
    replace each word of the sentence with its tfidf score

    x - row of dataframe, containing sentences, and their indexes
        (the sentence is read from x.bigram as a space-separated string),
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    returns a list of tfidf scores, one per word of x.bigram, in sentence order;
    raises KeyError if a word has no tfidf entry for this row
    '''
    dictionary = create_tfidf_dictionary(x, transformed_file, features)
    # No per-row debug printing: this runs once for every review.
    return [dictionary[word] for word in x.bigram.split()]

In [None]:
# Slow step: the original note puts it at around 3-4 minutes to calculate.
replaced_tfidf_scores = file_weighting.apply(
    replace_tfidf_words, axis=1, args=(transformed, features)
)

In [None]:
def replace_sentiment_words(word, sentiment_dict):
    '''
    look up the sentiment score of a single word

    word - token to score
    sentiment_dict - mapping from word to sentiment score

    returns the stored score, or 0 when the word is not in sentiment_dict
    '''
    try:
        return sentiment_dict[word]
    except KeyError:
        # Unknown words contribute nothing to the sentence score.
        return 0

In [None]:
# For every review, turn its space-separated tokens into the list of their
# sentiment scores (0 for tokens missing from sentiment_dict).
replaced_closeness_scores = file_weighting.bigram.apply(
    lambda sentence: [replace_sentiment_words(token, sentiment_dict) for token in sentence.split()]
)

In [None]:
# Assemble one row per review: per-token sentiment scores, per-token tf-idf
# weights, the phrase-merged sentence, and the review metadata.
replacement_sources = [
    replaced_closeness_scores,
    replaced_tfidf_scores,
    file_weighting.bigram,
    file_weighting.reviewer,
    file_weighting.review,
    file_weighting.link,
]
replacement_df = pd.DataFrame(data=replacement_sources).T
replacement_df.columns = ['sentiment_coeff', 'tfidf_scores', 'sentence', 'reviewer', 'review', 'link']
# Review score = dot product of the sentiment scores with the tf-idf weights.
replacement_df['sentiment_rate'] = replacement_df.apply(
    lambda row: np.dot(np.array(row['sentiment_coeff']), np.array(row['tfidf_scores'])),
    axis=1,
)
# 1 when the score is strictly positive, otherwise 0.
replacement_df['prediction'] = replacement_df['sentiment_rate'].gt(0).astype('int8')

In [None]:
# example of negative review: the review at position 800 among those predicted 0
replacement_df.loc[replacement_df['prediction'] == 0, 'review'].iloc[800]

In [None]:
# example of positive review: the review at position 700 among those predicted 1
replacement_df.loc[replacement_df['prediction'] == 1, 'review'].iloc[700]

## Accuracy of Recommendations

In [None]:
# Restrict to one reviewer, then show the reviews predicted positive.
reviewer_df = replacement_df.loc[replacement_df['reviewer'] == 'alamakgirl']
reviewer_df.loc[reviewer_df['prediction'] == 1]  # alamakgirl likes these

In [None]:
# Show this reviewer's row(s) for one specific venue link.
reviewer_df.loc[reviewer_df['link'] == 'https://www.burpple.com/koh-grill-sushi-bar?bp_ref=%2Ff%2FZlJD45Z0']