In [None]:
%matplotlib inline

from glob import glob
import plotly.graph_objs as go
import plotly.plotly as py
from plotly import tools
import json
import matplotlib.pyplot as plt
import numpy as np
import nltk
from nltk.corpus import stopwords
import os.path as path
import pandas as pd
import random
import seaborn as sns
import sklearn
from sklearn.feature_extraction.text import CountVectorizer as count_vectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import svm
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB as multinomial_nb
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from scipy.sparse import csr_matrix
import string
import time
import operator
from sklearn.linear_model import LogisticRegression
from wordcloud import WordCloud
import keras
from keras.datasets import mnist
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dropout, Flatten, Dense
from keras.layers import Activation, Conv2D, MaxPooling2D, AveragePooling2D
from keras.optimizers import SGD
from keras import backend as K
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [None]:
def distribute_data(yelp, party_size=1000, random_state=None):
    """Split the review frame into two parties of sampled reviews.

    Party 0 is a random sample of all reviews. Party 1 is a random sample
    drawn only from 4- and 5-star reviews.

    NOTE(review): the original planning comment described party 1 as
    "only 1 & 5 reviews", but the code selects 4 & 5. The code's behaviour
    is kept here -- confirm which one the experiment intends.

    Parameters
    ----------
    yelp : pandas.DataFrame
        Reviews with a ``stars`` column.
    party_size : int, default 1000
        Number of reviews sampled for each party.
    random_state : int, numpy.random.RandomState or None, default None
        Forwarded to ``DataFrame.sample``. Pass a value to make the split
        reproducible; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(party_0, party_1)``, each a DataFrame of ``party_size`` rows.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.sample`` when a pool holds fewer than
        ``party_size`` rows (sampling is without replacement).
    """
    # Planned configurations that are NOT implemented yet:
    #   10 parties
    #     party 0 - 5 -> random set of reviews
    #     party 6     -> only 1 & 2 star reviews
    #     party 7     -> only 2 & 3 star reviews
    #     party 8     -> only 3 & 4 star reviews
    #     party 9     -> only 4 & 5 star reviews
    #   n parties
    #     party n     -> random set of reviews

    party_0 = yelp.sample(party_size, random_state=random_state)

    # Restrict the pool to the high ratings before sampling party 1.
    high_rated = yelp[yelp['stars'].isin([4, 5])]
    party_1 = high_rated.sample(party_size, random_state=random_state)

    return (party_0, party_1)

In [None]:
def text_process(text, weak_sentiment_word_list):
    """Tokenize a review into lowercase words with noise words removed.

    Punctuation characters (``string.punctuation``) are stripped first, the
    remainder is split on whitespace, and each lowercased word is kept
    unless it is an NLTK English stopword or appears in
    ``weak_sentiment_word_list``. Order and duplicates are preserved.

    NOTE(review): punctuation is removed before the stopword check, so a
    contraction such as "don't" is compared as "dont" -- confirm the
    stopword list matches those forms.

    Parameters
    ----------
    text : str
        Raw review text.
    weak_sentiment_word_list : iterable of str
        Extra (lowercase) words to drop; may be empty.

    Returns
    -------
    list of str
        The surviving lowercase tokens.
    """
    # Build the exclusion set once per call. Previously the stopword corpus
    # was re-read for every single word, and both membership checks were
    # linear scans over lists.
    excluded = set(stopwords.words('english'))
    excluded.update(weak_sentiment_word_list)

    nopunc = ''.join(char for char in text if char not in string.punctuation)

    word_list = []
    for word in nopunc.split():
        word = word.lower()
        if word not in excluded:
            word_list.append(word)

    return word_list

In [None]:
def normalize_dataset(yelp):
    """Balance the reviews so every star rating 1-5 is equally represented.

    The rating with the fewest reviews sets the quota; that many rows are
    randomly sampled from each rating and the samples are concatenated in
    ascending star order.

    Parameters
    ----------
    yelp : pandas.DataFrame
        Reviews with a ``stars`` column.

    Returns
    -------
    pandas.DataFrame
        The balanced subset (empty if any rating 1-5 has no reviews).
    """
    # One sub-frame per rating, lowest rating first.
    per_rating = [yelp[yelp['stars'] == stars] for stars in range(1, 6)]

    # The rarest rating caps how many rows each rating may contribute.
    limiting_factor = min(len(group) for group in per_rating)

    return pd.concat([group.sample(limiting_factor) for group in per_rating])

In [None]:
def generate_string(yelp, rating):
    """Concatenate all tokens of the reviews with a given star rating.

    Parameters
    ----------
    yelp : pandas.DataFrame
        Reviews with a ``stars`` column and a ``tokenized`` column holding
        a list of string tokens per review.
    rating : int
        Star rating whose reviews are selected.

    Returns
    -------
    str
        Every token of every matching review, in row order, separated by
        single spaces. Returns ``''`` when no review matches.
    """
    rated = yelp[yelp['stars'] == rating]

    # Join directly instead of round-tripping through a pandas Series: this
    # avoids shadowing the imported `string` module with a local, and it
    # returns '' for an empty selection, where the `.str` accessor on an
    # empty Series can fail depending on the pandas version.
    return ' '.join(token for tokens in rated['tokenized'] for token in tokens)

In [None]:
def generate_list(string_count, length):
    """Return the first element of each of the first ``length`` entries.

    Parameters
    ----------
    string_count : iterable
        Indexable entries (e.g. ``(word, count)`` pairs), already ordered.
    length : int
        Maximum number of entries to take; zero or negative yields ``[]``.

    Returns
    -------
    list
        ``entry[0]`` for the leading entries, in their original order.
    """
    leading = []

    for position, entry in enumerate(string_count):
        # Stop as soon as the quota is filled.
        if position >= length:
            break
        leading.append(entry[0])

    return leading

In [None]:
def generate_weak_sentiment_list(yelp):
    """Find frequent words shared by 1-star and 5-star reviews.

    Words are counted separately over the 1-star and the 5-star reviews
    and ranked by descending frequency. The top ``length`` words of each
    ranking are compared, where ``length`` is 0.1% of the mean of the two
    vocabulary sizes (truncated to an int). Words present in both top
    lists are returned.

    Parameters
    ----------
    yelp : pandas.DataFrame
        Reviews with ``stars`` and ``tokenized`` columns.

    Returns
    -------
    list of str
        Shared top words, in the order of the 5-star ranking.
    """
    by_count = operator.itemgetter(1)

    # (word, count) pairs for each extreme rating, most frequent first.
    negative_ranked = sorted(word_count(generate_string(yelp, 1)).items(),
                             key=by_count,
                             reverse=True)
    positive_ranked = sorted(word_count(generate_string(yelp, 5)).items(),
                             key=by_count,
                             reverse=True)

    # Size of the "top words" window for both rankings.
    length = int((len(positive_ranked) + len(negative_ranked)) * 0.001 / 2)

    positive_top = generate_list(positive_ranked, length)
    negative_top = generate_list(negative_ranked, length)

    return [word for word in positive_top if word in negative_top]

In [None]:
def clean_dataset(yelp):
    """Add ``length`` and ``tokenized`` columns and derive the weak-word list.

    The frame is modified IN PLACE (and also returned): ``text`` is cast to
    ``str``, ``length`` receives the character count of each review, and
    ``tokenized`` receives the token list produced by ``text_process``.

    Tokenization runs twice: once without a weak-word filter so that
    ``generate_weak_sentiment_list`` sees the full vocabulary, and once
    more with the resulting list so those words are removed.

    Parameters
    ----------
    yelp : pandas.DataFrame
        Reviews with ``text`` and ``stars`` columns.

    Returns
    -------
    tuple
        ``(yelp, weak_sentiment_list)`` -- the same frame object with the
        added columns, and the list of removed weak-sentiment words.
    """
    yelp['text'] = yelp['text'].astype(str)
    yelp['length'] = yelp['text'].apply(len)

    # Apply over the text column rather than row-wise over the frame: it is
    # faster, and it still yields a column when the frame has no rows
    # (row-wise DataFrame.apply returns a frame in that case, which cannot
    # be assigned to a single column).
    yelp['tokenized'] = yelp['text'].apply(lambda text: text_process(text, []))

    weak_sentiment_list = generate_weak_sentiment_list(yelp)

    # Second pass: re-tokenize with the weak-sentiment words filtered out.
    yelp['tokenized'] = yelp['text'].apply(
        lambda text: text_process(text, weak_sentiment_list))

    return yelp, weak_sentiment_list

In [None]:
def create_class(yelp, boundary):
    """Select the reviews used for classification.

    Parameters
    ----------
    yelp : pandas.DataFrame
        Reviews with a ``stars`` column.
    boundary : bool
        When truthy, keep only the extreme ratings (1 and 5 stars);
        when falsy, keep everything.

    Returns
    -------
    pandas.DataFrame
        ``yelp`` itself (not a copy) when ``boundary`` is falsy, otherwise
        the filtered subset of 1- and 5-star reviews.
    """
    # The former bare `yelp_class.shape` expression was a leftover notebook
    # display with no effect inside a function, so it has been dropped.
    if not boundary:
        return yelp

    return yelp[yelp['stars'].isin([1, 5])]

In [None]:
def generate_X_y(yelp_class):
    """Build the text features and labels for model training.

    Parameters
    ----------
    yelp_class : pandas.DataFrame
        Reviews with a ``tokenized`` column (list of tokens per review)
        and a ``stars`` column.

    Returns
    -------
    tuple
        ``(documents, labels)`` where ``documents`` is a list with one
        space-joined token string per review and ``labels`` is the
        ``stars`` column as a Series.
    """
    token_lists = yelp_class['tokenized']
    labels = yelp_class['stars']

    # Re-assemble each token list into a single whitespace-separated document.
    documents = [' '.join(tokens) for tokens in token_lists]

    return documents, labels

In [None]:
def bow_transformer(X):
    """Convert documents into a bag-of-words count matrix.

    A count vectorizer over unigrams and bigrams is fitted on ``X`` and
    then used to transform that same ``X``. The fitted vectorizer is not
    returned, so its vocabulary is not available to the caller.

    Parameters
    ----------
    X : iterable of str
        The documents to vectorize.

    Returns
    -------
    The document-term matrix produced by the vectorizer's ``transform``.
    """
    vectorizer = count_vectorizer(ngram_range=(1, 2))
    vectorizer.fit(X)

    return vectorizer.transform(X)

In [None]:
def word_count(str):
    """Count how often each whitespace-separated word occurs.

    Parameters
    ----------
    str : str
        Text to split on whitespace. (The parameter name shadows the
        builtin inside this function; it is kept for compatibility.)

    Returns
    -------
    dict
        Maps each word to its number of occurrences, keyed in order of
        first appearance.
    """
    tally = {}

    for token in str.split():
        tally[token] = tally.get(token, 0) + 1

    return tally

In [29]:
# Load the review dataset; the functions above read its 'stars' and 'text' columns.
yelp = pd.read_csv('./dataset/review_small.csv')

In [30]:
# Downsample so every star rating (1-5) contributes the same number of reviews.
# This rebinds `yelp`, so re-running the cell resamples the already balanced frame.
yelp = normalize_dataset(yelp)

In [33]:
# Two parties of 1000 reviews each: a random sample of all reviews, and a
# sample restricted to 4- and 5-star reviews.
yelp_parties = distribute_data(yelp)

In [None]:
# Clean each party's reviews. Every entry is the (DataFrame, weak_sentiment_list)
# tuple returned by clean_dataset, which also adds its columns to `party` in place.
yelp_parties_clean = []

for party in yelp_parties:
    yelp_parties_clean.append(clean_dataset(party))