In [None]:
import os
import json
import string
import nltk
import pickle

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
from sklearn import metrics
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.externals import joblib
import pandas as pd
from sklearn.preprocessing import scale
from sklearn.linear_model import SGDClassifier
import sklearn.preprocessing as preprocessing
from lightgbm import LGBMClassifier

In [None]:
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [None]:
from sklearn.cross_validation import train_test_split
from gensim.models.word2vec import Word2Vec
import gensim

In [None]:
import numpy
import pandas
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

In [None]:
# Shared experiment settings used by the cells below.
size = 300            # dimensionality of the Word2Vec vectors / averaged text vectors
scoring = 'accuracy'  # metric name handed to cross_val_score
seed = 7              # random_state value passed to estimators and splitters

In [None]:
# Build one feature vector per text by averaging the vectors of its in-vocabulary words (scaling is applied by the caller)
def buildWordVector(text, size, model):
    """Average the word vectors of the tokens in ``text``.

    Args:
        text: Iterable of tokens (words).
        size: Dimensionality of each word vector.
        model: Source of word vectors. If the object exposes a ``wv``
            attribute, vectors are looked up there; otherwise the object
            itself is indexed as ``model[word]``. This keeps plain mappings
            working while also accepting models that are not subscriptable
            themselves.

    Returns:
        numpy array of shape ``(1, size)`` holding the mean of the vectors of
        all tokens that were found; all zeros when no token was found.
    """
    # Prefer the keyed-vector store when the model has one.
    vectors = getattr(model, 'wv', model)
    vec = np.zeros((1, size))
    count = 0
    for word in text:
        try:
            vec += vectors[word].reshape((1, size))
        except KeyError:
            # Out-of-vocabulary tokens do not contribute to the average.
            continue
        count += 1
    if count:
        vec /= count
    return vec

In [None]:
def getTime(date):
    """Convert a ``'YYYY-MM-DD'`` string to approximate seconds since 1970-01-01.

    The conversion uses fixed factors (3.154e7 seconds per year, 2.628e6 per
    month, 86400 per day) instead of calendar arithmetic, so the result is an
    approximation.

    Args:
        date: String whose first three ``'-'``-separated fields are the
            year, month and day as integers.

    Returns:
        float: Approximate number of seconds elapsed since 1970-01-01.
    """
    parts = date.split('-')
    years_elapsed = int(parts[0]) - 1970
    months_elapsed = int(parts[1]) - 1
    days_elapsed = int(parts[2]) - 1
    return years_elapsed * 3.154e7 + months_elapsed * 2.628e6 + days_elapsed * 86400

In [None]:
# Lemmatizer instance (NLTK WordNet).
wordnet_lemmatizer = WordNetLemmatizer()

# Prefix prepended to the input file name; empty means the path is relative
# to the current working directory.
root_dir = ""

# Parse the whole review file as a single JSON document.
with open(root_dir + 'yelp.json') as data_file:
    yelp = json.load(data_file)

In [None]:
# Tabular view of the parsed JSON records.
yelp_df = pd.DataFrame(data=yelp)

In [None]:
# Work on the full dataset. This is an alias, not a copy: both names refer to
# the same DataFrame object.
yelp_df_subset = yelp_df

## Train - Test Split

In [None]:
# Target: the star rating of each review.
yelp_df_y = yelp_df_subset['stars']

# Features: every column except the target, the date and the review id.
dropped_columns = ['stars', 'date', 'review_id']
yelp_df_x = yelp_df_subset.drop(columns=dropped_columns)
#yelp_df_x['date'] = scale(yelp_df_x['date'].apply(getTime))

# Replace each review text with the output of gensim's simple_preprocess
# (consumed further down as a sequence of words).
preprocessed_text = yelp_df_x['text'].apply(gensim.utils.simple_preprocess)
yelp_df_x['text'] = preprocessed_text

In [None]:
# Keep only the object-dtype columns, leaving out the review text.
object_columns = yelp_df_x.select_dtypes(include=[object])
X = object_columns.drop(columns=['text'])

In [None]:
# Integer-encode the categorical ID columns.


# 1. INSTANTIATE
# encode labels with value between 0 and n_classes-1.
le = preprocessing.LabelEncoder()


# 2/3. FIT AND TRANSFORM
# Select the two ID columns by name, in a fixed order, before encoding. The
# rename below is positional, so relying on whatever column order X happens
# to have could silently swap 'bid' and 'uid' (or fail if X carries any other
# object column). df.apply() re-fits le on each column in turn.
X_2 = X[['business_id', 'user_id']].apply(le.fit_transform)
X_2.columns = ['bid', 'uid']

In [None]:
# Append the encoded ID columns, then discard the raw string IDs they replace.
yelp_df_x = (
    pd.concat([yelp_df_x, X_2], axis=1)
    .drop(columns=['business_id', 'user_id'])
)

In [None]:
# Preview the first rows of the assembled feature frame.
yelp_df_x.head()

In [None]:
# Split into Train / Test set (80% / 20%).
# random_state=seed makes the split, and every number reported below,
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    yelp_df_x, yelp_df_y, test_size=0.2, random_state=seed)

In [None]:
# Build the vocabulary and fit Word2Vec using the training texts only.
w2v_params = dict(size=size, window=10, min_count=2, workers=10)
model = gensim.models.Word2Vec(x_train['text'], **w2v_params)

In [None]:
# Train the Word2Vec model for 10 epochs over the training texts.
# NOTE(review): the constructor call above was already handed
# x_train['text']; confirm this extra pass is intended rather than redundant.
model.train(x_train['text'], total_examples=len(x_train), epochs=10)

In [None]:
# One averaged (1, size) vector per training text, joined row-wise into a
# single matrix, then passed through scale().
train_rows = [buildWordVector(tokens, size, model) for tokens in x_train['text']]
train_vecs = np.concatenate(train_rows)
train_vecs_scaled = scale(train_vecs)

In [None]:
# Continue training the same Word2Vec model on the test texts.
# NOTE(review): train_vecs were built before this call and test_vecs are
# built after it, so train and test features presumably come from different
# model states -- confirm this is intended.
model.train(x_test['text'], total_examples=len(x_test), epochs=10)

In [None]:
# Swap the token lists for the scaled vectors. Rows are renumbered from 0
# first so the positional concat with the vector frame lines up.
train_meta = x_train.reset_index().drop(columns=['index', 'text'])
train_vec_frame = pd.DataFrame(train_vecs_scaled)
x_train = pd.concat([train_meta, train_vec_frame], axis=1)

# Labels as a flat NumPy array.
y_train_frame = y_train.reset_index().drop(columns=['index'])
y_train = y_train_frame.values.ravel()

In [None]:
# Build test vectors, then scale them with the TRAINING set's statistics.
# Standardizing the test set with its own mean/std would put train and test
# features on different scales and leak test-set statistics into the
# evaluation. A scaler fitted on train_vecs reproduces the transformation
# that scale(train_vecs) applied to the training vectors.
test_vecs = np.concatenate([buildWordVector(z, size, model) for z in x_test['text']])
test_vecs_scaled = preprocessing.StandardScaler().fit(train_vecs).transform(test_vecs)

In [None]:
# Swap the token lists for the scaled vectors. Rows are renumbered from 0
# first so the positional concat with the vector frame lines up.
test_meta = x_test.reset_index().drop(columns=['index', 'text'])
test_vec_frame = pd.DataFrame(test_vecs_scaled)
x_test = pd.concat([test_meta, test_vec_frame], axis=1)

# Labels as a flat NumPy array.
y_test_frame = y_test.reset_index().drop(columns=['index'])
y_test = y_test_frame.values.ravel()

In [None]:
# Linear classifier with log loss and an L1 penalty, fitted by SGD.
# random_state=seed fixes the optimizer's randomness so the fitted model and
# the test accuracy reported below are reproducible.
lr = SGDClassifier(loss='log', penalty='l1', random_state=seed)
lr.fit(x_train, y_train)

In [None]:
# Mean accuracy of the SGD classifier on the held-out test set.
test_accuracy = lr.score(x_test, y_test)
print('Test Accuracy: {}'.format(test_accuracy))

In [None]:
# Candidate classifiers to compare, as (short name, unfitted estimator) pairs.
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    #('SVM', SVC()),
]

# 10-fold cross-validation on the training set. shuffle=True is needed for
# random_state to have any effect on the folds (scikit-learn ignores or
# rejects random_state on an unshuffled KFold, depending on version). With a
# fixed integer seed every model is scored on the same folds.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)

# evaluate each model in turn
results = []
names = []
# The loop variable is named clf rather than model so it does not overwrite
# the Word2Vec model bound to `model` earlier in the notebook.
for name, clf in models:
    cv_results = model_selection.cross_val_score(clf, x_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
# Compare algorithms: one box per model, built from its cross-validation scores.
fig = plt.figure()
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names)
fig.suptitle('Algorithm Comparison')
plt.show()

In [None]:
# Fit a LightGBM classifier on the training data.
model = LGBMClassifier(n_jobs=-1, random_state=seed)
model.fit(x_train, y_train)
# Make predictions for the test data. The classifier's predict() returns
# class labels, so they are compared to y_test as-is; rounding them was
# unnecessary and would fail for non-numeric labels.
y_pred = model.predict(x_test)
predictions = y_pred
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("LGBM Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# Map the class values in y_train to integer codes.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(y_train)
# Expand the integer codes into dummy variables (i.e. one hot encoded).
dummy_y = np_utils.to_categorical(encoded_Y)

In [None]:
# define baseline model
def baseline_model(input_dim=305, n_classes=3):
    """Build and compile a single-hidden-layer softmax classifier.

    Args:
        input_dim: Number of input features per sample; must equal the number
            of columns in the feature matrix the model is fitted on. Defaults
            to 305, the value that was previously hard-coded.
        n_classes: Number of output units; must equal the number of columns
            in the one-hot encoded target. Defaults to 3, the value that was
            previously hard-coded.

    Returns:
        A compiled Keras ``Sequential`` model (categorical cross-entropy loss,
        Adam optimizer, accuracy metric).
    """
    # Hidden layer of 32 ReLU units followed by a softmax output layer.
    net = Sequential()
    net.add(Dense(32, input_dim=input_dim, activation='relu'))
    net.add(Dense(n_classes, activation='softmax'))
    net.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net

In [None]:
# Shuffled 10-fold cross-validation of the Keras baseline on the training data.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=1)
results = cross_val_score(estimator, x_train, dummy_y, cv=kfold)

# Report mean and standard deviation of the fold scores as percentages.
mean_pct = results.mean()*100
std_pct = results.std()*100
print("Baseline: %.2f%% (%.2f%%)" % (mean_pct, std_pct))

In [None]:
# Persist the training features and the one-hot encoded labels.
# The context managers flush and close each file even if pickling raises;
# the bare open() calls used before never closed their handles.
with open("xtrain.pickle.dat", "wb") as xtrain_file:
    pickle.dump(x_train, xtrain_file)
with open("ytrain.pickle.dat", "wb") as ytrain_file:
    pickle.dump(dummy_y, ytrain_file)