In [43]:
import json
import csv
import pandas as pd
import os
import numpy as np
import string
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping, ModelCheckpoint
import io
import requests
from sklearn import metrics

In [5]:
import collections
from sklearn import preprocessing
import matplotlib.pyplot as plt
import shutil

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)



# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, collections.Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low


In [6]:
outfile = open("review_stars.tsv", 'w')
sfile = csv.writer(outfile, delimiter ="\t", quoting=csv.QUOTE_MINIMAL)
sfile.writerow(['business_id','stars', 'text'])
with open('yelp_academic_dataset_review.json',encoding="utf-8") as f:    # encoding added
    for line in f:
        row = json.loads(line)
        sfile.writerow([row['business_id'], row['stars'], (row['text']).encode('utf-8')])
outfile.close()

FileNotFoundError: [Errno 2] No such file or directory: 'yelp_academic_dataset_review.json'

In [7]:
df_reviews = pd.read_csv('review_stars.tsv', delimiter ="\t",encoding="utf-8")

EmptyDataError: No columns to parse from file

In [None]:
#json to csv of business file

outfile = open("business.tsv", 'w')
sfile = csv.writer(outfile, delimiter ="\t", quoting=csv.QUOTE_MINIMAL)
sfile.writerow(['business_id','stars', 'categories','review_count'])
with open('yelp_academic_dataset_business.json',encoding="utf-8") as f:
    for line in f:
        row = json.loads(line)
        sfile.writerow([row['business_id'], row['stars'], row['categories'],row['review_count']])
outfile.close()

In [10]:
df_business = pd.read_csv('business.tsv', delimiter ="\t", encoding="utf-8", nrows=10000)

In [29]:
df4= pd.read_csv('review_grouped.tsv')

In [30]:
df_review_business=pd.merge(df_business, df4, on='business_id')

In [31]:
encode_numeric_range(df_review_business,'review_count')

In [32]:
import sklearn.feature_extraction.text as sk_text
vectorizer = sk_text.TfidfVectorizer(stop_words='english',
                            max_features = 500,
                            min_df=1
                            #max_df=5
                            )

#min_df: ignore terms that have a document frequency < min_df.
#max_df: ignore terms that have a document frequency > max_df


matrix = vectorizer.fit_transform(df_review_business['all_reviews'])
print(type(matrix))               # Compressed Sparse Row matrix

tfidf_data = matrix.toarray()     #  convert it to numpy array
pd.set_option('display.max_colwidth',-1)
print(tfidf_data)
print(vectorizer.get_feature_names())

<class 'scipy.sparse.csr.csr_matrix'>
[[0.         0.0530015  0.         ... 0.         0.01715785 0.04262998]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.01986448 0.00542956 0.00323701 ... 0.01065834 0.01757678 0.03056959]
 [0.00267133 0.03285694 0.00652959 ... 0.00238885 0.00945473 0.00293638]
 [0.         0.         0.0257859  ... 0.         0.         0.06957601]]
['00', '10', '15', '20', '30', '50', 'able', 'absolutely', 'actually', 'add', 'ago', 'amazing', 'appointment', 'area', 'arrived', 'ask', 'asked', 'ate', 'atmosphere', 'attentive', 'authentic', 'available', 'average', 'away', 'awesome', 'bacon', 'bad', 'bar', 'bbq', 'beautiful', 'beef', 'beer', 'believe', 'best', 'better', 'big', 'birthday', 'bit', 'bite', 'bought', 'bowl', 'bread', 'breakfast', 'bring', 'brought', 'brunch', 'burger', 'burgers', 'burrito', 'business', 'busy', 'buy', 'cafe', 'cake', 'called', 'came

In [33]:
tfidf_data.shape

(10000, 500)

In [34]:
#Converting review_count in Numpy Array
review_array=df_review_business['review_count'].values
review_array.shape

(10000,)

In [35]:
#Converting review_count array into vertical stack
vertical_review=np.vstack(review_array)
#Concatenating review_text tfidf array and review_count array
x = np.concatenate((tfidf_data, vertical_review), axis=1)
x.shape

(10000, 501)

In [36]:
y=df_review_business['stars']
y.shape

(10000,)

In [37]:
b_id = df_business['business_id'][0:5]
b_id
testx = x[0:5]
x=x[5:10000]
testy = y[0:5]
y=y[5:10000]

In [38]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Plot a confusion matrix.
# cm is the confusion matrix, names are the names of the classes.
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [39]:
from sklearn.model_selection import train_test_split
# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [52]:

model = Sequential()
# 25 neurons in 1st hidden layer
model.add(Dense(25, input_dim=x_train.shape[1], activation='relu')) # Hidden 1     #  why input_dim=x.shape[1]?  
# 10 neurons in 2nd hidden layer
model.add(Dense(10, activation='relu')) # Hidden 2
model.add(Dense(1)) # Output
#optimizer - Back Prop algo
model.compile(loss='mean_squared_error', optimizer='adam')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')

checkpointer = ModelCheckpoint(filepath="best_weights.hdf5", verbose=0, save_best_only=True) # save best model

model.fit(x_train,y_train, validation_data=(x_test,y_test),callbacks=[monitor,checkpointer],verbose=2,epochs=1000)

model.load_weights('best_weights.hdf5') # load weights from best model

Train on 7496 samples, validate on 2499 samples
Epoch 1/1000
 - 2s - loss: 4.1435 - val_loss: 1.1681
Epoch 2/1000
 - 1s - loss: 0.8002 - val_loss: 0.4956
Epoch 3/1000
 - 1s - loss: 0.4186 - val_loss: 0.3672
Epoch 4/1000
 - 1s - loss: 0.3502 - val_loss: 0.3397
Epoch 5/1000
 - 1s - loss: 0.3230 - val_loss: 0.3256
Epoch 6/1000
 - 1s - loss: 0.3075 - val_loss: 0.3192
Epoch 7/1000
 - 1s - loss: 0.2968 - val_loss: 0.3185
Epoch 8/1000
 - 1s - loss: 0.2910 - val_loss: 0.3151
Epoch 9/1000
 - 1s - loss: 0.2859 - val_loss: 0.3184
Epoch 10/1000
 - 1s - loss: 0.2821 - val_loss: 0.3154
Epoch 11/1000
 - 1s - loss: 0.2801 - val_loss: 0.3167
Epoch 12/1000
 - 1s - loss: 0.2779 - val_loss: 0.3189
Epoch 13/1000
 - 1s - loss: 0.2745 - val_loss: 0.3169
Epoch 00013: early stopping


In [60]:
# Define ModelCheckpoint outside the loop

# Remove checkpointer and loop to see change in result
checkpointer = ModelCheckpoint(filepath="dnn/best_weights.hdf5", verbose=0, save_best_only=True) # save best model

for i in range(5):
    print(i)

    # Build network
    model = Sequential()
    # 25 neurons in 1st hidden layer
    model.add(Dense(25, input_dim=x_train.shape[1], activation='relu')) # Hidden 1     #  why input_dim=x.shape[1]?  
    # 10 neurons in 2nd hidden layer
    model.add(Dense(10, activation='relu')) # Hidden 2
    model.add(Dense(5, activation='relu')) # Hidden 3
    model.add(Dense(1)) # Output
    #optimizer - Back Prop algo
    model.compile(loss='mean_squared_error', optimizer='adam')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')

    model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor,checkpointer],verbose=2,epochs=100)


print('Training finished...Loading the best model')  
print()
model.load_weights('dnn/best_weights.hdf5') # load weights from best model


0
Train on 7496 samples, validate on 2499 samples
Epoch 1/100
 - 4s - loss: 2.5919 - val_loss: 0.4617
Epoch 2/100
 - 1s - loss: 0.3799 - val_loss: 0.3338
Epoch 3/100
 - 1s - loss: 0.3176 - val_loss: 0.3163
Epoch 4/100
 - 1s - loss: 0.2981 - val_loss: 0.3155
Epoch 5/100
 - 1s - loss: 0.2907 - val_loss: 0.3152
Epoch 6/100
 - 1s - loss: 0.2857 - val_loss: 0.3207
Epoch 7/100
 - 1s - loss: 0.2697 - val_loss: 0.3323
Epoch 8/100
 - 1s - loss: 0.2480 - val_loss: 0.2979
Epoch 9/100
 - 1s - loss: 0.2227 - val_loss: 0.2925
Epoch 10/100
 - 1s - loss: 0.2023 - val_loss: 0.2919
Epoch 11/100
 - 1s - loss: 0.1832 - val_loss: 0.2974
Epoch 12/100
 - 1s - loss: 0.1701 - val_loss: 0.2930
Epoch 13/100
 - 1s - loss: 0.1567 - val_loss: 0.2987
Epoch 14/100
 - 1s - loss: 0.1452 - val_loss: 0.3015
Epoch 00014: early stopping
1
Train on 7496 samples, validate on 2499 samples
Epoch 1/100
 - 3s - loss: 3.6636 - val_loss: 0.5960
Epoch 2/100
 - 1s - loss: 0.4110 - val_loss: 0.3404
Epoch 3/100
 - 1s - loss: 0.3242 - 

In [58]:
y_true.shape

()