# CSC 180 Assignment 1
## Lucas Saechao

In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
import tensorflow as tf
import matplotlib.pyplot as plt
import sys
import csv
import json
import time
import sklearn.feature_extraction.text as sk_extract_feature
from sklearn.model_selection import train_test_split
from sklearn import metrics
from collections.abc import Sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from IPython.display import clear_output
import os

if os.name == 'nt':
    import winsound

# Helper Functions:
## encode_text_index
Encodes textual values to indices format.

## encode_numeric_zscore
Computes and encodes a given column as its z-score.

## chart_regression
Prints a chart representing the model's regression.

## to_xy
Converts a pandas DataFrame into <x, y> inputs as required for TensorFlow.

In [2]:
# Encode textual values into indices
def encode_text_index(df, name):
    """Replace the values of column ``name`` with integer class indices.

    Each distinct value is mapped to its position in the sorted array of
    distinct values (the encoding scheme of a label encoder). The column
    is overwritten in place.

    Args:
        df: pandas DataFrame holding the column; modified in place.
        name: label of the column to encode.

    Returns:
        numpy array of the distinct original values in sorted order; the
        value at position ``k`` is the one that was encoded as ``k``.
    """
    # np.unique is used because no ``preprocessing`` module is imported by
    # this file: the previous preprocessing.LabelEncoder() call raised
    # NameError on every invocation.
    classes, codes = np.unique(df[name].values, return_inverse=True)
    df[name] = codes
    return classes

# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Overwrite column ``name`` of ``df`` with its z-scores, in place.

    Args:
        df: pandas DataFrame holding the column; modified in place.
        name: label of the numeric column to standardize.
        mean: value to subtract; the column mean is used when None.
        sd: value to divide by; the column's ``std()`` is used when None.

    Returns:
        None.
    """
    column = df[name]
    center = column.mean() if mean is None else mean
    spread = column.std() if sd is None else sd
    df[name] = (column - center) / spread
    
# Regression chart
def chart_regression(pred, y, sort=True):
    """Plot expected values and predictions as two line series.

    Args:
        pred: one-dimensional sequence of predicted values.
        y: array of expected values; it is flattened before plotting.
        sort: when True, rows are ordered by the expected value so the
            'expected' series is monotonic.
    """
    frame = pd.DataFrame({'pred': pred, 'y': y.flatten()})
    if sort:
        frame.sort_values(by=['y'], inplace=True)

    # Expected values are drawn first, then the predictions on top
    for column, series_label in (('y', 'expected'), ('pred', 'prediction')):
        plt.plot(frame[column].tolist(), label=series_label)

    plt.ylabel('output')
    plt.legend()
    plt.show()

# Convert a Pandas DataFrame to the x, y inputs that TensorFlow needs
def to_xy(df, target):
    """Split ``df`` into a feature matrix and a target array of float32.

    Args:
        df: pandas DataFrame containing the feature columns and the target.
        target: label of the target column.

    Returns:
        Tuple ``(x, y)``. ``x`` holds every column except ``target`` as
        float32. When the target dtype is int64 or int32 the problem is
        treated as classification and ``y`` is the dummy (one-hot)
        encoding of the target; otherwise ``y`` is the target column
        itself. ``y`` is float32 in both cases.
    """
    feature_columns = [column for column in df.columns if column != target]

    # Work out the dtype of the target column
    target_dtype = df[target].dtypes
    if isinstance(target_dtype, Sequence):
        target_dtype = target_dtype[0]

    features = df[feature_columns].values.astype(np.float32)

    # Integer targets mean classification; anything else is regression.
    # TensorFlow likes 32 bit values.
    if target_dtype in (np.int64, np.int32):
        labels = pd.get_dummies(df[target]).values
    else:
        labels = df[target].values
    return features, labels.astype(np.float32)

# Helper function prints a progress bar to the screen for long scripts, i.e. reading large datasets.
def progress_bar(curr_progress):
    """Redraw a 20-character text progress bar in the notebook output.

    Args:
        curr_progress: completed fraction. Ints are converted to float;
            any other non-float value is reported and treated as 0.
            Values below 0 are raised to 0 and values of 1 or more are
            capped at 1.
    """
    bar_width = 20
    fraction = curr_progress

    # Normalize the input type
    if isinstance(fraction, int):
        fraction = float(fraction)
    elif not isinstance(fraction, float):
        fraction = 0
        print("error: progress var must be float\r\n")

    # Clamp to the [0, 1] range
    if fraction < 0:
        fraction = 0
        print("Halt...\r\n")
    elif fraction >= 1:
        fraction = 1
        print("Done...\r\n")

    filled = int(round(bar_width * fraction))
    bar = "#" * filled + "-" * (bar_width - filled)

    # wait=True defers clearing until the replacement line is printed
    clear_output(wait = True)
    print("Progress: [{0}] {1:.1f}%".format(bar, fraction * 100))

# Play a noise following the end of a script
def ping(say="I'm done"):
    """Signal audibly that a long-running cell has finished.

    On Windows (``os.name == 'nt'``) three short beeps are played through
    ``winsound``. On any other system the message is handed to a ``say``
    executable when one is found on PATH; if none is found nothing
    happens.

    Args:
        say: text passed to the ``say`` command on non-Windows systems.
    """
    if os.name == 'nt': # If this is a windows machine
        for _ in range(3):
            winsound.Beep(2000, 150)
        return

    # Local imports: neither module is among the file-level imports.
    import shutil
    import subprocess

    say_executable = shutil.which('say')
    if say_executable is None:
        return

    # The message is passed as a separate argument instead of being
    # spliced into a shell string, so quotes, `$` or backticks inside it
    # can neither break the command nor be interpreted by a shell.
    # Best-effort notification: a failing `say` is not treated as an error.
    subprocess.run([say_executable, say], check=False)
# Line counter shared by the file-reading cells below to scale progress_bar
i = 0

# Filter out businesses with fewer than 20 reviews
For each business whose 'review_count' value is at least 20, write its ID, name, star rating, and review count to a tab-separated values (TSV) file. The resulting file is then read into a pandas DataFrame and printed for verification.

In [None]:
# Copy every business with at least 20 reviews from the Yelp business
# JSON-lines file into business.tsv.
#
# Both files are managed by `with`, so they are closed even if a line
# fails to parse. newline='' is what the csv module expects for files it
# writes to (otherwise blank rows appear on Windows), and the explicit
# utf-8 encoding matches the encoding used when the TSV is read back.
with open("business.tsv", 'w', newline='', encoding="utf-8") as outfile, \
        open('data/yelp_academic_dataset_business.json', encoding="utf-8") as f:
    data_file = csv.writer(outfile, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
    data_file.writerow(['business_id', 'name', 'stars', 'review_count'])

    # enumerate supplies the line count, so the loop no longer depends on
    # a global counter having been reset beforehand
    for i, line in enumerate(f, start=1):
        row = json.loads(line)
        if row['review_count'] >= 20:
            data_file.writerow([row['business_id'], row['name'], row['stars'], row['review_count']])
        # 160000 is a hard-coded estimate of the number of input lines;
        # progress_bar caps the display at 100% if it is exceeded
        progress_bar(i / 160000)

progress_bar(1)
ping("File has been read from yelp academic dataset business.json")
# Leave the shared counter at zero for the cells that follow
i = 0

Progress: [####----------------] 18.7%


In [None]:
# Load the filtered businesses back from the TSV file
df_business = pd.read_csv('business.tsv', sep="\t", encoding="utf-8")

# Column subsets: everything, star ratings only, review counts only
df_all = df_business[['business_id', 'name', 'stars', 'review_count']]
df_stars = df_business[['business_id', 'name', 'stars']]
df_reviews = df_business[['business_id', 'name','review_count']]

# Print the frames for verification
for frame in (df_business, df_stars, df_reviews):
    print(frame)

# Filter out review text by business and star rating
Fetch the reviews for the businesses kept in the previous step (```df_business```); the dictionary of a few arbitrarily chosen businesses, ```business_map```, is left commented out. For each review in ```yelp_academic_dataset_review.json```, if its business ID appears in ```df_business```, write the business ID, star rating, and review text to ```reviews_by_stars.tsv```.

In [None]:
# Arbitrarily choose businesses to search reviews for
#business_map = {'SYa2j1boLF8DcGVOYfHPcA':'Five Guys', 'JjcJVqhZXhP4tvOhg3fnag':'Water Heater Pros', 'fNil19SUfPAPnLQrYnFrGQ':'Cheyenne West Animal Hospital', 'xVpE01l6ZXdEtVf5PkRpDg':'Julep', 'YZeUH6zYS0dq5QHLYZhUnQ':'Hooters'}

# Build the set of wanted business IDs once. The previous code rebuilt a
# list from the DataFrame column and scanned it linearly for every review
# line; a set is built a single time and gives constant-time lookups.
wanted_business_ids = set(df_business['business_id'])

# write a tsv file with review text
# `with` closes both files even on error; newline='' is what the csv
# module expects for files it writes to, and utf-8 matches the encoding
# used when the TSV is read back.
with open('reviews_by_stars.tsv', 'w', newline='', encoding="utf-8") as outfile, \
        open('data/yelp_academic_dataset_review.json', encoding="utf-8") as f:
    file_writer = csv.writer(outfile, delimiter = "\t", quoting=csv.QUOTE_MINIMAL)
    file_writer.writerow(['business_id', 'stars', 'text'])

    for i, line in enumerate(f):
        row = json.loads(line)
        # keep only reviews that belong to a filtered business
        if row['business_id'] in wanted_business_ids:
            # The text is written as str. Passing bytes made the csv
            # writer store the literal "b'...'" representation (prefix
            # and escape sequences included) instead of the review text.
            file_writer.writerow([row['business_id'], row['stars'], row['text']])
        # 2000000 is a hard-coded estimate of the number of input lines;
        # progress_bar caps the display at 100% if it is exceeded
        progress_bar(i / 2000000)

progress_bar(1)
ping("File has been read from yelp academic dataset review.json")
# Leave the shared counter at zero for the cells that follow
i = 0

In [None]:
# Load the exported reviews (business_id, stars, text) for verification
df = pd.read_csv(
    'reviews_by_stars.tsv',
    sep="\t",
    encoding="utf-8",
)
print(df)

# Prepare data for language processing
For each business, aggregate its review data into a single column, and merge the resulting DataFrame with its corresponding star rating.

In [None]:
# Collapse all reviews of a business into one document.
# The reviews are joined with a space: summing the strings glued the last
# word of one review to the first word of the next, which corrupts the
# tokens seen by any later text vectorizer. Missing texts are skipped.
# reset_index() turns business_id back into an ordinary column, so the
# merge below joins column-on-column and yields a fresh 0..n-1 index.
df_aggregate_reviews = (
    df.groupby('business_id')['text']
    .agg(lambda texts: ' '.join(texts.dropna().astype(str)))
    .reset_index()
)

# Attach the business name and star rating to each aggregated document
df_reviews_by_stars = pd.merge(df_aggregate_reviews, df_stars, on = 'business_id')
print(df_reviews_by_stars)

# Prepare TF-IDF Vectorizer
Prepare the vectorizer, fit it to the review text of the merged ```df_reviews_by_stars``` DataFrame, and transform that column into TF-IDF features. After the TF-IDF vectorizer is run, concatenate its output column-wise with the original DataFrame, and drop any unnecessary columns to prepare a DataFrame for the neural network.

In [None]:
# TF-IDF over the aggregated review text, limited to 1000 features
tfidf_vectorizer = sk_extract_feature.TfidfVectorizer(
    stop_words='english',
    max_features = 1000,
    min_df=1,
)
review_vector = tfidf_vectorizer.fit_transform(df_reviews_by_stars['text'])

# Densify the matrix and place the feature columns next to the original ones
df_vectorized_reviews = pd.DataFrame(review_vector.toarray())
df_concat_reviews = pd.concat([df_reviews_by_stars, df_vectorized_reviews], axis=1)

# Keep the names aside for reporting
businesses = df_concat_reviews['name']

# Only 'stars' and the TF-IDF features are fed to the network
df_neural_network = df_concat_reviews.drop(columns=['business_id', 'text', 'name'])
print(df_neural_network)

In [None]:
# Build the feature matrix and the target array ('stars')
x, y = to_xy(df_neural_network, "stars")

# Hold out half of the rows for testing; fixed seed for reproducibility
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state = 40, test_size=0.5
)

# Report the shape of each partition
for partition in (x_train, y_train, x_test, y_test):
    print(partition.shape)

# Prepare Neural Network

In [None]:
# Fully connected regression network: three hidden layers, one linear output
model = Sequential()

model.add(Dense(500, input_dim=x.shape[1], activation='tanh'))
model.add(Dense(250, activation='sigmoid'))
model.add(Dense(125, activation='relu'))
model.add(Dense(1))

model.compile(loss = 'mean_squared_error', optimizer='adam')

# Stop once val_loss has not improved by at least 1e-3 for 3 epochs
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=3, verbose=2, mode='auto')

# The checkpoint directory must exist before the callback writes to it
weights_path = "weight/best_weights.hdf5"
os.makedirs(os.path.dirname(weights_path), exist_ok=True)

# save_best_only=True keeps only the weights of the best epoch. Without it
# the file is overwritten after every epoch, so the load_weights call
# below would restore the last epoch (several epochs past the best one
# when early stopping triggers) instead of the best.
checkpoint = ModelCheckpoint(filepath=weights_path, save_best_only=True, verbose=0)

model.fit(x_train, y_train, validation_data=(x_test, y_test), callbacks=[monitor, checkpoint], verbose=2, epochs=1000)

# Restore the best weights recorded during training
model.load_weights(weights_path)

In [None]:
# Predictions for the held-out rows
pred_test = model.predict(x_test)
shape_report = "Shape: {}".format(pred_test.shape)
print(shape_report)
print(pred_test)

# Predictions for the rows the model was trained on
pred_train = model.predict(x_train)
print(pred_train)

# Measure RMSE error
Root Mean Squared Error (RMSE) is commonly used to analyze the results of regression problems.

In [None]:
# RMSE on the test partition: square root of the mean squared error
mse = metrics.mean_squared_error(pred_test, y_test)
rmse = np.sqrt(mse)
print("Root Means Squared Error: {}".format(rmse))

In [None]:
# Print every business with its actual and predicted rating.
#
# The predictions follow the shuffled order produced by train_test_split,
# so they cannot be paired with businesses[i] / y[i] of the unsplit data
# (the previous code did that, and its second loop also printed pred_test
# while iterating over pred_train, using hard-coded offsets 3 and 4).
# Splitting the names with the same test_size and random_state reproduces
# the same partition. NOTE: these two values must stay identical to the
# ones used when x and y were split.
businesses_train, businesses_test = train_test_split(
    businesses, test_size=0.5, random_state=40
)

report_line = "{}. Business: {}, Rating: {}, Predicted Rating: {}"

# Test partition first, numbered from 1
for i, (business, rating, predicted) in enumerate(
        zip(businesses_test, y_test, pred_test.flatten()), start=1):
    print(report_line.format(i, business, rating, predicted))

# Training partition next; the numbering continues after the test rows
for i, (business, rating, predicted) in enumerate(
        zip(businesses_train, y_train, pred_train.flatten()),
        start=len(businesses_test) + 1):
    print(report_line.format(i, business, rating, predicted))


# Print prediction
Merge the prediction and test columns together and print them.

In [None]:
# One single-column frame per series: actual ratings and predicted ratings
df_y_test = pd.DataFrame(y_test, columns=['ground_truth'])
df_predicted = pd.DataFrame(pred_test, columns=['predicted'])

# Place the two columns side by side and display the result
result_columns = [df_y_test, df_predicted]
prediction_result = pd.concat(result_columns, axis=1)
prediction_result

In [None]:
# Plot the test predictions against the actual ratings, sorted by actual rating
chart_regression(pred_test.flatten(), y_test, sort=True)