# CSC 180 Assignment 1 - Yelp Data Challenge
## Lucas Saechao

In [1]:
import csv
import json
import sys
import time
from collections.abc import Sequence

import matplotlib.pyplot as plt
import numpy as np
import numpy as py  # original alias, kept in case other cells reference `py`
import pandas as pd
import sklearn as sk
import tensorflow as tf
from sklearn import preprocessing

In [3]:
## Helper functions

# Encode textual values into indices
def encode_text_index(df, name):
    """Replace the values of column `name` with integer class indices.

    The column is label-encoded in place on `df` using scikit-learn's
    ``LabelEncoder``.

    Args:
        df: DataFrame that owns the column; it is modified in place.
        name: Name of the column to encode.

    Returns:
        The fitted encoder's ``classes_`` attribute.
    """
    encoder = preprocessing.LabelEncoder()
    codes = encoder.fit_transform(df[name])
    df[name] = codes
    return encoder.classes_

# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    """Standardise column `name` of `df` in place as ``(value - mean) / sd``.

    Args:
        df: DataFrame that owns the column; it is modified in place.
        name: Name of the numeric column to convert.
        mean: Centre to subtract. When None, the column's own mean is used.
        sd: Scale to divide by. When None, the column's own ``std()`` is used.

    Returns:
        None; the result is written back to ``df[name]``.
    """
    column = df[name]
    # Only None triggers the fallback, so an explicit 0 is honoured as given.
    center = column.mean() if mean is None else mean
    spread = column.std() if sd is None else sd
    df[name] = (column - center) / spread
    
# Regression chart
def chart_regression(pred, y, sort=True):
    """Plot expected values against predictions on one line chart.

    Args:
        pred: Predicted values, one per sample.
        y: Expected values; must provide ``flatten()`` (presumably a NumPy
            array -- confirm against the caller).
        sort: When true, rows are ordered by the expected value before
            plotting.

    Returns:
        None; the chart is displayed with ``plt.show()``.
    """
    frame = pd.DataFrame({'pred': pred, 'y': y.flatten()})
    if sort:
        frame = frame.sort_values(by=['y'])

    # Expected series first, then the prediction, so legend order is stable.
    for column, label in (('y', 'expected'), ('pred', 'prediction')):
        plt.plot(frame[column].tolist(), label=label)

    plt.ylabel('output')
    plt.legend()
    plt.show()

# Convert a Pandas DataFrame to the x, y inputs that TensorFlow needs
def to_xy(df, target):
    """Split a DataFrame into the (x, y) float32 arrays TensorFlow needs.

    Args:
        df: DataFrame holding the feature columns and the target column.
        target: Name of the target column; every other column is a feature.

    Returns:
        A tuple ``(x, y)`` of float32 NumPy arrays. ``x`` holds the feature
        columns in their original order. When the target dtype is int64 or
        int32 the target is treated as a class label and ``y`` is its
        one-hot (dummy) encoding; otherwise ``y`` is the target column
        itself (regression).
    """
    feature_columns = [column for column in df.columns if column != target]

    # Find out the type of the target column
    target_type = df[target].dtypes
    if isinstance(target_type, Sequence):
        target_type = target_type[0]

    # TensorFlow likes 32 bit values, so both outputs are cast to float32.
    x = df[feature_columns].values.astype(np.float32)
    if target_type in (np.int64, np.int32):
        # Classification: one column per distinct class value
        y = pd.get_dummies(df[target]).values
    else:
        # Regression: the raw target values
        y = df[target].values
    return x, y.astype(np.float32)

In [8]:
# Filter out businesses with fewer than 20 reviews

# Convert the raw business data (one JSON object per line) into a TSV file.
# - The input is opened first, so a missing data file no longer truncates an
#   existing business.tsv.
# - newline='' is what the csv module asks for, since it writes its own line
#   endings; the explicit UTF-8 encoding matches how the TSV is read back.
# - The `with` block closes both files even if a line fails to parse.
with open('data/yelp_academic_dataset_business.json', encoding="utf-8") as f, \
        open("business.tsv", 'w', newline='', encoding="utf-8") as outfile:
    data_file = csv.writer(outfile, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
    data_file.writerow(['business_id', 'stars', 'review_count'])

    for line in f:
        row = json.loads(line)
        if row['review_count'] >= 20:
            data_file.writerow([row['business_id'], row['stars'], row['review_count']])

In [10]:
# Load the filtered business TSV back into a pandas DataFrame
df_business = pd.read_csv('business.tsv', sep="\t", encoding="utf-8")
# Column subset used for the star ratings (currently every column in the TSV)
df_stars = df_business.loc[:, ['business_id', 'stars', 'review_count']]
print(df_business)
print(df_stars)

                  business_id  stars  review_count
0      f9NumwFMBDn751xgFiRbNA    3.5            36
1      51M2Kk903DFYI6gnB5I6SQ    4.5            26
2      cKyLV5oWZJ2NudWgqs8VZw    4.5            38
3      oiAlXZPIFm2nBCt0DHLu_Q    3.5            81
4      uZuh51lXu7tsrC8RAwkg1A    4.5            32
...                       ...    ...           ...
66112  YZeUH6zYS0dq5QHLYZhUnQ    2.0           106
66113  xVpE01l6ZXdEtVf5PkRpDg    4.0            95
66114  fNil19SUfPAPnLQrYnFrGQ    4.5           124
66115  JjcJVqhZXhP4tvOhg3fnag    5.0           217
66116  SYa2j1boLF8DcGVOYfHPcA    3.5            97

[66117 rows x 3 columns]
                  business_id  stars  review_count
0      f9NumwFMBDn751xgFiRbNA    3.5            36
1      51M2Kk903DFYI6gnB5I6SQ    4.5            26
2      cKyLV5oWZJ2NudWgqs8VZw    4.5            38
3      oiAlXZPIFm2nBCt0DHLu_Q    3.5            81
4      uZuh51lXu7tsrC8RAwkg1A    4.5            32
...                       ...    ...           ...
66112