## Course Project Jupyter Notebook

#### Data Files 
- business.csv 
- sample_submission.csv
- test_queries.csv
- train_reviews.csv
- user.csv
- validate_queries.csv

In [3]:
import pandas as pd
import numpy as np

## Preprocessing Business Data

Expects the csv file to be in an "all" folder in the working directory of this notebook

In [4]:
business_df = pd.read_csv("all/business.csv", engine="python")
business_df_replace = business_df.copy()
# default value to replace for ambience when it is Nan
ambience_default = str({'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': False})
business_df

Unnamed: 0,address,attributes,attributes_AcceptsInsurance,attributes_AgesAllowed,attributes_Alcohol,attributes_Ambience,attributes_BYOB,attributes_BYOBCorkage,attributes_BestNights,attributes_BikeParking,...,hours_Wednesday,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
0,595 Markham Street,,,,full_bar,"{'romantic': False, 'intimate': False, 'classy...",,,,True,...,17:0-1:0,0,43.664125,-79.411886,Southern Accent Restaurant,Palmerston,M6G 2L7,146,4.0,ON
1,2801 N 15th Ave,,,,full_bar,"{'romantic': False, 'intimate': False, 'classy...",False,yes_free,"{'monday': False, 'tuesday': True, 'friday': T...",True,...,11:0-22:0,1,33.479807,-112.091188,Original Hamburger Works,,85007,277,4.0,AZ
2,"5508 County Rd N, Ste 3",,,,full_bar,"{'romantic': False, 'intimate': False, 'classy...",,,,False,...,11:0-0:0,1,43.149488,-89.206641,Chicken Lips,,53590,102,4.5,WI
3,2227 N Rampart Blvd,,,,beer_and_wine,"{'romantic': False, 'intimate': False, 'classy...",,,,True,...,7:0-19:0,1,36.201990,-115.283122,Omelet House Summerlin,Summerlin,89128,242,4.0,NV
4,1111 W Bell Rd,,,,full_bar,"{'romantic': False, 'intimate': False, 'classy...",,,,True,...,11:0-22:0,1,33.639774,-112.087738,Manuel's Mexican Restaurant & Cantina - Bell Rd,,85023,230,3.5,AZ
5,1001 New Beginnings Dr,,,,,,,,,,...,9:0-17:0,1,36.080453,-115.038166,Central Church - Henderson,,89011,113,4.0,NV
6,5440 Walnut St,,,,full_bar,"{'romantic': False, 'intimate': False, 'classy...",,,,True,...,11:0-22:0,1,40.450866,-79.933919,China Palace,Shadyside,15232,110,3.0,PA
7,81 Underhill Drive,,,,none,"{'romantic': False, 'intimate': False, 'classy...",,,,True,...,11:30-22:0,1,43.745928,-79.324623,Allwyn's Bakery,,M3A 1K8,105,4.0,ON
8,2523 South Blvd,,,,,,,,,,...,20:0-6:0,1,35.202363,-80.864662,Long Animal Hospital,South End,28203,103,3.5,NC
9,6316 N Scottsdale Rd,,,,full_bar,"{'romantic': False, 'intimate': False, 'classy...",,,"{'monday': False, 'tuesday': False, 'friday': ...",False,...,16:30-21:30,1,33.530358,-111.925905,Fat Ox,,85253,234,4.0,AZ


### Feature Selection

I decided that we are only attempting to use features when more than half of its values are not Nan as otherwise, there are too less datapoints with a value. This percent non-Nan requirement can be changed nevertheless. 

I also decided to not look at the hours as it seems to complex to make into numerical value and would not help much in determining a user's review from intuition. The same is the case with the latitude, longitude, name, and address features. 

In [5]:
business_df_replace.drop([col for col in business_df.columns if business_df[col].isnull().sum() > 0.5 * 12058 ], axis=1, inplace=True)
business_df_replace.drop(['hours_Friday', 'hours_Monday', 'hours_Saturday',
       'hours_Sunday', 'hours_Thursday', 'hours_Tuesday', 'hours_Wednesday',
       'is_open', 'latitude', 'longitude', 'postal_code', 'name', 'address', 'categories'], axis=1, inplace=True)
business_df_replace

Unnamed: 0,attributes_Alcohol,attributes_Ambience,attributes_BikeParking,attributes_BusinessAcceptsCreditCards,attributes_BusinessParking,attributes_Caters,attributes_GoodForKids,attributes_GoodForMeal,attributes_HasTV,attributes_NoiseLevel,...,attributes_RestaurantsReservations,attributes_RestaurantsTableService,attributes_RestaurantsTakeOut,attributes_WheelchairAccessible,attributes_WiFi,business_id,city,review_count,stars,state
0,full_bar,"{'romantic': False, 'intimate': False, 'classy...",True,True,"{'garage': False, 'street': True, 'validated':...",True,False,"{'dessert': False, 'latenight': False, 'lunch'...",False,average,...,True,True,True,False,no,KuxDPl6UYNLxFChPm0_MNw,Toronto,146,4.0,ON
1,full_bar,"{'romantic': False, 'intimate': False, 'classy...",True,True,"{'garage': False, 'street': False, 'validated'...",False,True,"{'dessert': False, 'latenight': False, 'lunch'...",True,average,...,False,False,True,True,no,6SAfQKe2oM5g_EtcYXyAMg,Phoenix,277,4.0,AZ
2,full_bar,"{'romantic': False, 'intimate': False, 'classy...",False,False,"{'garage': False, 'street': False, 'validated'...",False,False,"{'dessert': False, 'latenight': False, 'lunch'...",True,average,...,False,False,True,True,no,upB0RQl-l529IVwgOpwOQQ,Sun Prairie,102,4.5,WI
3,beer_and_wine,"{'romantic': False, 'intimate': False, 'classy...",True,True,"{'garage': False, 'street': False, 'validated'...",False,True,"{'dessert': False, 'latenight': False, 'lunch'...",True,average,...,False,True,True,True,no,TulmRC5V0--dnXYd_GOSvA,Las Vegas,242,4.0,NV
4,full_bar,"{'romantic': False, 'intimate': False, 'classy...",True,True,"{'garage': False, 'street': False, 'validated'...",True,True,"{'dessert': False, 'latenight': False, 'lunch'...",True,average,...,True,True,True,True,free,yqYtY3-Po4OVPafA9Z-Xyw,Phoenix,230,3.5,AZ
5,,,,,,,,,,,...,,,,,,i90S4tfxFm0W2FZnhpJV3A,Henderson,113,4.0,NV
6,full_bar,"{'romantic': False, 'intimate': False, 'classy...",True,True,"{'garage': False, 'street': True, 'validated':...",True,True,"{'dessert': False, 'latenight': False, 'lunch'...",True,average,...,False,True,True,True,no,xqbvqZHNyj2qExHdizzd0w,Pittsburgh,110,3.0,PA
7,none,"{'romantic': False, 'intimate': False, 'classy...",True,False,"{'garage': False, 'street': False, 'validated'...",True,True,"{'dessert': False, 'latenight': False, 'lunch'...",False,average,...,False,False,True,,no,aXWN4oH8W-MVDchWia084g,Toronto,105,4.0,ON
8,,,,,,,,,,,...,,,,,,a8ACgZ_bPPT6iRQ6R7Ridg,Charlotte,103,3.5,NC
9,full_bar,"{'romantic': False, 'intimate': False, 'classy...",False,True,"{'garage': False, 'street': False, 'validated'...",True,False,"{'dessert': False, 'latenight': False, 'lunch'...",True,very_loud,...,True,True,True,True,free,V6rzs-QgnuW1CdOn-23nNw,Scottsdale,234,4.0,AZ


###  Helper Functions

##### view_column_values
Helps to view what values occur inside the column of a dataframe

##### expand_dict_to_columns
Sometimes there are columns in the dataframe in which the data is a dictionary string(such as attributes_Ambience). This function helps expand that dictionary string into extra columns with the column being the key and the row content being the value. It returns the modified dataframe.

##### replace_column_nan
There are many Nan in the data. This function replaces the Nan of a specifc column of a dataframe with one of the values that already occur. The third parameter index_of_value_count is used to specify what value to replace, the values which can be viewed using view_column_values


In [6]:
def view_column_values(df, column_name):
    return df[column_name].value_counts()

def expand_dict_to_columns(df, column_name):
    expanded_df = df[column_name].apply(lambda x : dict(eval(x))).apply(pd.Series)
    expanded_df.fillna(False, inplace=True)
    df = pd.concat([df, expanded_df], axis = 1)
    df.drop([column_name], axis=1, inplace=True)
    return df

def replace_column_nan(df, column_name, index_of_value_count):
    df[column_name] = df[column_name].fillna(df[column_name].value_counts().index[index_of_value_count])

In [7]:
view_column_values(business_df, 'stars')

4.0    4258
3.5    3052
4.5    2228
3.0    1389
2.5     487
5.0     416
2.0     151
1.5      64
1.0      13
Name: stars, dtype: int64

In [8]:
business_df['stars'].isnull().sum()

0

#### Replacing all the NaN

In [9]:
business_df_replace['attributes_Ambience'] = business_df_replace['attributes_Ambience'].fillna(ambience_default)
business_df_replace =expand_dict_to_columns(business_df_replace, 'attributes_Ambience')
replace_column_nan(business_df_replace, 'attributes_Alcohol', 0)      # default full_bar, to change to none, change last parameter to 1 
replace_column_nan(business_df_replace, 'attributes_BikeParking', 0)  # default yes parking
replace_column_nan(business_df_replace, 'attributes_BusinessAcceptsCreditCards', 1)  # default True
replace_column_nan(business_df_replace, 'attributes_BusinessParking', 0)  # default just lot parking
business_df_replace = expand_dict_to_columns(business_df_replace, 'attributes_BusinessParking')
replace_column_nan(business_df_replace, 'attributes_Caters', 0)  # default True
replace_column_nan(business_df_replace, 'attributes_GoodForKids', 0)  # default True
replace_column_nan(business_df_replace, 'attributes_HasTV', 0)  # default True
replace_column_nan(business_df_replace, 'attributes_NoiseLevel', 0)  # default Average
replace_column_nan(business_df_replace, 'attributes_OutdoorSeating', 0)  # default True
replace_column_nan(business_df_replace, 'attributes_GoodForMeal', 0)  # default good for lunch and dinner
business_df_replace = expand_dict_to_columns(business_df_replace, 'attributes_GoodForMeal')
replace_column_nan(business_df_replace, 'attributes_RestaurantsAttire', 0)  # default casual
replace_column_nan(business_df_replace, 'attributes_RestaurantsDelivery', 0)  # default false
replace_column_nan(business_df_replace, 'attributes_RestaurantsGoodForGroups', 0)  # default true
replace_column_nan(business_df_replace, 'attributes_RestaurantsPriceRange2', 0)  # default 2$ signs
replace_column_nan(business_df_replace, 'attributes_RestaurantsReservations', 0)  # default true
replace_column_nan(business_df_replace, 'attributes_RestaurantsTableService', 0)  # default true
replace_column_nan(business_df_replace, 'attributes_RestaurantsTakeOut', 0)  # default true
replace_column_nan(business_df_replace, 'attributes_WheelchairAccessible', 0)  # default true
replace_column_nan(business_df_replace, 'attributes_WiFi', 0)  # default free
replace_column_nan(business_df_replace, 'attributes_RestaurantsTableService', 0)  # default true
replace_column_nan(business_df_replace, 'attributes_RestaurantsTableService', 0)  # default true
replace_column_nan(business_df_replace, 'attributes_RestaurantsTableService', 0)  # default true

#### Changing categorical input to numerical

In [10]:
for col in business_df_replace.columns:
    if col not in ['business_id', 'stars']: 
        if business_df_replace[col].dtypes == bool:
            # true becomes 1, false becomes 0
            business_df_replace[col] *=1
        elif business_df_replace[col].dtypes != np.dtype('int32') and business_df_replace[col].dtypes != np.dtype('int64') and business_df_replace[col].dtypes != float: 
            #changes categorical values to numerical values
            business_df_replace[col] = business_df_replace[col].astype('category').cat.codes


In [11]:
#note, the business_id column is not numerical, I kept it the same
business_df_replace.isnull().sum()

attributes_Alcohol                       0
attributes_BikeParking                   0
attributes_BusinessAcceptsCreditCards    0
attributes_Caters                        0
attributes_GoodForKids                   0
attributes_HasTV                         0
attributes_NoiseLevel                    0
attributes_OutdoorSeating                0
attributes_RestaurantsAttire             0
attributes_RestaurantsDelivery           0
attributes_RestaurantsGoodForGroups      0
attributes_RestaurantsPriceRange2        0
attributes_RestaurantsReservations       0
attributes_RestaurantsTableService       0
attributes_RestaurantsTakeOut            0
attributes_WheelchairAccessible          0
attributes_WiFi                          0
business_id                              0
city                                     0
review_count                             0
stars                                    0
state                                    0
romantic                                 0
intimate   

#### Remove more irrelevant features

In [12]:

# USE THIS
business_df_replace.drop(['attributes_BikeParking',
       'attributes_BusinessAcceptsCreditCards', 'attributes_Caters',
       'attributes_GoodForKids', 'attributes_HasTV', 'attributes_NoiseLevel',
       'attributes_OutdoorSeating', 'attributes_RestaurantsAttire',
       'attributes_RestaurantsDelivery', 'attributes_RestaurantsGoodForGroups',
       'attributes_RestaurantsReservations',
       'attributes_RestaurantsTakeOut','attributes_WiFi', 'garage', 'lot', 'street', 'valet',
       'validated', 'state',
       'casual', 'classy', 'hipster', 'intimate', 'romantic', 'touristy',
       'trendy', 'upscale', 'divey', 'breakfast', 'brunch', 'dessert',
       'dinner', 'latenight', 'lunch', 'city',
                          'attributes_RestaurantsTableService', 'attributes_WheelchairAccessible'], axis=1, inplace=True)
business_df_replace.columns

Index(['attributes_Alcohol', 'attributes_RestaurantsPriceRange2',
       'business_id', 'review_count', 'stars'],
      dtype='object')

## Preprocessing User data
The users are mostly good for numerical features. However the following features 'elite', 'friends', 'name', 'yelping_since' are not but I decided to drop them as they do not intuitively seem super important. 

In [13]:
users_df = pd.read_csv("all/users.csv")
users_df_replace = users_df.copy()
users_df_replace.drop([ 'compliment_cool', 'compliment_cute', 'compliment_funny', 'compliment_hot', 'compliment_list', 'compliment_more',
                       'compliment_note', 'compliment_photos', 'compliment_plain', 'compliment_profile', 'compliment_writer','name',
                       'friends', 'fans',
                       'yelping_since', 
                       'elite',
                       'cool', 'funny',
                      ], axis=1, inplace=True)
# users_df_replace['elite'] = (users_df_replace['elite'] == True).astype(int)
# users_df_replace['yelping_since'] = users_df_replace['yelping_since'].map(lambda x: pd.to_datetime(x).timestamp())
users_df_replace.columns


Index(['average_stars', 'review_count', 'useful', 'user_id'], dtype='object')

##### Check data usability

In [14]:
#check that all data is numerical, should output nothing if it is
for col in users_df_replace.columns:
    if col not in ['user_id']: 
        if users_df_replace[col].dtypes != np.dtype('int32') and users_df_replace[col].dtypes != np.dtype('int64') and users_df_replace[col].dtypes != float:
            print(col)

In [15]:
#check that there are no Nan values
users_df_replace.isnull().sum()

average_stars    0
review_count     0
useful           0
user_id          0
dtype: int64

In [16]:
# use this
users_df_replace

Unnamed: 0,average_stars,review_count,useful,user_id
0,2.83,6,7,UxfpKHGO2dfQCdS9xLLJow
1,3.00,4,0,Kr5NDQFPPB_01-5CDmSqVg
2,3.09,10,2,wfoeMtriLwZsdRzcxNTaFA
3,4.00,4,0,aXb0kCIsIbPEEUSGomrrmA
4,4.00,1,0,sLrX2KGu3lc_JczAnsg0_Q
5,3.33,12,3,nmYitfmo-pQ1hJWDnTLwGg
6,4.00,4,0,5tm0BfJEWGJWowr3sPGb8Q
7,1.50,2,0,PzHuq79aP6G25kEv-hejOA
8,1.00,3,0,GVYg18F-Rkuk63hvtHoG5Q
9,2.33,3,0,0IqKVB1rbaDyz0wlefmiAA


In [17]:
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# cols_to_norm = ['review_count', 'useful']
# for col in cols_to_norm:
#     users_df_replace[col] = sc.fit_transform(users_df_replace[col])
# users = sc.fit_transform(users_df_replace)
# cols = users_df_replace.columns
# ids = users_df_replace['user_id']
# avg_stars = users_df_replace['average_stars']
# users = users_df_replace.copy()
# users.drop(['average_stars', 'user_id'], axis=1, inplace=True)

# users_clean = pd.DataFrame(columns=cols)
# users_df_replace['review_count']
# users_df_replace[cols_to_norm] = sc.fit_transform(users_df_replace[cols_to_norm])
# users_df_replace
# sc.fit_transform([users_df_replace['review_count']])
# users_df_replace['review_count']
# y_train = sc.fit_transform(test_df_x)

## Preprocessing Review Data

For review Data we will use the user id and buisiness id as reference and do a join with our business and user dataframes. This will the basis of our training dataset

In [18]:
sample_submission = pd.read_csv("all/sample_submission.csv")
train_reviews = pd.read_csv("all/train_reviews.csv")
train_reviews = train_reviews[['user_id', 'business_id', 'stars']]
train_reviews

Unnamed: 0,user_id,business_id,stars
0,VDh1vjzpNUJH6HfcjH8g7Q,WPCgtEG-bJt0cZtnM-x7yw,4.0
1,HnnjIuLrdhLTsRRVrrFIjA,LnnO7quTjjdTUkCshSJnkA,5.0
2,HnnjIuLrdhLTsRRVrrFIjA,sKrlmbrZWCyLIgiMihCPqw,5.0
3,HnnjIuLrdhLTsRRVrrFIjA,Lh5qnT2m2b4lvyYiMGMDkg,4.0
4,HnnjIuLrdhLTsRRVrrFIjA,54LYVM1gCGQ2UVFK9QhgTw,5.0
5,HnnjIuLrdhLTsRRVrrFIjA,08-b4GbZxOzzo9XSJsR-tw,5.0
6,2FXuEqmoQUyyzRFH9_Je0Q,6NG_A-epYEpsJSugfAaTRQ,5.0
7,HnnjIuLrdhLTsRRVrrFIjA,8FqfLM0Kv3Grr9l8bOAlCA,4.0
8,HnnjIuLrdhLTsRRVrrFIjA,O_UC_izJXcAmkm6HlEyGSA,4.0
9,TZRCpxTnEWEaiKXeqF_7ng,U-a61zpbsDNVtKm9W1aqLw,4.0


### Formatting Training Data from Preprocessed DataFrames

In [19]:
train_reviews_replace = train_reviews.copy()
reviews_denorm = pd.merge(train_reviews_replace, users_df_replace, how='left', on='user_id')
# reviews_denorm
reviews_denorm = pd.merge(reviews_denorm, business_df_replace, how='inner', on='business_id')
business_df_replace.columns
reviews_denorm = reviews_denorm.rename(columns={'stars_x': 'review_stars', 'stars_y': 'business_stars'})
train_df_y = reviews_denorm['review_stars']
train_df_x = reviews_denorm.copy()
train_df_x.drop(['review_stars', 'business_id', 'user_id'], axis=1, inplace=True)


In [20]:
reviews_denorm.groupby('review_stars').size()

review_stars
1.0     8833
2.0    13365
3.0    28302
4.0    53634
5.0    46098
dtype: int64

In [21]:
train_df_x

Unnamed: 0,average_stars,review_count_x,useful,attributes_Alcohol,attributes_RestaurantsPriceRange2,review_count_y,business_stars
0,4.50,4,0,1,2.0,128,4.0
1,3.74,1297,591,1,2.0,128,4.0
2,3.81,74,18,1,2.0,128,4.0
3,3.61,173,454,1,2.0,128,4.0
4,3.37,84,7,1,2.0,128,4.0
5,3.82,172,228,1,2.0,128,4.0
6,4.43,5,1,1,2.0,128,4.0
7,3.92,385,433,1,2.0,128,4.0
8,3.56,16,26,1,2.0,128,4.0
9,4.07,869,2070,1,2.0,128,4.0


In [22]:
#validate df will be our validation set
validate_df = pd.read_csv("all/validate_queries.csv")
validate_df_denorm = pd.merge(validate_df, users_df_replace, how='left', on='user_id')
validate_df_denorm = pd.merge(validate_df_denorm, business_df_replace, how='left', on='business_id')
validate_df_denorm = validate_df_denorm.rename(columns={'stars_x': 'review_stars', 'stars_y': 'business_stars'})
test_df_y = validate_df_denorm['review_stars']
test_df_x = validate_df_denorm.copy()
test_df_x.drop(['Unnamed: 0', 'review_stars', 'business_id', 'user_id'], axis=1, inplace=True)

train_df_x_combined = pd.concat([train_df_x, test_df_x])
train_df_y_combined = pd.concat([train_df_y, test_df_y])
# test_df_x.columns, train_df_x.columns

### Normalization
We add Z-score normalization to help with training. Results improved from previous implementations without Zscore normalization.
There are three input sets here.
- `train_x` data from users_df and business_df merged on the user_id and business_id pair in training reviews
- `test_x` data from users_df and business_df merged on the user_id and business_id pair in validate_queries
- `train_x_combined` data from the concatenation of train_x and test_x

In [23]:
from sklearn.preprocessing import StandardScaler

# usefull train_df_x['useful'] = train_df_x['useful'] / train_df_x['review_count_x'] -> this improved results
#should be normalized by the number of reviews a user has given
train_df_x['useful'] = train_df_x['useful'] / train_df_x['review_count_x']
train_df_x_combined['useful'] = train_df_x_combined['useful'] / train_df_x_combined['review_count_x']
test_df_x['useful'] = test_df_x['useful'] / test_df_x['review_count_x']

#z score normalize our inputs to our model 
scaler = StandardScaler()
replace_column_nan(train_df_x, 'useful', 0) 
scaler.fit(train_df_x)
replace_column_nan(train_df_x, 'useful', 0) 
train_x = scaler.transform(train_df_x)
replace_column_nan(test_df_x, 'useful', 0) 
scaler.fit(test_df_x)
test_x = scaler.transform(test_df_x)
replace_column_nan(train_df_x_combined, 'useful', 0) 
scaler.fit(train_df_x_combined)
train_x_combined = scaler.transform(train_df_x_combined)


  return self.partial_fit(X, y)
  
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


# Training

We used many different models from neural network classifiers to neural network regressors to decision trees.
Evaulation metrics used were primarily MSE (as it is being used in the competition) and accuracy. We iterated through some hyperparameters as well but saw that it did not have a major influence on the MSE so we mostly stuck with the defaults from sklearn. Note for the regressor we further rounded the resulting float prediction into the nearest integer.

In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, make_scorer, mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

#Function used to report accuracy and MSE score
def report_mse_accuracy(y_true, y_pred):
    print("MSE {}".format(mean_squared_error(y_true, y_pred)))
    print("Accuracy{}".format(accuracy_score(y_true, y_pred)))

#### KNN Training Model

In [None]:
for k in [15, 25, 40, 55, 80, 125, 250, 350, 500, 1000]:    
    clf = KNeighborsClassifier(n_neighbors=k, algorithm='auto', weights='distance') 
    clf.fit(train_x, train_df_y.values) 
    y_pred_train = clf.predict(train_x).round(decimals=0)  #need to round values
    y_pred = clf.predict(test_x).round(decimals=0)
    
    print("k = ", k)
    print("--------------------")
    print("training_report")
    report_mse_accuracy(train_df_y.values, y_pred_train)
    print("validation_report")
    report_mse_accuracy(test_df_y.values, y_pred)
    print("\n\n")

#### Random Forest Training Model

In [None]:
for depth in [5, 7, 10]:    
    clf = RandomForestClassifier(n_estimators=150, max_depth=depth, min_samples_split=5)
    clf.fit(train_x, train_df_y.values) 
    y_pred_train = clf.predict(train_x).round(decimals=0)  #need to round values
    y_pred = clf.predict(test_x).round(decimals=0)
    
    print("max_depth = ", depth)
    print("--------------------")
    print("training_report")
    report_mse_accuracy(train_df_y.values, y_pred_train)
    print("validation_report")
    report_mse_accuracy(test_df_y.values, y_pred)
    print("\n\n")

#### MLP Classification Training Model

In [None]:
for max_itr in [200, 500, 700]:
    for alph in [1e-4, 1e-3, 1e-2]:
        clf = MLPClassifier(solver='sgd', alpha=alph, hidden_layer_sizes=(100, 100, 8), 
                            learning_rate='adaptive', learning_rate_init= 0.0001, max_iter=max_itr)
        clf.fit(train_x, train_df_y.values) 
        y_pred_train = clf.predict(train_x).round(decimals=0)  #need to round values
        y_pred = clf.predict(test_x).round(decimals=0)

        print("max_itr = ", max_itr, " | alph = ", alph)
        print("--------------------------------")
        print("training_report")
        report_mse_accuracy(train_df_y.values, y_pred_train)
        print("validation_report")
        report_mse_accuracy(test_df_y.values, y_pred)
        print("\n\n")

#### MLP Regression Training Model

In [None]:
for max_itr in [500, 700, 1000]:
    for alph in [5e-3, 1e-3, 1e-2]:
        for hidden_layer_size in [25, 50]:
            clf = MLPRegressor(solver='sgd', alpha=alph, hidden_layer_sizes=network_topography, 
                               learning_rate='adaptive', learning_rate_init=0.0001, max_iter=max_itr)
            clf.fit(train_x, train_df_y.values) 
            y_pred_train = clf.predict(train_x).round(decimals=0)  #need to round values
            y_pred = clf.predict(test_x).round(decimals=0)

            print("max_itr = ", max_itr, " | alph = ", alph, "hidden_layer_size = ", hidden_layer_size)
            print("----------------------------------------------")
            print("training_report")
            report_mse_accuracy(train_df_y.values, y_pred_train)
            print("validation_report")
            report_mse_accuracy(test_df_y.values, y_pred)
            print("\n\n")

### Best Model

Run a training and validation report on the best model, once using the normal dataset, and again using the combined dataset.

In [25]:
clf = MLPRegressor(solver='adam', alpha=2e-4, hidden_layer_sizes=(50,50,8), learning_rate='adaptive', learning_rate_init=0.001, max_iter=500)

#Normal Dataset
clf.fit(train_x, train_df_y.values) 
y_pred_train = clf.predict(train_x).round(decimals=0)  #need to round values
y_pred = clf.predict(test_x).round(decimals=0)
print("training_report")
report_mse_accuracy(train_df_y.values, y_pred_train)
print("validation_report")
report_mse_accuracy(test_df_y.values, y_pred)
print("\n")

#Combined Dataset
clf.fit(train_x_combined, train_df_y_combined.values )
y_pred_train = clf.predict(train_x_combined).round(decimals=0)
y_pred = clf.predict(test_x).round(decimals=0)
print("training_report")
report_mse_accuracy(train_df_y_combined.values, y_pred_train)
print("validation_report")
report_mse_accuracy(test_df_y.values, y_pred)
print("\n\n")

training_report
MSE 1.0622171042121518
Accuracy0.4006536556792161
validation_report
MSE 1.23651576572079
Accuracy0.33412544681190964


training_report
MSE 1.0958768702354862
Accuracy0.4072408129440015
validation_report
MSE 1.186193262375941
Accuracy0.36793338259081015





In [26]:
clf.fit(train_x, train_df_y.values) 


MLPRegressor(activation='relu', alpha=0.0002, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(50, 50, 8), learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

## Predictions
Predict the reviews of each datapoint in test_queries.csv using the model

In [27]:
test = pd.read_csv("all/test_queries.csv")
print(test.shape)
test = pd.merge(test, users_df_replace, how='left', on='user_id')
test = pd.merge(test, business_df_replace, how='left', on='business_id')

test = test.rename(columns={'stars': 'business_stars'})
test.drop(['business_id', 'user_id'], axis=1, inplace=True)
test['useful'] = test['useful'] / test['review_count_x']
scaler.fit(test)
test_vals = scaler.transform(test)
print(test_vals.shape)

(50078, 2)
(50078, 7)


  return self.partial_fit(X, y)
  # Remove the CWD from sys.path while we load stuff.


In [28]:
submit_y = clf.predict(test_vals)
submit_y = pd.Series(submit_y).round(decimals=0)
(submit_y == 0).value_counts()

False    50078
dtype: int64

In [29]:
#Run to put in submit format
submit = pd.DataFrame(columns=['stars'])
submit['stars'] = submit_y

submit.index.name = 'index'
submit.to_csv('submit_adam2.csv')

In [30]:
test = pd.read_csv("all/test_queries.csv")
test = pd.merge(test, users_df_replace, how='left', on='user_id')
test = pd.merge(test, business_df_replace, how='left', on='business_id')

In [32]:
#submit = pd.read_csv("submit_adam2.csv")
#submit_1 = pd.read_csv("submit_adam.csv")
#submit_1['stars1'] = submit_1['stars']
#df = pd.concat([submit, submit_1['stars1']], axis=1)
#df[df['stars1'] != df['stars']]