# Data Pre-Processing

In [1]:
#Saumya Shah
#Jait Purohit

import sklearn.feature_extraction.text as sk_text
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from pandas import Series

#reading json file into tabular format
import json
import csv
import pandas as pd
outfile = open("review_stars_proj.tsv", 'w')
sfile = csv.writer(outfile, delimiter ="\t", quoting=csv.QUOTE_MINIMAL)
sfile.writerow(['business_id','stars', 'text'])
with open('yelp_academic_dataset_review.json',encoding='utf-8') as f:
    for line in f:
        row = json.loads(line)
        sfile.writerow([row['business_id'], row['stars'], (row['text']).encode('utf-8')])
outfile.close()

In [2]:
#reading data from csv file into pandas dataframe
review_df = pd.read_csv('review_stars_proj.tsv', delimiter ="\t", encoding="utf-8")

In [3]:
#checking for any null values in review dataframe
review_df[review_df.isnull().any(axis=1) == True]

Unnamed: 0,business_id,stars,text


In [4]:
#grouping all the reviews by business_id
df_review_agg = review_df.groupby('business_id')['text'].sum()
df_ready_for_sklearn = pd.DataFrame({'business_id': df_review_agg.index, 'all_reviews': df_review_agg.values})

In [5]:
outfile = open("business_info_proj.tsv", 'w')
sfile = csv.writer(outfile, delimiter ="\t", quoting=csv.QUOTE_MINIMAL)
sfile.writerow(['business_id','stars_b','review_count','name'])
with open('yelp_academic_dataset_business.json',encoding='utf-8') as f:
    for line in f:
        row = json.loads(line)
        sfile.writerow([row['business_id'], row['stars'], row['review_count'], (row['name']).encode('utf-8')])
outfile.close()

In [6]:
business_df = pd.read_csv('business_info_proj.tsv', delimiter ="\t", encoding="utf-8")

In [7]:
#checking for any null values in business dataframe
business_df[business_df.isnull().any(axis=1) == True]

Unnamed: 0,business_id,stars_b,review_count,name


In [8]:
#merging business and review dataframe
mergeddf = pd.merge(df_ready_for_sklearn,business_df,on='business_id')

In [9]:
mergeddf.shape

(188593, 5)

# Feature Normalization

In [10]:
#normalizing review count to positive values using min_max function
max = mergeddf['review_count'].max()
min = mergeddf['review_count'].min()
mergeddf['normalized_reviewcount'] = (mergeddf['review_count'] - min)/(max - min)

In [11]:
mergeddf.head()

Unnamed: 0,business_id,all_reviews,stars_b,review_count,name,normalized_reviewcount
0,--1UhMGODdWsrMastO9DZw,b'Came here for lunch last week and was pleasa...,4.0,24,b'The Spicy Amigos',0.002637
1,--6MefnULPED_I942VcFNA,"b""The incredibly rude woman behind the cashier...",3.0,39,"b""John's Chinese BBQ Restaurant""",0.00452
2,--7zmmkVg-IMGaXbuVd0SQ,"b'Great beer, great place, excellent service f...",4.0,54,b'Primal Brewery',0.006403
3,--8LPVSo5i0Oo61X01sV9A,b'Dr. LaCognata is great. He is a great liste...,3.5,4,b'Valley Bone and Joint Specialists',0.000126
4,--9QQLMTbFzLJ_oT-ON3Xw,"b""I've been going to this particular location ...",3.5,11,b'Great Clips',0.001004


# Converting all-business-reviews to Tf-idf scores

In [12]:
#computing tfidf value for the review data
vectorizer = sk_text.TfidfVectorizer(stop_words='english',max_features = 500,min_df=1)
matrix = vectorizer.fit_transform(mergeddf['all_reviews'])

#converting  matrix to numpy array
tfidf_data = matrix.toarray()
#mergeddf['all_reviews']=tfidf_data

In [13]:
reviewcountarray=[]
for i in mergeddf.normalized_reviewcount:
    reviewcountarray.append(i)

In [15]:
import numpy as np
reviewcountarray = np.array([reviewcountarray])

In [16]:
reviewcountarray.shape

(1, 188593)

In [17]:
businessstarsarray=[]
for i in mergeddf.stars_b:
    businessstarsarray.append(i)

In [18]:
businessstarsarray = np.array(businessstarsarray)

In [19]:
businessstarsarray.shape

(188593,)

In [20]:
final_array=np.concatenate((tfidf_data,reviewcountarray.T),axis=1)

In [21]:
x=final_array
y=businessstarsarray

In [22]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=42)

# Regression 

# Linear Regression

In [29]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
#Linear Regression
linreg = LinearRegression()
# fit the model with data
linreg.fit(x_train,y_train)
# predict the response for new observations
linearprediction = linreg.predict(x_test)
#print_table(linearprediction[0:20],y_test_array[0:20])
print("R2 score : ",r2_score(y_test, linearprediction))
print("RMSE : ",mean_squared_error(y_test, linearprediction, multioutput='raw_values'))

R2 score :  0.7129015179641903
RMSE :  [0.29731851]


In [30]:
import collections
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shutil
import os


# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = "{}-{}".format(name, x)
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)



# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_


# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd


# Convert all missing values in the specified column to the median
def missing_median(df, name):
    med = df[name].median()
    df[name] = df[name].fillna(med)


# Convert all missing values in the specified column to the default
def missing_default(df, name, default_value):
    df[name] = df[name].fillna(default_value)


# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column. 
    target_type = df[target].dtypes
    target_type = target_type[0] if isinstance(target_type, collections.Sequence) else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    else:
        # Regression
        return df[result].values.astype(np.float32), df[target].values.astype(np.float32)

# Nicely formatted time string
def hms_string(sec_elapsed):
    h = int(sec_elapsed / (60 * 60))
    m = int((sec_elapsed % (60 * 60)) / 60)
    s = sec_elapsed % 60
    return "{}:{:>02}:{:>05.2f}".format(h, m, s)


# Regression chart.
def chart_regression(pred,y,sort=True):
    t = pd.DataFrame({'pred' : pred, 'y' : y.flatten()})
    if sort:
        t.sort_values(by=['y'],inplace=True)
    a = plt.plot(t['y'].tolist(),label='expected')
    b = plt.plot(t['pred'].tolist(),label='prediction')
    plt.ylabel('output')
    plt.legend()
    plt.show()

# Remove all rows where the specified column is +/- sd standard deviations
def remove_outliers(df, name, sd):
    drop_rows = df.index[(np.abs(df[name] - df[name].mean()) >= (sd * df[name].std()))]
    df.drop(drop_rows, axis=0, inplace=True)


# Encode a column to a range between normalized_low and normalized_high.
def encode_numeric_range(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
               * (normalized_high - normalized_low) + normalized_low


# Model Checkpointing and Early Stopping, Saving and Loading Best Weights

In [33]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
import os
import io
import requests
from sklearn import metrics


x_train_ann, x_test_ann, y_train_ann, y_test_ann = train_test_split(x, y, test_size=0.25, random_state=42)
checkpointer = ModelCheckpoint("weights/best_weights_fair.hdf5", verbose=0, save_best_only=True)    
#checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True)
for i in range(5):
    model = Sequential()
    model.add(Dense(45, input_dim=x.shape[1], activation='relu'))
    model.add(Dense(30,activation='relu'))
    model.add(Dense(15,activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience =5, verbose=2, mode='auto')
    model.fit(x_train_ann, y_train_ann, validation_data=(x_test_ann,y_test_ann), callbacks=[monitor,checkpointer], verbose=2, epochs=100)  

print('Training finished...Loading the best model')  
print()
model.load_weights("weights/best_weights_fair.hdf5") # load weights from best model
# Measure accuracy
pred = model.predict(x_test_ann)
print("r2 score: ",metrics.r2_score(y_test_ann,pred))
print("RMSE score: ",metrics.mean_squared_error(y_test_ann,pred,multioutput='raw_values'))

Train on 141444 samples, validate on 47149 samples
Epoch 1/100
 - 9s - loss: 0.3852 - val_loss: 0.2463
Epoch 2/100
 - 8s - loss: 0.2356 - val_loss: 0.2375
Epoch 3/100
 - 9s - loss: 0.2250 - val_loss: 0.2334
Epoch 4/100
 - 8s - loss: 0.2173 - val_loss: 0.2427
Epoch 5/100
 - 8s - loss: 0.2102 - val_loss: 0.2365
Epoch 6/100
 - 8s - loss: 0.2043 - val_loss: 0.2383
Epoch 7/100
 - 8s - loss: 0.1981 - val_loss: 0.2355
Epoch 8/100
 - 9s - loss: 0.1924 - val_loss: 0.2516
Epoch 00008: early stopping
Train on 141444 samples, validate on 47149 samples
Epoch 1/100
 - 9s - loss: 0.3691 - val_loss: 0.2492
Epoch 2/100
 - 9s - loss: 0.2331 - val_loss: 0.2424
Epoch 3/100
 - 9s - loss: 0.2227 - val_loss: 0.2313
Epoch 4/100
 - 9s - loss: 0.2149 - val_loss: 0.2312
Epoch 5/100
 - 9s - loss: 0.2075 - val_loss: 0.2331
Epoch 6/100
 - 9s - loss: 0.2003 - val_loss: 0.2380
Epoch 7/100
 - 9s - loss: 0.1940 - val_loss: 0.2369
Epoch 8/100
 - 9s - loss: 0.1883 - val_loss: 0.2399
Epoch 00008: early stopping
Train on 1

In [66]:
x_train_ann, x_test_ann, y_train_ann, y_test_ann = train_test_split(x, y, test_size=0.25, random_state=42)
checkpointer = ModelCheckpoint("weights/best_weights_fair1.hdf5", verbose=0, save_best_only=True)    
#checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True)
for i in range(5):
    model = Sequential()
    model.add(Dense(60, input_dim=x.shape[1], activation='sigmoid'))
    model.add(Dense(45,activation='sigmoid'))
    model.add(Dense(30,activation='sigmoid'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience =5, verbose=2, mode='auto')
    model.fit(x_train_ann, y_train_ann, validation_data=(x_test_ann,y_test_ann), callbacks=[monitor,checkpointer], verbose=2, epochs=100)  

print('Training finished...Loading the best model')  
print()
model.load_weights("weights/best_weights_fair1.hdf5") # load weights from best model
# Measure accuracy
pred = model.predict(x_test_ann)
print("r2 score: ",metrics.r2_score(y_test_ann,pred))
print("RMSE score: ",metrics.mean_squared_error(y_test_ann,pred,multioutput='raw_values'))

Train on 141444 samples, validate on 47149 samples
Epoch 1/100
 - 15s - loss: 0.4345 - val_loss: 0.2610
Epoch 2/100
 - 13s - loss: 0.2523 - val_loss: 0.2492
Epoch 3/100
 - 13s - loss: 0.2411 - val_loss: 0.2407
Epoch 4/100
 - 12s - loss: 0.2333 - val_loss: 0.2373
Epoch 5/100
 - 18s - loss: 0.2286 - val_loss: 0.2333
Epoch 6/100
 - 14s - loss: 0.2262 - val_loss: 0.2329
Epoch 7/100
 - 12s - loss: 0.2248 - val_loss: 0.2303
Epoch 8/100
 - 12s - loss: 0.2237 - val_loss: 0.2304
Epoch 9/100
 - 13s - loss: 0.2225 - val_loss: 0.2315
Epoch 10/100
 - 14s - loss: 0.2212 - val_loss: 0.2288
Epoch 11/100
 - 13s - loss: 0.2202 - val_loss: 0.2299
Epoch 12/100
 - 13s - loss: 0.2188 - val_loss: 0.2290
Epoch 13/100
 - 13s - loss: 0.2177 - val_loss: 0.2263
Epoch 14/100
 - 13s - loss: 0.2167 - val_loss: 0.2256
Epoch 15/100
 - 13s - loss: 0.2154 - val_loss: 0.2318
Epoch 16/100
 - 13s - loss: 0.2145 - val_loss: 0.2265
Epoch 17/100
 - 13s - loss: 0.2132 - val_loss: 0.2265
Epoch 18/100
 - 13s - loss: 0.2125 - val

# Hypertuning of Parameters on Regression of Artificial Neural Network

In [67]:
x_train_ann, x_test_ann, y_train_ann, y_test_ann = train_test_split(x, y, test_size=0.25, random_state=42)
checkpointer = ModelCheckpoint("weights/best_weights_fair2.hdf5", verbose=0, save_best_only=True)    
#checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True)
for i in range(5):
    model = Sequential()
    model.add(Dense(50, input_dim=x.shape[1], activation='sigmoid'))
    model.add(Dense(35,activation='tanh'))
    #model.add(Dense(15,activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience =5, verbose=2, mode='auto')
    model.fit(x_train_ann, y_train_ann, validation_data=(x_test_ann,y_test_ann), callbacks=[monitor,checkpointer], verbose=2, epochs=100)  

print('Training finished...Loading the best model')  
print()
model.load_weights("weights/best_weights_fair2.hdf5") # load weights from best model
# Measure accuracy
pred = model.predict(x_test_ann)
print("r2 score: ",metrics.r2_score(y_test_ann,pred))
print("RMSE score: ",metrics.mean_squared_error(y_test_ann,pred,multioutput='raw_values'))

Train on 141444 samples, validate on 47149 samples
Epoch 1/100
 - 14s - loss: 0.3279 - val_loss: 0.2667
Epoch 2/100
 - 12s - loss: 0.2556 - val_loss: 0.2471
Epoch 3/100
 - 12s - loss: 0.2411 - val_loss: 0.2388
Epoch 4/100
 - 12s - loss: 0.2334 - val_loss: 0.2351
Epoch 5/100
 - 12s - loss: 0.2295 - val_loss: 0.2331
Epoch 6/100
 - 12s - loss: 0.2274 - val_loss: 0.2319
Epoch 7/100
 - 12s - loss: 0.2255 - val_loss: 0.2319
Epoch 8/100
 - 12s - loss: 0.2242 - val_loss: 0.2432
Epoch 9/100
 - 12s - loss: 0.2232 - val_loss: 0.2326
Epoch 10/100
 - 12s - loss: 0.2218 - val_loss: 0.2294
Epoch 11/100
 - 12s - loss: 0.2210 - val_loss: 0.2284
Epoch 12/100
 - 12s - loss: 0.2202 - val_loss: 0.2312
Epoch 13/100
 - 12s - loss: 0.2189 - val_loss: 0.2383
Epoch 14/100
 - 12s - loss: 0.2182 - val_loss: 0.2321
Epoch 15/100
 - 12s - loss: 0.2173 - val_loss: 0.2280
Epoch 16/100
 - 12s - loss: 0.2162 - val_loss: 0.2272
Epoch 17/100
 - 12s - loss: 0.2152 - val_loss: 0.2262
Epoch 18/100
 - 12s - loss: 0.2144 - val

In [68]:
x_train_ann, x_test_ann, y_train_ann, y_test_ann = train_test_split(x, y, test_size=0.25, random_state=42)
checkpointer = ModelCheckpoint("weights/best_weights_fair3.hdf5", verbose=0, save_best_only=True)    
#checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True)
for i in range(5):
    model = Sequential()
    model.add(Dense(40, input_dim=x.shape[1], activation='relu'))
    model.add(Dense(20,activation='sigmoid'))
    #model.add(Dense(15,activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience =5, verbose=2, mode='auto')
    model.fit(x_train_ann, y_train_ann, validation_data=(x_test_ann,y_test_ann), callbacks=[monitor,checkpointer], verbose=2, epochs=100)  

print('Training finished...Loading the best model')  
print()
model.load_weights("weights/best_weights_fair3.hdf5") # load weights from best model
# Measure accuracy
pred = model.predict(x_test_ann)
print("r2 score: ",metrics.r2_score(y_test_ann,pred))
print("RMSE score: ",metrics.mean_squared_error(y_test_ann,pred,multioutput='raw_values'))

Train on 141444 samples, validate on 47149 samples
Epoch 1/100
 - 16s - loss: 0.6562 - val_loss: 0.2388
Epoch 2/100
 - 17s - loss: 0.2293 - val_loss: 0.2317
Epoch 3/100
 - 12s - loss: 0.2212 - val_loss: 0.2276
Epoch 4/100
 - 12s - loss: 0.2152 - val_loss: 0.2317
Epoch 5/100
 - 12s - loss: 0.2102 - val_loss: 0.2267
Epoch 6/100
 - 12s - loss: 0.2060 - val_loss: 0.2268
Epoch 7/100
 - 12s - loss: 0.2019 - val_loss: 0.2281
Epoch 8/100
 - 12s - loss: 0.1986 - val_loss: 0.2290
Epoch 00008: early stopping
Train on 141444 samples, validate on 47149 samples
Epoch 1/100
 - 15s - loss: 0.4129 - val_loss: 0.2397
Epoch 2/100
 - 13s - loss: 0.2292 - val_loss: 0.2366
Epoch 3/100
 - 13s - loss: 0.2195 - val_loss: 0.2285
Epoch 4/100
 - 13s - loss: 0.2125 - val_loss: 0.2285
Epoch 5/100
 - 14s - loss: 0.2063 - val_loss: 0.2300
Epoch 6/100
 - 13s - loss: 0.2014 - val_loss: 0.2313
Epoch 7/100
 - 13s - loss: 0.1970 - val_loss: 0.2346
Epoch 8/100
 - 13s - loss: 0.1932 - val_loss: 0.2352
Epoch 00008: early sto

In [69]:
x_train_ann, x_test_ann, y_train_ann, y_test_ann = train_test_split(x, y, test_size=0.25, random_state=42)
checkpointer = ModelCheckpoint("weights/best_weights_fair4.hdf5", verbose=0, save_best_only=True)    
#checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True)
for i in range(5):
    model = Sequential()
    model.add(Dense(60, input_dim=x.shape[1], activation='relu'))
    model.add(Dense(45,activation='relu'))
    model.add(Dense(30,activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience =5, verbose=2, mode='auto')
    model.fit(x_train_ann, y_train_ann, validation_data=(x_test_ann,y_test_ann), callbacks=[monitor,checkpointer], verbose=2, epochs=100)  

print('Training finished...Loading the best model')  
print()
model.load_weights("weights/best_weights_fair4.hdf5") # load weights from best model
# Measure accuracy
pred = model.predict(x_test_ann)
print("r2 score: ",metrics.r2_score(y_test_ann,pred))
print("RMSE score: ",metrics.mean_squared_error(y_test_ann,pred,multioutput='raw_values'))

Train on 141444 samples, validate on 47149 samples
Epoch 1/100
 - 18s - loss: 0.3523 - val_loss: 0.2546
Epoch 2/100
 - 15s - loss: 0.2340 - val_loss: 0.2344
Epoch 3/100
 - 16s - loss: 0.2223 - val_loss: 0.2341
Epoch 4/100
 - 15s - loss: 0.2124 - val_loss: 0.2300
Epoch 5/100
 - 17s - loss: 0.2040 - val_loss: 0.2346
Epoch 6/100
 - 16s - loss: 0.1953 - val_loss: 0.2364
Epoch 7/100
 - 15s - loss: 0.1872 - val_loss: 0.2393
Epoch 8/100
 - 16s - loss: 0.1800 - val_loss: 0.2412
Epoch 9/100
 - 17s - loss: 0.1743 - val_loss: 0.2441
Epoch 00009: early stopping
Train on 141444 samples, validate on 47149 samples
Epoch 1/100
 - 19s - loss: 0.3577 - val_loss: 0.2501
Epoch 2/100
 - 16s - loss: 0.2310 - val_loss: 0.2369
Epoch 3/100
 - 16s - loss: 0.2193 - val_loss: 0.2322
Epoch 4/100
 - 16s - loss: 0.2100 - val_loss: 0.2301
Epoch 5/100
 - 16s - loss: 0.2011 - val_loss: 0.2344
Epoch 6/100
 - 15s - loss: 0.1931 - val_loss: 0.2383
Epoch 7/100
 - 15s - loss: 0.1851 - val_loss: 0.2426
Epoch 8/100
 - 15s - l

In [71]:
x_train_ann, x_test_ann, y_train_ann, y_test_ann = train_test_split(x, y, test_size=0.25, random_state=42)
checkpointer = ModelCheckpoint("weights/best_weights_fair4.hdf5", verbose=0, save_best_only=True)    
#checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True)
for i in range(5):
    model = Sequential()
    model.add(Dense(40, input_dim=x.shape[1], activation='relu'))
    model.add(Dense(20,activation='relu'))
    #model.add(Dense(30,activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='sgd')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience =5, verbose=2, mode='auto')
    model.fit(x_train_ann, y_train_ann, validation_data=(x_test_ann,y_test_ann), callbacks=[monitor,checkpointer], verbose=2, epochs=100)  

print('Training finished...Loading the best model')  
print()
model.load_weights("weights/best_weights_fair4.hdf5") # load weights from best model
# Measure accuracy
pred = model.predict(x_test_ann)
print("r2 score: ",metrics.r2_score(y_test_ann,pred))
print("RMSE score: ",metrics.mean_squared_error(y_test_ann,pred,multioutput='raw_values'))

Train on 141444 samples, validate on 47149 samples
Epoch 1/100
 - 16s - loss: 0.3886 - val_loss: 0.2768
Epoch 2/100
 - 13s - loss: 0.2631 - val_loss: 0.2544
Epoch 3/100
 - 15s - loss: 0.2493 - val_loss: 0.2479
Epoch 4/100
 - 13s - loss: 0.2425 - val_loss: 0.2538
Epoch 5/100
 - 12s - loss: 0.2378 - val_loss: 0.2410
Epoch 6/100
 - 12s - loss: 0.2345 - val_loss: 0.2620
Epoch 7/100
 - 14s - loss: 0.2323 - val_loss: 0.2779
Epoch 8/100
 - 16s - loss: 0.2299 - val_loss: 0.2373
Epoch 9/100
 - 15s - loss: 0.2279 - val_loss: 0.2403
Epoch 10/100
 - 14s - loss: 0.2262 - val_loss: 0.2381
Epoch 11/100
 - 16s - loss: 0.2247 - val_loss: 0.2367
Epoch 12/100
 - 14s - loss: 0.2237 - val_loss: 0.2543
Epoch 13/100
 - 13s - loss: 0.2221 - val_loss: 0.2493
Epoch 00013: early stopping
Train on 141444 samples, validate on 47149 samples
Epoch 1/100
 - 18s - loss: 0.3998 - val_loss: 0.2975
Epoch 2/100
 - 13s - loss: 0.2683 - val_loss: 0.2588
Epoch 3/100
 - 16s - loss: 0.2508 - val_loss: 0.3281
Epoch 4/100
 - 14s

In [72]:
x_train_ann, x_test_ann, y_train_ann, y_test_ann = train_test_split(x, y, test_size=0.25, random_state=42)
checkpointer = ModelCheckpoint("weights/best_weights_fair6.hdf5", verbose=0, save_best_only=True)    
#checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True)
for i in range(5):
    model = Sequential()
    model.add(Dense(50, input_dim=x.shape[1], activation='relu'))
    model.add(Dense(35,activation='sigmoid'))
    #model.add(Dense(30,activation='sigmoid'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience =5, verbose=2, mode='auto')
    model.fit(x_train_ann, y_train_ann, validation_data=(x_test_ann,y_test_ann), callbacks=[monitor,checkpointer], verbose=2, epochs=100)  

print('Training finished...Loading the best model')  
print()
model.load_weights("weights/best_weights_fair6.hdf5") # load weights from best model
# Measure accuracy
pred = model.predict(x_test_ann)
print("r2 score: ",metrics.r2_score(y_test_ann,pred))
print("RMSE score: ",metrics.mean_squared_error(y_test_ann,pred,multioutput='raw_values'))

Train on 141444 samples, validate on 47149 samples
Epoch 1/100
 - 17s - loss: 0.3724 - val_loss: 0.2365
Epoch 2/100
 - 15s - loss: 0.2264 - val_loss: 0.2301
Epoch 3/100
 - 15s - loss: 0.2181 - val_loss: 0.2281
Epoch 4/100
 - 15s - loss: 0.2105 - val_loss: 0.2301
Epoch 5/100
 - 15s - loss: 0.2032 - val_loss: 0.2301
Epoch 6/100
 - 15s - loss: 0.1969 - val_loss: 0.2338
Epoch 7/100
 - 15s - loss: 0.1912 - val_loss: 0.2401
Epoch 8/100
 - 15s - loss: 0.1863 - val_loss: 0.2380
Epoch 00008: early stopping
Train on 141444 samples, validate on 47149 samples
Epoch 1/100
 - 18s - loss: 0.4178 - val_loss: 0.2378
Epoch 2/100
 - 15s - loss: 0.2268 - val_loss: 0.2287
Epoch 3/100
 - 15s - loss: 0.2175 - val_loss: 0.2277
Epoch 4/100
 - 15s - loss: 0.2099 - val_loss: 0.2271
Epoch 5/100
 - 15s - loss: 0.2033 - val_loss: 0.2311
Epoch 6/100
 - 15s - loss: 0.1975 - val_loss: 0.2295
Epoch 7/100
 - 15s - loss: 0.1921 - val_loss: 0.2316
Epoch 8/100
 - 15s - loss: 0.1872 - val_loss: 0.2376
Epoch 9/100
 - 15s - l

In [74]:
x_train_ann, x_test_ann, y_train_ann, y_test_ann = train_test_split(x, y, test_size=0.25, random_state=42)
checkpointer = ModelCheckpoint("weights/best_weights_fair7.hdf5", verbose=0, save_best_only=True)    
#checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True)
for i in range(5):
    model = Sequential()
    model.add(Dense(100, input_dim=x.shape[1], activation='relu'))
    model.add(Dense(85,activation='relu'))
    model.add(Dense(65,activation='relu'))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience =5, verbose=2, mode='auto')
    model.fit(x_train_ann, y_train_ann, validation_data=(x_test_ann,y_test_ann), callbacks=[monitor,checkpointer], verbose=2, epochs=100)  

print('Training finished...Loading the best model')  
print()
model.load_weights("weights/best_weights_fair7.hdf5") # load weights from best model
# Measure accuracy
pred = model.predict(x_test_ann)
print("r2 score: ",metrics.r2_score(y_test_ann,pred))
print("RMSE score: ",metrics.mean_squared_error(y_test_ann,pred,multioutput='raw_values'))

Train on 141444 samples, validate on 47149 samples
Epoch 1/100
 - 22s - loss: 0.3249 - val_loss: 0.2463
Epoch 2/100
 - 19s - loss: 0.2319 - val_loss: 0.2355
Epoch 3/100
 - 19s - loss: 0.2179 - val_loss: 0.2350
Epoch 4/100
 - 19s - loss: 0.2062 - val_loss: 0.2327
Epoch 5/100
 - 22s - loss: 0.1935 - val_loss: 0.2385
Epoch 6/100
 - 22s - loss: 0.1813 - val_loss: 0.2444
Epoch 7/100
 - 24s - loss: 0.1696 - val_loss: 0.2492
Epoch 8/100
 - 21s - loss: 0.1598 - val_loss: 0.2541
Epoch 9/100
 - 19s - loss: 0.1517 - val_loss: 0.2598
Epoch 00009: early stopping
Train on 141444 samples, validate on 47149 samples
Epoch 1/100
 - 22s - loss: 0.3318 - val_loss: 0.2445
Epoch 2/100
 - 19s - loss: 0.2320 - val_loss: 0.2347
Epoch 3/100
 - 20s - loss: 0.2189 - val_loss: 0.2494
Epoch 4/100
 - 19s - loss: 0.2064 - val_loss: 0.2362
Epoch 5/100
 - 19s - loss: 0.1944 - val_loss: 0.2401
Epoch 6/100
 - 19s - loss: 0.1819 - val_loss: 0.2457
Epoch 7/100
 - 19s - loss: 0.1705 - val_loss: 0.2650
Epoch 00007: early sto

# Classification

In [34]:
le = preprocessing.LabelEncoder()
mergeddf['encoded_stars'] = le.fit_transform(mergeddf['stars_b'])

In [35]:
encodedstarsarray=[]
for i in mergeddf.encoded_stars:
    encodedstarsarray.append(i)
encodedstarsarray = np.array(encodedstarsarray)
encodedstarsarray.shape

(188593,)

In [36]:
y_class=encodedstarsarray

In [104]:
x_train, x_test, y_train, y_test = train_test_split(x,y_class, test_size=0.2, random_state=42)

# Logistic Regression

In [40]:
#Logistic Regression
from sklearn import metrics
logreg = LogisticRegression()
# fit the model with data
logreg.fit(x_train,y_train)
# predict the response for new observations
logisticprediction = logreg.predict(x_test)


In [85]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Plot a confusion matrix.
# cm is the confusion matrix, names are the names of the classes.
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [86]:
print("Accuracy: ", metrics.accuracy_score(y_test,logisticprediction))
print("Precision: ", metrics.precision_score(y_test,logisticprediction, pos_label=1, average='weighted'))
print("Recall: ", metrics.recall_score(y_test,logisticprediction, pos_label=1, average='weighted'))
print("F1_Score: ", metrics.f1_score(y_test,logisticprediction, pos_label=1, average='weighted'))
print("Confusion Matrix: ", metrics.confusion_matrix(y_test,logisticprediction))

Accuracy:  0.44052069248919645
Precision:  0.42974995187402026
Recall:  0.44052069248919645
F1_Score:  0.42130942328868476
Confusion Matrix:  [[ 393   24  108  247   29   37    3    3    5]
 [ 114   31  253  397   79   75    7    2    2]
 [ 115   18  306  781  494  276   45    6   16]
 [ 132    2  128  910 1214  891  189   22   88]
 [  16    0   48  490 1524 1993  631   68  118]
 [  26    2   20  281  640 2978 2087  211  542]
 [   2    0    6   79  168 1442 3710  795  784]
 [   0    0    1   15   34  377 1760 1590 1589]
 [   2    0    1    6    8  143  296  620 5174]]


# Nearest Neighbor (KNN)

In [48]:
#Nearest Neighbor
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(x_train,y_train)
# predict the response for new observations
knnprediction = knn.predict(x_test)


In [None]:
print("Accuracy: ", metrics.accuracy_score(y_test,knnprediction1))
print("Precision: ", metrics.precision_score(y_test,knnprediction1, pos_label=1, average='weighted'))
print("Recall: ", metrics.recall_score(y_test,knnprediction1, pos_label=1, average='weighted'))
print("F1_Score: ", metrics.f1_score(y_test,knnprediction1, pos_label=1, average='weighted'))
print("Confusion Matrix: ", metrics.confusion_matrix(y_test,knnprediction1))

# SVM

In [93]:
x_svm=x[:40000]
y_class_svm=y_class[:40000]

In [94]:
x_train, x_test, y_train, y_test = train_test_split(x_svm,y_class_svm, test_size=0.2, random_state=42)

In [95]:
from sklearn.svm import SVC

clf = SVC(kernel='linear')
clf.fit(x_train, y_train)

svmprediction = clf.predict(x_test)

In [97]:
print("Accuracy: ", metrics.accuracy_score(y_test,svmprediction))
print("Precision: ", metrics.precision_score(y_test,svmprediction, pos_label=1, average='weighted'))
print("Recall: ", metrics.recall_score(y_test,svmprediction, pos_label=1, average='weighted'))
print("F1_Score: ", metrics.f1_score(y_test,svmprediction, pos_label=1, average='weighted'))
print("Confusion Matrix: ", metrics.confusion_matrix(y_test,svmprediction))

Accuracy:  0.464375
Precision:  0.4596530725984863
Recall:  0.464375
F1_Score:  0.4563349633179957
Confusion Matrix:  [[ 96   7  39  35   2   2   0   0   0]
 [ 38   7  87  61   6   5   0   0   0]
 [ 26   5 132 209  59  22   5   0   1]
 [ 30   5  79 247 213  95  34   2   6]
 [  7   0  31 189 428 355  94   6   5]
 [  3   2  10  83 187 647 340  48  77]
 [  0   1   1  30  75 343 794 181  98]
 [  0   0   0   5  12  73 395 394 263]
 [  0   0   0   0   2  24  94 178 970]]


# Multinomial Naive Bayes

In [51]:
#Multinomial Naive Bayes
clf = MultinomialNB()
clf.fit(x_train, y_train)
mnbprediction = clf.predict(x_test)

In [52]:
print("Accuracy: ", metrics.accuracy_score(y_test,mnbprediction))
print("Precision: ", metrics.precision_score(y_test,mnbprediction, pos_label=1, average='weighted'))
print("Recall: ", metrics.recall_score(y_test,mnbprediction, pos_label=1, average='weighted'))
print("F1_Score: ", metrics.f1_score(y_test,mnbprediction, pos_label=1, average='weighted'))
print("Confusion Matrix: ", metrics.confusion_matrix(y_test,mnbprediction))

Accuracy:  0.3366208011877303
Confusion Matrix:  [[ 310    0   13  309   28   79   11    1   98]
 [ 227    0   31  386   86  156   16    5   53]
 [ 212    0   36  714  212  582   74    7  220]
 [ 138    0   28  912  259 1309  216   27  687]
 [  56    0    6  727  144 2348  658   91  858]
 [  29    0    1  416   54 2614 1551  323 1799]
 [   5    0    0  133   12 1220 2934  471 2211]
 [   1    0    0   28    1  222 2023  505 2586]
 [   0    0    0    7    0   41  660  300 5242]]


In [53]:
columns=['stars']

a = y_class

stars_df = pd.DataFrame(a.reshape(len(a),1 ),columns=columns)
stars_df[:10]

Unnamed: 0,stars
0,6
1,4
2,6
3,5
4,5
5,6
6,5
7,4
8,4
9,6


In [54]:
encode_text_dummy(stars_df,'stars')

In [55]:
one_hot_stars = stars_df.values

In [56]:
one_hot_stars[:10]

array([[0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0]], dtype=uint8)

# Model Checkpointing and Early Stopping, Saving and Loading Best Weights

In [64]:
x_train_ann_class, x_test_ann_class, y_train_ann_class, y_test_ann_class = train_test_split(x, one_hot_stars, test_size=0.25, random_state=42)
checkpointer = ModelCheckpoint(filepath="dnn/best_weights_fairclass.hdf5", verbose=0, save_best_only=True) # save best model

for i in range(5):
    model = Sequential()
    model.add(Dense(15, input_dim=x.shape[1], activation='relu')) # Hidden 1
    model.add(Dense(45, activation='relu')) # Hidden 2
    model.add(Dense(30, activation='relu')) # Hidden 3
    model.add(Dense(one_hot_stars.shape[1], activation='softmax')) # Output
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
    model.fit(x_train_ann_class, y_train_ann_class, validation_data=(x_test_ann_class,y_test_ann_class), callbacks=[monitor,checkpointer], verbose=2, epochs=100)  

    

model.load_weights('dnn/best_weights_fairclass.hdf5')
pred = model.predict(x_test_ann_class)
pred = np.argmax(pred,axis=1)
y_true= np.argmax(y_test_ann_class,axis=1) 
score = metrics.accuracy_score(y_true, pred)
print("Accuracy score: {}".format(score))
print("Confusion Matrix: {}".format(metrics.confusion_matrix(y_true, pred)))

Train on 141444 samples, validate on 47149 samples
Epoch 1/100
 - 10s - loss: 1.2701 - val_loss: 1.2100
Epoch 2/100
 - 9s - loss: 1.1877 - val_loss: 1.1863
Epoch 3/100
 - 9s - loss: 1.1659 - val_loss: 1.1759
Epoch 4/100
 - 9s - loss: 1.1540 - val_loss: 1.1633
Epoch 5/100
 - 9s - loss: 1.1458 - val_loss: 1.1668
Epoch 6/100
 - 9s - loss: 1.1396 - val_loss: 1.1637
Epoch 7/100
 - 9s - loss: 1.1333 - val_loss: 1.1680
Epoch 8/100
 - 9s - loss: 1.1288 - val_loss: 1.1571
Epoch 9/100
 - 10s - loss: 1.1243 - val_loss: 1.1546
Epoch 10/100
 - 11s - loss: 1.1209 - val_loss: 1.1581
Epoch 11/100
 - 10s - loss: 1.1174 - val_loss: 1.1544
Epoch 12/100
 - 9s - loss: 1.1134 - val_loss: 1.1615
Epoch 13/100
 - 10s - loss: 1.1108 - val_loss: 1.1674
Epoch 14/100
 - 9s - loss: 1.1086 - val_loss: 1.1519
Epoch 15/100
 - 9s - loss: 1.1063 - val_loss: 1.1513
Epoch 16/100
 - 9s - loss: 1.1042 - val_loss: 1.1488
Epoch 17/100
 - 9s - loss: 1.1024 - val_loss: 1.1490
Epoch 18/100
 - 10s - loss: 1.1000 - val_loss: 1.155

# Hypertuning of Parameters on Classification of Artificial Neural Network

In [59]:
x_train_ann_class, x_test_ann_class, y_train_ann_class, y_test_ann_class = train_test_split(x, one_hot_stars, test_size=0.25, random_state=42)
checkpointer = ModelCheckpoint(filepath="dnn/best_weights_fairclass1.hdf5", verbose=0, save_best_only=True) # save best model

for i in range(5):
    model = Sequential()
    model.add(Dense(60, input_dim=x.shape[1], activation='sigmoid')) # Hidden 1
    model.add(Dense(45, activation='sigmoid')) # Hidden 2
    model.add(Dense(30, activation='sigmoid')) # Hidden 3
    model.add(Dense(one_hot_stars.shape[1], activation='softmax')) # Output
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
    model.fit(x_train_ann_class, y_train_ann_class, validation_data=(x_test_ann_class,y_test_ann_class), callbacks=[monitor,checkpointer], verbose=2, epochs=100)  

    

model.load_weights('dnn/best_weights_fairclass1.hdf5')
pred = model.predict(x_test_ann_class)
pred = np.argmax(pred,axis=1)
y_true= np.argmax(y_test_ann_class,axis=1) 
score = metrics.accuracy_score(y_true, pred)
print("Accuracy score: {}".format(score))
print("Confusion Matrix: {}".format(metrics.confusion_matrix(y_true, pred)))

Train on 141444 samples, validate on 47149 samples
Epoch 1/100
 - 10s - loss: 1.4252 - val_loss: 1.2595
Epoch 2/100
 - 12s - loss: 1.2217 - val_loss: 1.2228
Epoch 3/100
 - 11s - loss: 1.1983 - val_loss: 1.2037
Epoch 4/100
 - 12s - loss: 1.1849 - val_loss: 1.1949
Epoch 5/100
 - 8s - loss: 1.1748 - val_loss: 1.1840
Epoch 6/100
 - 8s - loss: 1.1660 - val_loss: 1.1799
Epoch 7/100
 - 8s - loss: 1.1598 - val_loss: 1.1828
Epoch 8/100
 - 8s - loss: 1.1544 - val_loss: 1.1707
Epoch 9/100
 - 8s - loss: 1.1501 - val_loss: 1.1660
Epoch 10/100
 - 8s - loss: 1.1472 - val_loss: 1.1671
Epoch 11/100
 - 8s - loss: 1.1441 - val_loss: 1.1673
Epoch 12/100
 - 8s - loss: 1.1406 - val_loss: 1.1640
Epoch 13/100
 - 8s - loss: 1.1377 - val_loss: 1.1628
Epoch 14/100
 - 8s - loss: 1.1349 - val_loss: 1.1701
Epoch 15/100
 - 8s - loss: 1.1320 - val_loss: 1.1545
Epoch 16/100
 - 8s - loss: 1.1290 - val_loss: 1.1550
Epoch 17/100
 - 8s - loss: 1.1264 - val_loss: 1.1504
Epoch 18/100
 - 8s - loss: 1.1230 - val_loss: 1.1480


Epoch 33/100
 - 9s - loss: 1.0909 - val_loss: 1.1314
Epoch 34/100
 - 8s - loss: 1.0890 - val_loss: 1.1304
Epoch 35/100
 - 8s - loss: 1.0868 - val_loss: 1.1375
Epoch 36/100
 - 8s - loss: 1.0849 - val_loss: 1.1297
Epoch 00036: early stopping
Train on 141444 samples, validate on 47149 samples
Epoch 1/100
 - 10s - loss: 1.4503 - val_loss: 1.2616
Epoch 2/100
 - 9s - loss: 1.2298 - val_loss: 1.2256
Epoch 3/100
 - 9s - loss: 1.2050 - val_loss: 1.2098
Epoch 4/100
 - 9s - loss: 1.1928 - val_loss: 1.2036
Epoch 5/100
 - 9s - loss: 1.1825 - val_loss: 1.1958
Epoch 6/100
 - 9s - loss: 1.1737 - val_loss: 1.1831
Epoch 7/100
 - 9s - loss: 1.1657 - val_loss: 1.1803
Epoch 8/100
 - 9s - loss: 1.1599 - val_loss: 1.1739
Epoch 9/100
 - 9s - loss: 1.1543 - val_loss: 1.1695
Epoch 10/100
 - 9s - loss: 1.1506 - val_loss: 1.1696
Epoch 11/100
 - 9s - loss: 1.1467 - val_loss: 1.1689
Epoch 12/100
 - 9s - loss: 1.1431 - val_loss: 1.1615
Epoch 13/100
 - 9s - loss: 1.1399 - val_loss: 1.1599
Epoch 14/100
 - 9s - loss: 1

In [65]:
x_train_ann_class, x_test_ann_class, y_train_ann_class, y_test_ann_class = train_test_split(x, one_hot_stars, test_size=0.25, random_state=42)
checkpointer = ModelCheckpoint(filepath="dnn/best_weights_fairclass2.hdf5", verbose=0, save_best_only=True) # save best model

for i in range(5):
    model = Sequential()
    model.add(Dense(50, input_dim=x.shape[1], activation='relu')) # Hidden 1
    model.add(Dense(35, activation='relu')) # Hidden 2
    #model.add(Dense(30, activation='relu')) # Hidden 3
    model.add(Dense(one_hot_stars.shape[1], activation='softmax')) # Output
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
    model.fit(x_train_ann_class, y_train_ann_class, validation_data=(x_test_ann_class,y_test_ann_class), callbacks=[monitor,checkpointer], verbose=2, epochs=100)  

    

model.load_weights('dnn/best_weights_fairclass2.hdf5')
pred = model.predict(x_test_ann_class)
pred = np.argmax(pred,axis=1)
y_true= np.argmax(y_test_ann_class,axis=1) 
score = metrics.accuracy_score(y_true, pred)
print("Accuracy score: {}".format(score))
print("Confusion Matrix: {}".format(metrics.confusion_matrix(y_true, pred)))

Train on 141444 samples, validate on 47149 samples
Epoch 1/100
 - 19s - loss: 1.2638 - val_loss: 1.2023
Epoch 2/100
 - 13s - loss: 1.1712 - val_loss: 1.1828
Epoch 3/100
 - 11s - loss: 1.1443 - val_loss: 1.1681
Epoch 4/100
 - 11s - loss: 1.1255 - val_loss: 1.1641
Epoch 5/100
 - 11s - loss: 1.1108 - val_loss: 1.1571
Epoch 6/100
 - 11s - loss: 1.0981 - val_loss: 1.1587
Epoch 7/100
 - 11s - loss: 1.0865 - val_loss: 1.1563
Epoch 8/100
 - 11s - loss: 1.0768 - val_loss: 1.1652
Epoch 9/100
 - 11s - loss: 1.0678 - val_loss: 1.1722
Epoch 10/100
 - 11s - loss: 1.0592 - val_loss: 1.1685
Epoch 00010: early stopping
Train on 141444 samples, validate on 47149 samples
Epoch 1/100
 - 13s - loss: 1.2660 - val_loss: 1.2037
Epoch 2/100
 - 11s - loss: 1.1728 - val_loss: 1.1771
Epoch 3/100
 - 11s - loss: 1.1448 - val_loss: 1.1685
Epoch 4/100
 - 11s - loss: 1.1272 - val_loss: 1.1721
Epoch 5/100
 - 11s - loss: 1.1123 - val_loss: 1.1688
Epoch 6/100
 - 11s - loss: 1.0997 - val_loss: 1.1632
Epoch 7/100
 - 11s - 

In [63]:
x_train_ann_class, x_test_ann_class, y_train_ann_class, y_test_ann_class = train_test_split(x, one_hot_stars, test_size=0.25, random_state=42)
checkpointer = ModelCheckpoint(filepath="dnn/best_weights_fairclass3.hdf5", verbose=0, save_best_only=True) # save best model

for i in range(5):
    model = Sequential()
    model.add(Dense(40, input_dim=x.shape[1], activation='relu')) # Hidden 1
    model.add(Dense(20)) # Hidden 2
    #model.add(Dense(30, activation='relu')) # Hidden 3
    model.add(Dense(one_hot_stars.shape[1], activation='softmax')) # Output
    model.compile(loss='categorical_crossentropy', optimizer='sgd')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
    model.fit(x_train_ann_class, y_train_ann_class, validation_data=(x_test_ann_class,y_test_ann_class), callbacks=[monitor,checkpointer], verbose=2, epochs=100)  

    

model.load_weights('dnn/best_weights_fairclass3.hdf5')
pred = model.predict(x_test_ann_class)
pred = np.argmax(pred,axis=1)
y_true= np.argmax(y_test_ann_class,axis=1) 
score = metrics.accuracy_score(y_true, pred)
print("Accuracy score: {}".format(score))
print("Confusion Matrix: {}".format(metrics.confusion_matrix(y_true, pred)))

Train on 141444 samples, validate on 47149 samples
Epoch 1/100
 - 19s - loss: 1.8696 - val_loss: 1.5754
Epoch 2/100
 - 15s - loss: 1.4202 - val_loss: 1.3393
Epoch 3/100
 - 14s - loss: 1.3012 - val_loss: 1.2883
Epoch 4/100
 - 14s - loss: 1.2662 - val_loss: 1.2682
Epoch 5/100
 - 15s - loss: 1.2513 - val_loss: 1.2595
Epoch 6/100
 - 16s - loss: 1.2426 - val_loss: 1.2562
Epoch 7/100
 - 16s - loss: 1.2366 - val_loss: 1.2649
Epoch 8/100
 - 15s - loss: 1.2321 - val_loss: 1.2451
Epoch 9/100
 - 17s - loss: 1.2282 - val_loss: 1.2627
Epoch 10/100
 - 17s - loss: 1.2252 - val_loss: 1.2580
Epoch 11/100
 - 16s - loss: 1.2224 - val_loss: 1.2391
Epoch 12/100
 - 18s - loss: 1.2195 - val_loss: 1.2369
Epoch 13/100
 - 20s - loss: 1.2168 - val_loss: 1.2377
Epoch 14/100
 - 16s - loss: 1.2145 - val_loss: 1.2340
Epoch 15/100
 - 15s - loss: 1.2121 - val_loss: 1.2479
Epoch 16/100
 - 16s - loss: 1.2098 - val_loss: 1.2385
Epoch 17/100
 - 14s - loss: 1.2074 - val_loss: 1.2588
Epoch 18/100
 - 16s - loss: 1.2048 - val

Epoch 22/100
 - 9s - loss: 1.1740 - val_loss: 1.2001
Epoch 23/100
 - 9s - loss: 1.1711 - val_loss: 1.1986
Epoch 24/100
 - 10s - loss: 1.1683 - val_loss: 1.1980
Epoch 25/100
 - 9s - loss: 1.1656 - val_loss: 1.2348
Epoch 26/100
 - 9s - loss: 1.1627 - val_loss: 1.1873
Epoch 27/100
 - 9s - loss: 1.1605 - val_loss: 1.1935
Epoch 28/100
 - 9s - loss: 1.1579 - val_loss: 1.1900
Epoch 29/100
 - 9s - loss: 1.1558 - val_loss: 1.1991
Epoch 30/100
 - 9s - loss: 1.1537 - val_loss: 1.1828
Epoch 31/100
 - 9s - loss: 1.1515 - val_loss: 1.2049
Epoch 32/100
 - 9s - loss: 1.1496 - val_loss: 1.2015
Epoch 33/100
 - 9s - loss: 1.1474 - val_loss: 1.2076
Epoch 34/100
 - 9s - loss: 1.1457 - val_loss: 1.1931
Epoch 35/100
 - 9s - loss: 1.1439 - val_loss: 1.1883
Epoch 00035: early stopping
Train on 141444 samples, validate on 47149 samples
Epoch 1/100
 - 10s - loss: 1.9359 - val_loss: 1.7391
Epoch 2/100
 - 9s - loss: 1.5154 - val_loss: 1.3809
Epoch 3/100
 - 9s - loss: 1.3259 - val_loss: 1.2986
Epoch 4/100
 - 9s - l

In [62]:
x_train_ann_class, x_test_ann_class, y_train_ann_class, y_test_ann_class = train_test_split(x, one_hot_stars, test_size=0.25, random_state=42)
checkpointer = ModelCheckpoint(filepath="dnn/best_weights_fairclass4.hdf5", verbose=0, save_best_only=True) # save best model

for i in range(5):
    model = Sequential()
    model.add(Dense(40, input_dim=x.shape[1], activation='relu')) # Hidden 1
    model.add(Dense(20, activation='sigmoid')) # Hidden 2
    #model.add(Dense(30, activation='relu')) # Hidden 3
    model.add(Dense(one_hot_stars.shape[1], activation='softmax')) # Output
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
    model.fit(x_train_ann_class, y_train_ann_class, validation_data=(x_test_ann_class,y_test_ann_class), callbacks=[monitor,checkpointer], verbose=2, epochs=100)  

    

model.load_weights('dnn/best_weights_fairclass4.hdf5')
pred = model.predict(x_test_ann_class)
pred = np.argmax(pred,axis=1)
y_true= np.argmax(y_test_ann_class,axis=1) 
score = metrics.accuracy_score(y_true, pred)
print("Accuracy score: {}".format(score))
print("Confusion Matrix: {}".format(metrics.confusion_matrix(y_true, pred)))

Train on 141444 samples, validate on 47149 samples
Epoch 1/100
 - 10s - loss: 1.3624 - val_loss: 1.2511
Epoch 2/100
 - 9s - loss: 1.2220 - val_loss: 1.2266
Epoch 3/100
 - 9s - loss: 1.2008 - val_loss: 1.2130
Epoch 4/100
 - 9s - loss: 1.1833 - val_loss: 1.1974
Epoch 5/100
 - 9s - loss: 1.1680 - val_loss: 1.1864
Epoch 6/100
 - 9s - loss: 1.1564 - val_loss: 1.1833
Epoch 7/100
 - 9s - loss: 1.1481 - val_loss: 1.1776
Epoch 8/100
 - 9s - loss: 1.1411 - val_loss: 1.1755
Epoch 9/100
 - 9s - loss: 1.1355 - val_loss: 1.1775
Epoch 10/100
 - 9s - loss: 1.1307 - val_loss: 1.1777
Epoch 11/100
 - 9s - loss: 1.1264 - val_loss: 1.1680
Epoch 12/100
 - 9s - loss: 1.1230 - val_loss: 1.1700
Epoch 13/100
 - 9s - loss: 1.1193 - val_loss: 1.1724
Epoch 14/100
 - 9s - loss: 1.1166 - val_loss: 1.1682
Epoch 15/100
 - 9s - loss: 1.1138 - val_loss: 1.1672
Epoch 16/100
 - 8s - loss: 1.1111 - val_loss: 1.1716
Epoch 00016: early stopping
Train on 141444 samples, validate on 47149 samples
Epoch 1/100
 - 10s - loss: 1.3

In [73]:
x_train_ann_class, x_test_ann_class, y_train_ann_class, y_test_ann_class = train_test_split(x, one_hot_stars, test_size=0.25, random_state=42)
checkpointer = ModelCheckpoint(filepath="dnn/best_weights_fairclass1.hdf5", verbose=0, save_best_only=True) # save best model

for i in range(5):
    model = Sequential()
    model.add(Dense(100, input_dim=x.shape[1], activation='relu')) # Hidden 1
    model.add(Dense(85, activation='relu')) # Hidden 2
    model.add(Dense(65, activation='relu')) # Hidden 3
    model.add(Dense(one_hot_stars.shape[1], activation='softmax')) # Output
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
    model.fit(x_train_ann_class, y_train_ann_class, validation_data=(x_test_ann_class,y_test_ann_class), callbacks=[monitor,checkpointer], verbose=2, epochs=100)  

    

model.load_weights('dnn/best_weights_fairclass1.hdf5')
pred = model.predict(x_test_ann_class)
pred = np.argmax(pred,axis=1)
y_true= np.argmax(y_test_ann_class,axis=1) 
score = metrics.accuracy_score(y_true, pred)
print("Accuracy score: {}".format(score))
print("Confusion Matrix: {}".format(metrics.confusion_matrix(y_true, pred)))

Train on 141444 samples, validate on 47149 samples
Epoch 1/100
 - 23s - loss: 1.2398 - val_loss: 1.1882
Epoch 2/100
 - 20s - loss: 1.1542 - val_loss: 1.1591
Epoch 3/100
 - 21s - loss: 1.1224 - val_loss: 1.1534
Epoch 4/100
 - 21s - loss: 1.0995 - val_loss: 1.1553
Epoch 5/100
 - 21s - loss: 1.0770 - val_loss: 1.1590
Epoch 6/100
 - 20s - loss: 1.0550 - val_loss: 1.1533
Epoch 7/100
 - 22s - loss: 1.0337 - val_loss: 1.1703
Epoch 8/100
 - 21s - loss: 1.0155 - val_loss: 1.1971
Epoch 00008: early stopping
Train on 141444 samples, validate on 47149 samples
Epoch 1/100
 - 25s - loss: 1.2359 - val_loss: 1.1953
Epoch 2/100
 - 24s - loss: 1.1492 - val_loss: 1.1623
Epoch 3/100
 - 29s - loss: 1.1193 - val_loss: 1.1483
Epoch 4/100
 - 22s - loss: 1.0932 - val_loss: 1.1434
Epoch 5/100
 - 25s - loss: 1.0710 - val_loss: 1.1541
Epoch 6/100
 - 25s - loss: 1.0495 - val_loss: 1.1712
Epoch 7/100
 - 24s - loss: 1.0292 - val_loss: 1.1819
Epoch 8/100
 - 20s - loss: 1.0094 - val_loss: 1.1799
Epoch 9/100
 - 23s - l