In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
import datetime as dt

In [4]:
# ---- Training split ----
# The first CSV column is the exported index, so it is sliced away.
X_train = pd.read_csv("X_train.csv", sep=",").iloc[:, 1:]

# Labels: drop the index column, pick the rating column, cast it to a
# categorical dtype and wrap it back into a single-column DataFrame.
Y_train = pd.DataFrame(
    pd.read_csv("Y_train.csv", sep=",")
    .iloc[:, 1:]["Rating as Factor"]
    .astype("category"),
    columns=["Rating as Factor"],
)

# ---- Test split (same treatment as the training split) ----
X_test = pd.read_csv("X_test.csv", sep=",").iloc[:, 1:]

Y_test = pd.DataFrame(
    pd.read_csv("Y_test.csv", sep=",")
    .iloc[:, 1:]["Rating as Factor"]
    .astype("category"),
    columns=["Rating as Factor"],
)

In [5]:
# Standardize the feature matrix.
# The scaler statistics are learned from the training features only and the
# same fitted scaler is then applied to both splits, so the test set is
# transformed but never used for fitting.
stdsc = StandardScaler().fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

In [50]:
def neural(x_train, y_train, x_test, y_test):
    """Fit a default MLPClassifier and print its accuracy on the test set.

    Single-column 2-D label inputs (e.g. a one-column DataFrame) are
    flattened to 1-D before fitting and scoring, so scikit-learn no longer
    emits the ``column_or_1d`` DataConversionWarning. Labels of any other
    shape are passed through untouched.

    Args:
        x_train: train feature matrix
        y_train: train class labels (1-D, or 2-D with a single column)
        x_test: test feature matrix
        y_test: test class labels (same layout as y_train)

    Returns:
        None. The test accuracy and the elapsed wall-clock time are printed.
    """
    def _flatten_single_column(y):
        # Only an (n_samples, 1) layout is flattened; everything else is
        # handed to scikit-learn exactly as the caller supplied it.
        arr = np.asarray(y)
        if arr.ndim == 2 and arr.shape[1] == 1:
            return arr.ravel()
        return y

    # Measure starting time (covers fitting and scoring)
    start = dt.datetime.now()

    y_train_1d = _flatten_single_column(y_train)
    y_test_1d = _flatten_single_column(y_test)

    # Create classifier object and fit it to data.
    # The local is deliberately not called `neural`, which would shadow
    # this function's own name inside its body.
    clf = MLPClassifier(random_state=0)
    clf.fit(x_train, y_train_1d)

    # Print test score
    print('Test accuracy: {0: .4f}'.format(clf.score(x_test, y_test_1d)))

    # Measure ending time
    end = dt.datetime.now()

    # Print time passed for code execution
    print("Time needed: " + str(end - start))

In [51]:
# Baseline run: train and evaluate on the raw (unscaled) feature matrices
neural(x_train=X_train, y_train=Y_train, x_test=X_test, y_test=Y_test)

  y = column_or_1d(y, warn=True)


Test accuracy:  0.4476
Time needed: 0:00:07.533557


In [42]:
# Same run on the standardized feature matrices; this gave the better
# test accuracy in the recorded outputs
neural(x_train=X_train_std, y_train=Y_train, x_test=X_test_std, y_test=Y_test)

  y = column_or_1d(y, warn=True)


Test accuracy:  0.8247
Time needed: 0:00:20.763870




In [45]:
# Candidate values for the param_grid used inside GridSearchCV.
# Each array entry is ONE candidate setting to try, not one layer of a
# single network.

# Candidates for max_iter (maximum number of iterations)
max_Iter = np.asarray([550])

# Candidates for hidden_layer_sizes
hidden_lay = np.asarray([400])

In [53]:
def neural_g(x_train, y_train, x_test, y_test, hidden_layer, maxIter):
    """Run GridSearchCV over an MLPClassifier and print the results.

    Prints the best cross-validation accuracy, the accuracy of the refitted
    best model on the test set, the best parameter combination and the
    elapsed wall-clock time.

    Single-column 2-D label inputs (e.g. a one-column DataFrame) are
    flattened to 1-D before fitting and scoring, so scikit-learn no longer
    emits the ``column_or_1d`` DataConversionWarning. Labels of any other
    shape are passed through untouched.

    Args:
        x_train: train feature matrix
        y_train: train class labels (1-D, or 2-D with a single column)
        x_test: test feature matrix
        y_test: test class labels (same layout as y_train)
        hidden_layer: array of candidate values for hidden_layer_sizes
        maxIter: array of candidate values for max_iter

    Returns:
        None. All results are printed.
    """
    def _flatten_single_column(y):
        # Only an (n_samples, 1) layout is flattened; everything else is
        # handed to scikit-learn exactly as the caller supplied it.
        arr = np.asarray(y)
        if arr.ndim == 2 and arr.shape[1] == 1:
            return arr.ravel()
        return y

    # Measure starting time
    start = dt.datetime.now()

    y_train_1d = _flatten_single_column(y_train)
    y_test_1d = _flatten_single_column(y_test)

    # Define the hyperparameter values to be tested.
    # This must be a plain dict: a trailing comma after the closing brace
    # would silently turn it into a 1-tuple holding the dict.
    param_grid = {"hidden_layer_sizes": hidden_layer,
                  "max_iter": maxIter}

    # Run brute-force grid search
    # solver "lbfgs" has proven to be the best
    # NOTE(review): GridSearchCV presumably fits fresh clones of the
    # estimator, in which case warm_start=True has no effect here; it is
    # kept to leave the estimator configuration unchanged - confirm before
    # removing.
    gs = GridSearchCV(estimator=MLPClassifier(random_state=0, solver="lbfgs", warm_start=True),
                      param_grid=param_grid,
                      scoring='accuracy',
                      cv=5, n_jobs=-1)
    gs = gs.fit(x_train, y_train_1d)

    print('Best CV accuracy: {:.2f}'.format(gs.best_score_))
    print('Test score:       {:.2f}'.format(gs.score(x_test, y_test_1d)))
    print('Best parameters: {}'.format(gs.best_params_))

    # Measure ending time
    end = dt.datetime.now()

    # Print time passed for code execution
    print("Time needed: " + str(end - start))

In [48]:
# Grid search on the standardized features with the candidate arrays above
neural_g(
    x_train=X_train_std,
    y_train=Y_train,
    x_test=X_test_std,
    y_test=Y_test,
    hidden_layer=hidden_lay,
    maxIter=max_Iter,
)

  y = column_or_1d(y, warn=True)


Best CV accuracy: 0.96
Test score:       0.97
Best parameters: {'hidden_layer_sizes': 400, 'learning_rate': 'constant', 'max_iter': 550}
Time needed: 0:10:27.147516
