In [1]:
import abc

In [2]:
class method:
    """
    MethodModule: base class for every learning method.

    Entries: method_name: the name of the MethodModule
             method_description: the textual description of the MethodModule
             data: payload attached to the method by the caller

             method_start_time: start running time of MethodModule
             method_stop_time: stop running time of MethodModule
             method_running_time: total running time of the MethodModule
             method_training_time: time cost of the training phase
             method_testing_time: time cost of the testing phase

    The class does not use abc.ABCMeta, so the @abc.abstractmethod marker
    on run() is informational only: instantiation is not blocked.
    """

    # identification
    method_name = method_description = None

    # payload handed over by the caller
    data = None

    # timing bookkeeping (all unset by default)
    method_start_time = method_stop_time = None
    method_running_time = None
    method_training_time = method_testing_time = None

    def __init__(self, mName=None, mDescription=None):
        """Store the method name and its textual description on the instance."""
        self.method_name, self.method_description = mName, mDescription

    @abc.abstractmethod
    def run(self, trainData, trainLabel, testData):
        """Placeholder to be overridden by concrete methods; returns None here."""
        return None


In [3]:

class dataset:
    """
    dataset: Abstract Class
    Entries: dataset_name: the name of the dataset
             dataset_descrition: the textual description of the dataset
                 (the attribute name is misspelled; it is kept as-is so that
                 existing code reading or writing it keeps working)
             dataset_description: correctly spelled alias of the above
             dataset_source_folder_path / dataset_source_file_name:
                 location of the raw data, set by the caller
             data: payload attached by the caller

    The class does not use abc.ABCMeta, so the @abc.abstractmethod marker
    on load() is informational only.
    """

    dataset_name = None
    dataset_descrition = None

    dataset_source_folder_path = None
    dataset_source_file_name = None

    data = None

    # initialization function
    def __init__(self, dName=None, dDescription=None):
        '''
        Parameters: dataset name: dName, dataset description: dDescription
        Assign the parameters to the entries of the base class
        '''
        self.dataset_name = dName
        self.dataset_descrition = dDescription

    @property
    def dataset_description(self):
        '''Correctly spelled alias for the dataset_descrition attribute.'''
        return self.dataset_descrition

    @dataset_description.setter
    def dataset_description(self, value):
        self.dataset_descrition = value

    # information print function
    def print_dataset_information(self):
        '''
        Print the basic information about the dataset class
        including the dataset name, and dataset description
        '''
        # str() keeps the output unchanged for string values while avoiding
        # a TypeError when a field still holds its default of None.
        print('Dataset Name: ' + str(self.dataset_name))
        print('Dataset Description: ' + str(self.dataset_descrition))

    # dataset load abstract function
    @abc.abstractmethod
    def load(self):
        '''Placeholder to be overridden by concrete loaders; returns None here.'''
        return
    
    

In [4]:

class evaluate:
    """
    evaluate: base class for evaluation metrics.

    Entries: evaluate_name: the name of the evaluator
             evaluate_description: the textual description of the evaluator
             data: payload to be scored, attached by the caller

    The class does not use abc.ABCMeta, so the @abc.abstractmethod marker
    on evaluate() is informational only.
    """

    evaluate_name = evaluate_description = None

    data = None

    def __init__(self, eName=None, eDescription=None):
        """Store the evaluator name and description on the instance."""
        self.evaluate_name, self.evaluate_description = eName, eDescription

    @abc.abstractmethod
    def evaluate(self):
        """Placeholder to be overridden by concrete evaluators; returns None here."""
        return None


In [5]:
class result:
    """
    ResultModule: base class for result persistence.

    Entries: result_name: the name of the result module
             result_description: filled from the constructor's rType argument
             result_destination_folder_path / result_destination_file_name:
                 where results go, set by the caller
             data: payload to be stored, attached by the caller

    The class does not use abc.ABCMeta, so the @abc.abstractmethod markers
    on save() and load() are informational only.
    """

    data = None

    result_name = result_description = None

    result_destination_folder_path = result_destination_file_name = None

    def __init__(self, rName=None, rType=None):
        """Store the name; rType is kept in result_description."""
        self.result_name, self.result_description = rName, rType

    @abc.abstractmethod
    def save(self):
        """Placeholder to be overridden by concrete savers; returns None here."""
        return None

    @abc.abstractmethod
    def load(self):
        """Placeholder to be overridden by concrete loaders; returns None here."""
        return None

In [6]:
class setting:
    """
    SettingModule: base class wiring a dataset, a method, a result saver
    and an evaluator together.

    Entries: setting_name: the name of the setting
             setting_description: the textual description of the setting
             dataset, method, result, evaluate: the four collaborating
                 objects, populated by prepare()

    The class does not use abc.ABCMeta, so the @abc.abstractmethod marker
    on load_run_save_evaluate() is informational only.
    """

    setting_name = setting_description = None

    dataset = method = None
    result = evaluate = None

    def __init__(self, sName=None, sDescription=None):
        """Store the setting name and description on the instance."""
        self.setting_name, self.setting_description = sName, sDescription

    def prepare(self, sDataset, sMethod, sResult, sEvaluate):
        """Attach the dataset, method, result and evaluate objects."""
        self.dataset, self.method = sDataset, sMethod
        self.result, self.evaluate = sResult, sEvaluate

    def print_setup_summary(self):
        """Print the names of all attached components on a single line."""
        summary_parts = (
            'dataset:', self.dataset.dataset_name,
            ', method:', self.method.method_name,
            ', setting:', self.setting_name,
            ', result:', self.result.result_name,
            ', evaluation:', self.evaluate.evaluate_name,
        )
        print(*summary_parts)

    @abc.abstractmethod
    def load_run_save_evaluate(self):
        """Placeholder to be overridden by concrete settings; returns None here."""
        return None

In [7]:

import csv

class Dataset_Loader(dataset):
    """
    Loads an all-integer CSV file where, on every row, the first column is
    the label and the remaining columns are the features.

    The file location is dataset_source_folder_path + dataset_source_file_name
    (plain string concatenation), both set by the caller after construction.
    """
    data = None
    dataset_source_folder_path = None
    dataset_source_file_name = None

    def __init__(self, dName=None, dDescription=None):
        super().__init__(dName, dDescription)

    def load(self):
        """
        Read the CSV file and split each row into label and features.

        Returns: {'X': list of feature lists, 'y': list of int labels}
        Raises:  ValueError if a cell cannot be parsed as an int;
                 OSError (e.g. FileNotFoundError) if the file cannot be opened.
        """
        print('loading data...')
        source_path = self.dataset_source_folder_path + self.dataset_source_file_name
        X = []
        y = []
        # newline='' is what the csv module asks for when handed a file object;
        # the with-block closes the file, so no explicit close() is needed.
        with open(source_path, 'r', newline='') as f:
            for row in csv.reader(f):
                if not row:
                    # csv.reader yields [] for blank lines (e.g. a trailing
                    # newline); there is no label to read from them.
                    continue
                elements = [int(i) for i in row]
                X.append(elements[1:])
                y.append(elements[0])
        return {'X': X, 'y': y}

In [8]:

from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score


class Evaluate_Accuracy(evaluate):
    """
    Classification metrics computed with scikit-learn.

    self.data must be a mapping with the keys 'true_y' and 'pred_y'
    before either method is called.
    """
    data = None

    def evaluate(self):
        """Return the accuracy of data['pred_y'] against data['true_y']."""
        truth, predicted = self.data['true_y'], self.data['pred_y']
        return accuracy_score(truth, predicted)

    def full_evaluate(self, average):
        """
        Print accuracy, then precision, recall and F1 computed with the
        given `average` argument (forwarded unchanged to scikit-learn).
        """
        scores = self.data
        truth, predicted = scores['true_y'], scores['pred_y']

        print('testing', self.evaluate_name)
        print("overall acc: ", accuracy_score(truth, predicted))

        # label text is part of the printed output and is kept verbatim
        averaged_metrics = (
            ("pre: ", precision_score),
            ("recal: ", recall_score),
            ("f1: ", f1_score),
        )
        for tag, metric in averaged_metrics:
            print(average, tag, metric(truth, predicted, average=average))

In [9]:

import torch
from torch import nn
import numpy as np


class Method_CNN(method, nn.Module):
    """
    Two-layer fully connected classifier: Linear(784, 80) -> CELU ->
    Linear(80, 10) -> Softmax. Despite the class name, no convolution
    layer is used.

    self.data is expected to be set by the caller before run():
        {'train': {'X': ..., 'y': ...}, 'test': {'X': ..., 'y': ...}}

    NOTE: train(X, y) below replaces nn.Module.train(mode). PyTorch's
    Module.eval() is implemented as self.train(False), so eval() should not
    be called on this class. The name is kept because callers use it.
    """
    data = None

    def __init__(self, mName, mDescription, _max_epoch, _learning_rate, device):
        """
        Parameters: mName, mDescription: forwarded to the `method` base class
                    _max_epoch: number of full-batch training iterations
                    _learning_rate: learning rate given to RMSprop
                    device: torch device the layers and tensors are moved to
        """
        nn.Module.__init__(self)
        method.__init__(self, mName, mDescription)
        self.device = device
        self.max_epoch = _max_epoch
        self.learning_rate = _learning_rate

        self.fc_layer_1 = nn.Linear(784, 80).to(self.device)
        self.activation_func_1 = nn.CELU()
        self.fc_layer_2 = nn.Linear(80, 10).to(self.device)
        self.activation_func_f = nn.Softmax(dim=1)

    def _logits(self, x):
        '''Return the un-normalized class scores (n x 10) for input x (n x 784).'''
        # hidden layer embeddings
        h = self.activation_func_1(self.fc_layer_1(x))
        return self.fc_layer_2(h)

    def forward(self, x):
        '''Forward propagation: return per-instance class probabilities.'''
        # the logits are an n x 10 tensor
        # n (the input instance number): 0th dimension; 10 (the class number): 1st dimension
        # softmax along dim=1 gives the normalized class probability distribution of each instance
        return self.activation_func_f(self._logits(x))

    # backward error propagation will be implemented by pytorch automatically
    # so we don't need to define the error backpropagation function here

    def train(self, X, y):
        """
        Train on the whole of (X, y) as a single batch for self.max_epoch
        iterations using RMSprop and cross-entropy loss.

        Returns: list with the training loss recorded every 100th epoch
                 (epochs 0, 100, 200, ...).
        """
        # check here for the torch.optim doc: https://pytorch.org/docs/stable/optim.html
        optimizer = torch.optim.RMSprop(self.parameters(), lr=self.learning_rate)
        # check here for the nn.CrossEntropyLoss doc: https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
        loss_function = nn.CrossEntropyLoss()
        # for training accuracy investigation purpose
        accuracy_evaluator = Evaluate_Accuracy('training evaluator', '')

        # we don't do mini-batch, we use the whole input as one batch;
        # the tensors are built once, outside the loop
        loss_list = []
        x = torch.FloatTensor(np.array(X)).to(self.device)
        y_true = torch.LongTensor(np.array(y)).to(self.device)
        for epoch in range(self.max_epoch):
            # nn.CrossEntropyLoss applies log-softmax internally and expects
            # raw logits; feeding it the softmax output of forward() would
            # normalize twice and flatten the gradients.
            logits = self._logits(x)
            train_loss = loss_function(logits, y_true)

            # reset the gradients, backpropagate, then update the parameters
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()

            if epoch % 100 == 0:
                loss_list.append(train_loss.item())
                # argmax over logits equals argmax over softmax probabilities
                accuracy_evaluator.data = {'true_y': y_true.cpu(), 'pred_y': logits.max(1)[1].cpu()}
                print('Epoch:', epoch, 'Accuracy:', accuracy_evaluator.evaluate(), 'Loss:', train_loss.item())
        return loss_list

    def test(self, X):
        """Return the predicted label (index of the largest probability) per instance of X."""
        # inference only: no autograd graph is needed for the argmax below
        with torch.no_grad():
            y_pred = self.forward(torch.FloatTensor(np.array(X)).to(self.device))
        return y_pred.max(1)[1]

    def run(self):
        """
        Train on self.data['train'], predict on self.data['test'].

        Returns: {'result': {'pred_y': predicted labels (CPU tensor),
                             'true_y': self.data['test']['y']},
                  'loss': loss list returned by train()}
        """
        print('method running...')
        print('--start training...')
        loss_list = self.train(self.data['train']['X'], self.data['train']['y'])
        print('--start testing...')
        pred_y = self.test(self.data['test']['X']).cpu()
        print(pred_y)
        print("pred: ", pred_y, " true: ", self.data['test']['y'])
        return {'result': {'pred_y': pred_y, 'true_y': self.data['test']['y']}, 'loss': loss_list}
            

In [10]:

import pickle


class Result_Saver(result):
    """
    Pickles self.data to
        result_destination_folder_path + result_destination_file_name + '_' + str(fold_count)

    The path is built by plain string concatenation, so the folder path may
    also carry a file-name prefix. All four attributes are set by the caller.
    """
    data = None
    fold_count = None
    result_destination_folder_path = None
    result_destination_file_name = None

    def save(self):
        """Write self.data to the destination file with pickle (binary mode)."""
        print('saving results...')
        destination = (self.result_destination_folder_path
                       + self.result_destination_file_name
                       + '_' + str(self.fold_count))
        # the with-block guarantees the handle is closed even if pickling fails
        with open(destination, 'wb') as f:
            pickle.dump(self.data, f)

In [11]:



from sklearn.model_selection import KFold
import numpy as np
import matplotlib.pyplot as pl

class Setting_KFold_CV(setting):
    """
    Experiment setting with a separate training set and test set.

    Despite the name, no K-fold split is performed: every round trains on
    the full training set and scores a small random sample of the test set.

    Entries: train, test: dataset objects exposing load() -> {'X': ..., 'y': ...}
             method, result, evaluate: collaborators attached by prepare()
             train_data, test_data: loaded data, filled by load_run_save_evaluate()
    """
    train = None
    test = None
    method = None
    result = None
    evaluate = None
    train_data = None
    test_data = None

    def __init__(self, sName=None, sDescription=None):
        super().__init__(sName, sDescription)

    def prepare(self, _train, _test, sMethod, sResult, sEvaluate):
        """Attach the train/test datasets, the method, the result saver and the evaluator."""
        self.train = _train
        self.test = _test
        self.method = sMethod
        self.result = sResult
        self.evaluate = sEvaluate

    def print_setup_summary(self):
        """Print the names of all attached components on a single line."""
        print('trainset:', self.train.dataset_name, 'testset:', self.test.dataset_name, ', method:', self.method.method_name,
              ', setting:', self.setting_name, ', result:', self.result.result_name, ', evaluation:',
              self.evaluate.evaluate_name)

    def load_run_save_evaluate(self, stage=1, test_sample_size=3):
        """
        Load both datasets, then run `stage` rounds of: method.run() on the
        full training set plus a random test sample, scored by the evaluator.
        The same method object is reused in every round. Finally plot the
        collected training losses.

        Parameters: stage: number of rounds
                    test_sample_size: test instances drawn (with replacement)
                        per round
        Returns: (mean, standard deviation) of the per-round scores
        Raises:  ValueError if the test set is empty
        """
        # load dataset
        self.train_data = self.train.load()
        self.test_data = self.test.load()

        # array conversion does not depend on the round, so do it once
        X_train = np.array(self.train_data['X'])
        y_train = np.array(self.train_data['y'])
        X_test_all = np.array(self.test_data['X'])
        y_test_all = np.array(self.test_data['y'])

        test_size = len(y_test_all)
        if test_size == 0:
            raise ValueError('test set is empty; nothing to sample from')

        score_list = []
        loss_list = []
        for _ in range(stage):
            # sample from the actual test-set size instead of a fixed 10000
            test_index = np.random.randint(test_size, size=test_sample_size)
            X_test, y_test = X_test_all[test_index], y_test_all[test_index]

            # run MethodModule
            self.method.data = {'train': {'X': X_train, 'y': y_train}, 'test': {'X': X_test, 'y': y_test}}
            learned_result = self.method.run()

            self.evaluate.data = learned_result['result']
            score_list.append(self.evaluate.evaluate())

            loss_list += learned_result['loss']

        # x axis assumes one loss value per 100 epochs
        pl.plot(range(0, len(loss_list) * 100, 100), loss_list, label='loss', color='purple')
        pl.show()
        return np.mean(score_list), np.std(score_list)

    def _evaluate_and_save(self, split, fold_count):
        """Predict on one loaded split, hand the result to the evaluator and pickle it."""
        pred_y = self.method.test(np.array(split['X'])).cpu()
        true_y = np.array(split['y'])
        self.evaluate.data = {'pred_y': pred_y, 'true_y': true_y}
        self.result.data = self.evaluate.data
        self.result.fold_count = fold_count
        self.result.save()

    def do_evaluate(self):
        """
        Predict on the full training set (saved with fold_count 98) and on
        the full test set (saved with fold_count 99).

        Returns: the evaluator's score on the full test set
        Raises:  RuntimeError if the datasets have not been loaded yet
        """
        if self.train_data is None or self.test_data is None:
            raise RuntimeError('no data loaded; call load_run_save_evaluate() before do_evaluate()')

        self._evaluate_and_save(self.train_data, 98)
        # the test split goes last so the evaluator is left holding its predictions
        self._evaluate_and_save(self.test_data, 99)
        return self.evaluate.evaluate()

In [12]:

import numpy as np
import torch
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# ---- Multi-Layer Perceptron script ----

# use the first visible GPU when CUDA is available, otherwise the CPU
cuda_available = torch.cuda.is_available()
device = torch.device("cuda:0" if cuda_available else "cpu")
print(cuda_available)

# ---- parameter section -------------------------------
# fixed seeds so sampling and weight initialisation are repeatable
np.random.seed(2)
torch.manual_seed(2)

learning_rate = 10e-5  # note: 10e-5 is the same value as 1e-4
epoch = 8000
stage = 1

# ---- object initialization section -------------------
trainset_obj = Dataset_Loader('stage2train', '')
trainset_obj.dataset_source_folder_path = '../../data/stage_2_data/'
trainset_obj.dataset_source_file_name = 'train.csv'

testset_obj = Dataset_Loader('stage2test', '')
testset_obj.dataset_source_folder_path = '../../data/stage_2_data/'
testset_obj.dataset_source_file_name = 'test.csv'

# the "folder path" ends in a file-name prefix; Result_Saver concatenates it as-is
result_obj = Result_Saver('saver', '')
result_obj.result_destination_folder_path = '../../result/stage_2_result/MLP_'
result_obj.result_destination_file_name = 'prediction_result'

setting_obj = Setting_KFold_CV('k fold cross validation', '')
evaluate_obj = Evaluate_Accuracy('accuracy', '')

# ---- running section ---------------------------------
method_obj = Method_CNN('multi-layer perceptron', '', epoch, learning_rate, device).to(device)

print('************ Start ************')
setting_obj.prepare(trainset_obj, testset_obj, method_obj, result_obj, evaluate_obj)
setting_obj.print_setup_summary()
mean_score, std_score = setting_obj.load_run_save_evaluate(stage)

print('************ Overall Performance ************')
print(f'MLP Accuracy: {mean_score!s} +/- {std_score!s}')

print('************ final evaluation ************')
performance = setting_obj.do_evaluate()
print(f'final performance: {performance!s}')
print('************ Finish ************')

    # ------------------------------------------------------
    

    

True
************ Start ************
trainset: stage2train testset: stage2test , method: multi-layer perceptron , setting: k fold cross validation , result: saver , evaluation: accuracy
loading data...


KeyboardInterrupt: 