In [1]:
# Importing the libraries
import tqdm
import numpy as np
import pandas as pd
import pickle as pk
from time import time
from data import readTrainDataV0, readTestDataV0

In [2]:
import logging

logging.basicConfig(level=logging.DEBUG, format='%(asctime)-20s %(message)s', datefmt="%Y-%m-%d %H:%M:%S")
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

In [3]:
DATA_V0  = "../data/v0"
RES_OUT  = "../results/baseline"

In [4]:
class ModelBase:
    def __init__(self):
        pass
    
    def fit(self, x, y):
        """ Fit a row of data (x, y) """
        pass

    def prepare(self):
        """ Optional. Called between fit and predict """
        pass

    def predict(self, X):
        """ Predict y from x """
        pass

In [5]:
class ZeroPrediction(ModelBase):
    def predict(self, X): return np.zeros((X[1], X[1]))

In [6]:
class KNN(ModelBase):
    '''Find nearest neighbor, use its matrix '''
    def __init__(self):
        self.D = {}
    
    def fit(self,X,y):
        n = X[1]
        if n not in self.D:
            self.D[n] = []
        self.D[n].append((X[3],y))
    
    def predict(self,X):
        n = X[1]
        if n not in self.D:
            # set n_search to be the smallest key >= n
            keys = sorted(self.D.keys())
            if keys[-1] < n:
                logger.debug('alert: zero prediction!!')
                return np.zeros((n,n))
            else:
                logger.debug(f"alert: cropped prediction!! {n}")
                for k in range(len(keys)):
                    if keys[k] >= n:
                        n_search = keys[k]
                        break
        else:
            n_search = n
        
        Data = self.D[n_search]
        best_matrix = None
        best_LCS = 0
        # Find the most similar matrix
        for k in range(len(Data)):
            s, matrix = Data[k]
            cnt_LCS = LCS(s,X[3])
            if cnt_LCS >= best_LCS:
                best_matrix = matrix
                best_LCS = cnt_LCS
        return best_matrix[0:n,0:n]   

In [7]:
class Avg_Distance(ModelBase):
    '''calculate the average of each distance'''
    def __init__(self):
        self.data = {}
    
    def fit(self,x,y):
        n = y.shape[0]
        for i in range(n):
            for j in range(i,n):
                Distance = j - i
                Value = y[i,j]
                
                if Distance in self.data:
                    avgValue, count = self.data[Distance]
                    self.data[Distance] = ((avgValue*count+Value)/(count+1),count+1)
                else:
                    self.data[Distance] = (Value,1)
    
    # called once before predict, optional
    def prepare(self):
        # set -1 to the average value of the largest distance
        # used when encounter a new distance
        self.data[-1] = self.data[max(self.data.keys())]
    
    def predict(self,x):
        n = x[1]
        y = np.zeros((n,n))
        for i in range(n):
            for j in range(i + 1, n):
                Dis = j - i
                if Dis in self.data:
                    Val = self.data[Dis][0]
                else:
                    Val = self.data[-1][0]
                y[i,j] = y[j,i] = Val
        return y


In [8]:
class Avg_Matrix(ModelBase):
    ''' Find matrices with similar dimension
        return the average of all these matrices '''
    def __init__(self):
        self.D = {}
    
    def fit(self,X,y):
        n = X[1]
        if n not in self.D:
            self.D[n] = y, 1
        else:
            matrix, count = self.D[n]
            self.D[n] = (matrix*count+y)/(count + 1), count + 1
    
    def predict(self,X):
        n = X[1]
        if n not in self.D:
            # set n_search to be the smallest key >= n
            keys = sorted(self.D.keys())
            if keys[-1] < n:
                logger.debug('alert: zero prediction!!')
                return np.zeros((n,n))
            else:
                logger.debug(f"alert: cropped prediction!! {n}")
                for k in range(len(keys)):
                    if keys[k] >= n:
                        n_search = keys[k]
                        break
        else:
            n_search = n
        # return the average of all matrices of length n_search
        # than crop it to [0:n,0:n]
        return self.D[n_search][0][0:n,0:n]

In [9]:
def LCS(str1,str2):
    n1 = len(str1)
    n2 = len(str2)
    D = np.zeros((n1+1,n2+1),dtype=int)
    for i in range(0,n1):
        for j in range(0,n2):
            # calculate D[i+1][j+1]
            D[i+1][j+1] = max(D[i][j+1], D[i+1][j])
            if str1[i] == str2[j]:
                D[i+1][j+1] = max(D[i+1][j+1],1+D[i][j])
    return D[n1][n2]

def train_validate(model_class):
    ''' Train/Test on 80/20 split on v0 data '''
    model = model_class()

    logger.info(f"Training {model_class.__name__}")
    n, X, Y = readTrainDataV0('80')
    for k in tqdm.tqdm(range(n)):
        model.fit(X[k],Y[k])
    model.prepare()
    
    logger.info(f"Running {model_class.__name__} predictions ")
    n, X, Y = readTrainDataV0('20')
    SE = []
    for k in tqdm.tqdm(range(n)):
        y_pred = model.predict(X[k])
        SE.append(np.sum((y_pred - Y[k])**2))
    RMSE = np.sqrt(np.mean(SE))
    
    print(f"{model_class.__name__:20s} : {RMSE:.2f}")
    return RMSE

def train_test(model_class):
    ''' Train on 100% data. Predict on test data '''
    model = model_class()
    
    logger.info(f"Training {model_class.__name__}")
    n, X, Y = readTrainDataV0('100')
    for k in tqdm.tqdm(range(n)):
        model.fit(X[k],Y[k])
    model.prepare()
    
    logger.info(f"Running {model_class.__name__} predictions ")
    n, X = readTestDataV0()
    team_6 = []
    for k in tqdm.tqdm(range(n)):
        y_pred = model.predict(X[k])
        team_6.append(y_pred)
    np.savez(f"{RES_OUT}/{model_class.__name__}.npz", *team_6)

In [10]:
if __name__ == '__main__':
    # Results = ZeroPrediction: 6952.77, KNN:3576.94, Avg_Matrix:3024.67, Avg_Distance:2567.87
    models = [ZeroPrediction, KNN, Avg_Matrix, Avg_Distance]
    
    logger.info("Training/Validating on training data")
    with open("./baseline_log.log", "w") as log_file:
        for model in models:
            RMSE = train_validate(model)
            log_file.write(f"{model.__name__:20s} : {RMSE:.2f}\n")
        
    logger.info("Generating test dataset evaluations")
    for model in models: train_test(model)

2019-01-01 18:29:51  Training/Validating on training data
2019-01-01 18:29:51  Training ZeroPrediction
100%|█████████████████████████████████████████████████████████████████████████| 3643/3643 [00:00<00:00, 1826422.36it/s]
2019-01-01 18:29:51  Running ZeroPrediction predictions 
100%|██████████████████████████████████████████████████████████████████████████████| 911/911 [00:00<00:00, 3160.12it/s]


ZeroPrediction       : 6952.77


2019-01-01 18:29:52  Training KNN
100%|█████████████████████████████████████████████████████████████████████████| 3643/3643 [00:00<00:00, 1221020.41it/s]
2019-01-01 18:29:52  Running KNN predictions 
100%|████████████████████████████████████████████████████████████████████████████████| 911/911 [04:30<00:00,  2.75it/s]


KNN                  : 3576.94


2019-01-01 18:34:23  Training Avg_Matrix
100%|███████████████████████████████████████████████████████████████████████████| 3643/3643 [00:00<00:00, 16674.40it/s]
2019-01-01 18:34:24  Running Avg_Matrix predictions 
100%|██████████████████████████████████████████████████████████████████████████████| 911/911 [00:00<00:00, 7549.06it/s]


Avg_Matrix           : 3024.67


2019-01-01 18:34:24  Training Avg_Distance
100%|██████████████████████████████████████████████████████████████████████████████| 3643/3643 [01:24<00:00, 41.68it/s]
2019-01-01 18:35:49  Running Avg_Distance predictions 
100%|███████████████████████████████████████████████████████████████████████████████| 911/911 [00:07<00:00, 125.18it/s]


Avg_Distance         : 2567.87


2019-01-01 18:35:57  Generating test dataset evaluations
2019-01-01 18:35:57  Training ZeroPrediction
100%|█████████████████████████████████████████████████████████████████████████| 4554/4554 [00:00<00:00, 2249806.88it/s]
2019-01-01 18:35:58  Running ZeroPrediction predictions 
100%|█████████████████████████████████████████████████████████████████████████████| 224/224 [00:00<00:00, 13221.37it/s]
2019-01-01 18:35:58  Training KNN
100%|█████████████████████████████████████████████████████████████████████████| 4554/4554 [00:00<00:00, 1522830.30it/s]
2019-01-01 18:35:59  Running KNN predictions 
100%|████████████████████████████████████████████████████████████████████████████████| 224/224 [01:27<00:00,  2.29it/s]
2019-01-01 18:37:26  Training Avg_Matrix
100%|███████████████████████████████████████████████████████████████████████████| 4554/4554 [00:00<00:00, 15998.11it/s]
2019-01-01 18:37:27  Running Avg_Matrix predictions 
100%|██████████████████████████████████████████████████████████████