In [1]:
#import libraries
import numpy as np
import pandas as pd

In [2]:
#all stock code
# Stock codes to process; each code is used as the prefix of a CSV file name
# ("<code>.KL.csv" for raw input, "<code>.clean.csv" for cleaned output).
excel = [
    "5014", "1015", "6888", "1023", "7277", "6947",
    "4715", "3182", "3034", "5168", "5819", "1082",
    "5225", "1961", "2445", "6012", "1155", "3816",
    "4707", "1295", "5183", "5681", "6033", "8869",
    "4065", "1066", "4197", "5285", "5347", "7113",
]

    

In [3]:
#Data Cleaning - remove empty rows for each excel
for stockCode in excel:

    # Load the raw price file for this stock code
    rawFrame = pd.read_csv(stockCode + ".KL.csv")

    # dropna() discards every row that contains at least one missing value
    cleanFrame = rawFrame.dropna()

    # Report how many rows were discarded for this stock
    droppedCount = len(rawFrame) - len(cleanFrame)
    print(stockCode + " > Total of %s rows removed" % droppedCount)

    # Write the cleaned rows to a new CSV file, without the index column
    cleanFrame.to_csv(stockCode + ".clean.csv", index=False)

5014 > Total of 0 rows removed
1015 > Total of 2694 rows removed
6888 > Total of 0 rows removed
1023 > Total of 0 rows removed
7277 > Total of 0 rows removed
6947 > Total of 0 rows removed
4715 > Total of 0 rows removed
3182 > Total of 0 rows removed
3034 > Total of 0 rows removed
5168 > Total of 0 rows removed
5819 > Total of 0 rows removed
1082 > Total of 0 rows removed
5225 > Total of 0 rows removed
1961 > Total of 0 rows removed
2445 > Total of 0 rows removed
6012 > Total of 0 rows removed
1155 > Total of 0 rows removed
3816 > Total of 0 rows removed
4707 > Total of 0 rows removed
1295 > Total of 0 rows removed
5183 > Total of 0 rows removed
5681 > Total of 0 rows removed
6033 > Total of 0 rows removed
8869 > Total of 0 rows removed
4065 > Total of 0 rows removed
1066 > Total of 0 rows removed
4197 > Total of 0 rows removed
5285 > Total of 0 rows removed
5347 > Total of 0 rows removed
7113 > Total of 0 rows removed


In [4]:
#Data Preprocessing - prepare Data for LSTM Modelling

#split training and testing set
def splitTrainTest(df,num,dump = None):
    """Split ``df`` into a leading training part and a trailing test part.

    Parameters
    ----------
    df : sliceable sequence with a length (e.g. a pandas DataFrame)
        Rows in chronological order; the split is purely positional.
    num : int
        The test part starts ``num`` rows before the end of ``df``.
        ``num == 0`` yields an empty test part and keeps every row for
        training. Values larger than ``len(df)`` put every row in the test
        part.
    dump : int or None, optional
        End bound of the test slice (regular slice semantics, so a negative
        value drops rows from the end). ``None`` keeps the test part running
        to the last row.

    Returns
    -------
    tuple
        ``(trainSet, testSet)``.

    Raises
    ------
    ValueError
        If ``num`` is negative.
    """
    if num < 0:
        raise ValueError("num must be non-negative, got %s" % num)

    # Use an explicit split position instead of negative indices: because
    # -0 == 0, df[0:-0] would be empty and df[-0:] would be the whole frame,
    # i.e. num == 0 would put every row in the test set.
    splitAt = max(len(df) - num, 0)

    trainSet = df[0:splitAt]
    testSet = df[splitAt:dump]
    return (trainSet, testSet)

#prepare X_train and y_train to train LSTM Model
def prepareTrainSet(trainSet, timesteps = 60):
    """Build scaled sliding-window training arrays from ``trainSet``.

    Only the column at position 4 is used (described in this notebook as the
    closing price). It is scaled to the range [0, 1] with a MinMaxScaler
    fitted on this training data, then cut into overlapping windows: each
    sample holds ``timesteps`` consecutive scaled values and its target is
    the scaled value that immediately follows the window.

    Parameters
    ----------
    trainSet : pandas.DataFrame
        Training rows; must have a column at position 4 and more than
        ``timesteps`` rows.
    timesteps : int, optional
        Number of consecutive values per input window. Defaults to 60.

    Returns
    -------
    tuple
        ``(X_train, y_train, sc)`` where ``X_train`` has shape
        ``(n_samples, timesteps, 1)``, ``y_train`` has shape ``(n_samples,)``
        and ``sc`` is the fitted scaler, returned so the test set can be
        transformed with the same scaling.

    Raises
    ------
    ValueError
        If ``timesteps`` is smaller than 1 or ``trainSet`` does not have
        more than ``timesteps`` rows (no window could be built).
    """
    # Imported locally so the function carries its own dependency
    from sklearn.preprocessing import MinMaxScaler

    if timesteps < 1:
        raise ValueError("timesteps must be at least 1, got %s" % timesteps)

    # Single-column frame (position 4) kept 2-D, as the scaler expects
    xTrain = trainSet.iloc[:,4:5]

    # Without this check an empty window list reaches np.reshape and fails
    # with an unhelpful "tuple index out of range" IndexError.
    rowCount = xTrain.shape[0]
    if rowCount <= timesteps:
        raise ValueError(
            "trainSet needs more than %s rows to build a window, got %s"
            % (timesteps, rowCount)
        )

    # Feature scaling
    sc = MinMaxScaler(feature_range = (0, 1))
    training_set_scaled = sc.fit_transform(xTrain)

    # One window of `timesteps` values per sample; the target is the value
    # right after the window
    windows = [training_set_scaled[i-timesteps:i, 0]
               for i in range(timesteps, rowCount)]
    X_train = np.array(windows)
    y_train = training_set_scaled[timesteps:, 0].copy()

    # Reshape X_train to 3-D: (samples, timesteps, 1 feature)
    X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))

    # Note: sc is returned for the test-set preparation to consume
    return(X_train, y_train, sc)

#prepare Test Set for LSTM verification
def prepareTestSet(df,testSet,sc,timesteps = 60):
    """Build sliding-window test inputs and the matching actual values.

    Only the column at position 4 is used (described in this notebook as the
    closing price). The last ``len(testSet) + timesteps`` rows of ``df`` are
    scaled with ``sc`` and cut into one window of ``timesteps`` values per
    test row, so every test row is preceded by a full window of history.

    NOTE(review): the rows are taken from the end of ``df``, so this assumes
    ``testSet`` is the trailing part of ``df`` - confirm against callers
    that pass a ``dump`` bound to ``splitTrainTest``.

    Parameters
    ----------
    df : pandas.DataFrame
        Full data set (training and test rows) with a column at position 4.
    testSet : pandas.DataFrame
        Test rows with a column at position 4.
    sc : object with a ``transform`` method
        Scaler already fitted on the training data.
    timesteps : int, optional
        Number of consecutive values per input window. Defaults to 60.

    Returns
    -------
    tuple
        ``(X_test, actualStockPrice)`` where ``X_test`` has shape
        ``(len(testSet), timesteps, 1)`` and ``actualStockPrice`` holds the
        unscaled column-4 values of ``testSet`` with shape
        ``(len(testSet), 1)``. An empty ``testSet`` yields an ``X_test`` of
        shape ``(0, timesteps, 1)``.

    Raises
    ------
    ValueError
        If ``timesteps`` is smaller than 1 or ``df`` has fewer than
        ``len(testSet) + timesteps`` rows.
    """
    if timesteps < 1:
        raise ValueError("timesteps must be at least 1, got %s" % timesteps)

    # Column at position 4 only, kept 2-D
    dataset_total = df.iloc[:,4:5]

    # Start `timesteps` rows before the test period so the first test row
    # already has a full window. A negative start would wrap around to the
    # end of the data and surface later as an opaque IndexError, so reject it.
    start = len(dataset_total) - len(testSet) - timesteps
    if start < 0:
        raise ValueError(
            "df needs at least %s rows (test rows + timesteps), got %s"
            % (len(testSet) + timesteps, len(dataset_total))
        )
    inputs = dataset_total.iloc[start:].values

    # Scale with the scaler fitted on the training data (not refitted here)
    inputs = inputs.reshape(-1,1)
    inputs = sc.transform(inputs)

    # One window of `timesteps` scaled values per test row
    windows = [inputs[i-timesteps:i, 0]
               for i in range(timesteps, inputs.shape[0])]

    # Reshape into a 3-D array: (samples, timesteps, 1 feature); giving the
    # shape explicitly keeps this well-defined when there are no windows
    X_test = np.array(windows, dtype = inputs.dtype)
    X_test = X_test.reshape(len(windows), timesteps, 1)

    # Actual (unscaled) values during the test period
    actualStockPrice = testSet.iloc[:,4:5].values
    return (X_test, actualStockPrice)

In [5]:
# main coding for LSTM training and testing
for stockCode in excel:

    # Load the cleaned price data for this stock code
    prices = pd.read_csv(stockCode + ".clean.csv")

    # Hold out the last 60 rows for testing; train on everything before them
    trainSet, testSet = splitTrainTest(prices, 60)
    X_train, y_train, sc = prepareTrainSet(trainSet)

    ## LSTM training code will be inserted here in the future

    # Build the test inputs with the scaler returned by the training step
    X_test, actualStockPrice = prepareTestSet(prices, testSet, sc)
    print(stockCode + " > Data Preparation Completed")

    
    

5014 > Data Preparation Completed
1015 > Data Preparation Completed
6888 > Data Preparation Completed
1023 > Data Preparation Completed
7277 > Data Preparation Completed
6947 > Data Preparation Completed
4715 > Data Preparation Completed
3182 > Data Preparation Completed
3034 > Data Preparation Completed
5168 > Data Preparation Completed
5819 > Data Preparation Completed
1082 > Data Preparation Completed
5225 > Data Preparation Completed
1961 > Data Preparation Completed
2445 > Data Preparation Completed
6012 > Data Preparation Completed
1155 > Data Preparation Completed
3816 > Data Preparation Completed
4707 > Data Preparation Completed
1295 > Data Preparation Completed
5183 > Data Preparation Completed
5681 > Data Preparation Completed
6033 > Data Preparation Completed
8869 > Data Preparation Completed
4065 > Data Preparation Completed
1066 > Data Preparation Completed
4197 > Data Preparation Completed
5285 > Data Preparation Completed
5347 > Data Preparation Completed
7113 > Data Pr