In [134]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from matplotlib import pyplot as plt

regr = RandomForestRegressor(max_depth=2,random_state=0)

In [135]:
import csv
import os
import numpy as np
import pandas
dir_path = os.path.dirname(os.path.realpath('feature_selection.ipynb'))
datasets_folder = dir_path +'/datasets'

filenames = []
for file in os.listdir(datasets_folder):
    filenames.append(os.fsdecode(file))
print(filenames)

['2_201_S.csv', '2_137_N.csv', '1_137_S.csv', '1_120_N.csv', '2_221_S.csv', '2_120_S.csv', '1_115_S.csv', '2_235_N.csv', '2_127_S.csv', '1_112_S.csv', '1_101_N.csv', '2_247_S.csv', '2_213_N.csv', '1_103_N.csv', '1_127_N.csv', '2_224_S.csv', '2_132_N.csv', '2_239_S.csv', '1_142_N.csv', '2_213_S.csv', '1_103_S.csv', '1_127_S.csv', '2_224_N.csv', '2_235_S.csv', '2_127_N.csv', '1_112_N.csv', '1_101_S.csv', '2_247_N.csv', '2_132_S.csv', '2_239_N.csv', '1_142_S.csv', '1_124_S.csv', '1_137_N.csv', '1_119_S.csv', '2_201_N.csv', '2_137_S.csv', '2_221_N.csv', '2_120_N.csv', '1_115_N.csv', '1_120_S.csv']


In [109]:
from sklearn.metrics import mean_squared_error

def model_fit(model,X_train,X_test,y_train,y_test):
    model.fit(X_train,y_train)
    y_pred = model.predict(X_train)
    #y_pred = np.ceil(y_pred)
    mse = mean_squared_error(y_train,y_pred)
    
    #plt.plot(y_train,'ro')
    #plt.plot(y_pred,'bo')
    #plt.show()
    
    y_pred_test = model.predict(X_test)
    #y_pred_test = np.ceil(y_pred_test)
    
    return model.feature_importances_,mse,y_pred_test

# random forest

In [136]:
from sklearn.model_selection import train_test_split

def rf():
    sum_mse_train = 0
    sum_mse_test = 0
    for filename in filenames:
        filename_new = filename.replace('.csv','')
        line,stationID,stationDIR = filename_new.split('_')
        cur_dir = datasets_folder+'/'+filename
        data = pandas.read_csv(cur_dir,header=None)
        data.columns = ['month','date','hour','temp','pressure','humidity',
                       'wind_speed','wind_direction','clouds','weather_code',
                       'minute','delay']
        X = data[['hour','temp','pressure','humidity','wind_speed','clouds','weather_code','minute']]
        y = data.iloc[:,11]
        X_train,X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

        importances, mse, y_pred_test = model_fit(regr,X_train,X_test,y_train,y_test)
        sum_mse_train += mse
        sum_mse_test += mean_squared_error(y_test,y_pred_test)
        #print('new station')
        #print(importances)
    return sum_mse_train/len(filenames),sum_mse_test/len(filenames)

mse_train = 0
mse_test = 0
for i in range(10):
    train,test = rf()
    mse_train += train
    mse_test += test
print(mse_train/10)
print(mse_test/10)

3.8862287167203027
4.043676663597375


# parameter selection 

In [132]:
from sklearn.model_selection import GridSearchCV
import json

dict_para = {}
for filename in filenames:
    filename_new = filename.replace('.csv','')
    line,stationID,stationDIR = filename_new.split('_')
    cur_dir = datasets_folder+'/'+filename
    data = pandas.read_csv(cur_dir,header=None)
    data.columns = ['month','date','hour','temp','pressure','humidity',
                   'wind_speed','wind_direction','clouds','weather_code',
                   'minute','delay']
    X = data[['hour','temp','pressure','humidity','wind_speed','clouds','minute']]
    y = data.iloc[:,11]
    X_train,X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

    tuned_parameters = [{'kernel':['rbf'],
                         'gamma':[1e-3,1e-4,1e-2,0.1,1],
                         'alpha':[0.1,1,10,100,1000]}]

    clf = GridSearchCV(KernelRidge(),tuned_parameters,cv=5)
    clf.fit(X_train,y_train)
    
    dict_para[filename_new] = clf.best_params_
model_param_dir = dir_path+'/models/model_param/krr_param.json'


with open(model_param_dir, 'w') as fp:
    json.dump(dict_para, fp)

# kernel ridge regression 

In [148]:
from sklearn.kernel_ridge import KernelRidge

def krr_fit(X_train,X_test,y_train,y_test,key):
    with open(model_param_dir, 'r') as fp:
        data = json.load(fp)
    
    kernel_param = data[key]['kernel']
    gamma_param = data[key]['gamma']
    alpha_param = data[key]['alpha']
    
    clf = KernelRidge(alpha=alpha_param,kernel = kernel_param,gamma = gamma_param)
    clf.fit(X_train, y_train)
    
    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)
    train_size = y_train_pred.shape
    test_size = y_test_pred.shape
    #fool_train = np.zeros(train_size)
    #fool_test = np.zeros(test_size)
    
    mse_train = mean_squared_error(y_train,y_train_pred)
    mse_test = mean_squared_error(y_test,y_test_pred)
    
    return mse_train, mse_test

def krr():
    sum_mse_train = 0
    sum_mse_test = 0
    for filename in filenames:
        filename_new = filename.replace('.csv','')
        line,stationID,stationDIR = filename_new.split('_')
        cur_dir = datasets_folder+'/'+filename
        data = pandas.read_csv(cur_dir,header=None)
        data.columns = ['month','date','hour','temp','pressure','humidity',
                       'wind_speed','wind_direction','clouds','weather_code',
                       'minute','delay']
        X = data[['hour','temp','pressure','humidity','wind_speed','clouds','minute']]
        y = data.iloc[:,11]
        X_train,X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

        mse_train, mse_test = krr_fit(X_train,X_test,y_train,y_test,filename_new)
        sum_mse_train += mse_train
        sum_mse_test += mse_test
        #print('new station')
        #print(importances)
    return sum_mse_train/len(filenames),sum_mse_test/len(filenames)

mse_train = 0
mse_test = 0
for i in range(5):
    train,test = krr()
    mse_train += train
    mse_test += test
print(mse_train/5)
print(mse_test/5)

3.9800765558384086
4.099357077594718


In [None]:
LSTM (currently doing)

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout

def rnn_regr(X_train,X_test,y_train,y_test)
    #Initialize the RNN
    regressor = Sequential()
    regressor.add(LSTM(units=50,return_sequences=True, input_shape=(X_train.shape[1],1)))
    regressor.add(Dropout(0.2))

    # add a second LSTM layer
    regressor.add(LSTM(units = 50, return_sequences = True))
    regressor.add(Dropout(0.2))

    # add a third LSTM layer
    regressor.add(LSTM(units = 50, return_sequences = True))
    regressor.add(Dropout(0.2))

    # add a forth LSTM layer
    regressor.add(LSTM(units = 50, return_sequences = False))
    regressor.add(Dropout(0.2))

    # add a output layer
    regressor.add(Dense(units = 1))

    # Compiling the RNN
    regressor.compile(optimizer = 'adam', loss = 'mean_squared_error')

    # Fitting the RNN to the Training set
    regressor.fit(X_train, y_train, epochs = 100, batch_size = 32)
    
    regressor.predict()

In [153]:
from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler(feature_range = (0,1))

def dataset_reshape(scaled):
    X_train = []
    y_train = []
    
    length = scaled.shape[0]
    for i in range(60,length):
        X_train.append(scaled[i-60:i,0])
        y_train.append(scaled[i,0])
    X_train,y_train = np.array(X_train),np.array(y_train)
    X_train = np.reshape(X_train,(X_train.shape[0],X_train.shape[1],1))
    return X_train,y_train

for filename in filenames:
    filename_new = filename.replace('.csv','')
    line,stationID,stationDIR = filename_new.split('_')
    cur_dir = datasets_folder+'/'+filename
    data = pandas.read_csv(cur_dir,header=None)
    data.columns = ['month','date','hour','temp','pressure','humidity',
                   'wind_speed','wind_direction','clouds','weather_code',
                   'minute','delay']
    training_set = data.iloc[:,11:].values
    #training_set_reshape = training_set.reshape(-1,1)
    data_scaled = sc.fit_transform(training_set)
    train, test = train_test_split(data_scaled,test_size=0.1)
    X_train,y_train = dataset_reshape(train)
    X_test,y_test = dataset_reshape(test)
    
    
    
    
    
    
    



(1202, 1)
(134, 1)
(1326, 1)
(148, 1)
(2029, 1)
(226, 1)
(1979, 1)
(220, 1)
(1374, 1)
(153, 1)
(1385, 1)
(154, 1)
(1971, 1)
(220, 1)
(1224, 1)
(136, 1)
(1384, 1)
(154, 1)
(1913, 1)
(213, 1)
(2102, 1)
(234, 1)
(1265, 1)
(141, 1)
(1347, 1)
(150, 1)
(2022, 1)
(225, 1)
(1980, 1)
(220, 1)
(1369, 1)
(153, 1)
(1332, 1)
(149, 1)
(1285, 1)
(143, 1)
(1952, 1)
(217, 1)
(1369, 1)
(153, 1)
(1917, 1)
(213, 1)
(2004, 1)
(223, 1)
(1332, 1)
(148, 1)
(1285, 1)
(143, 1)
(1341, 1)
(150, 1)
(2166, 1)
(241, 1)
(1781, 1)
(198, 1)
(1168, 1)
(130, 1)
(1386, 1)
(154, 1)
(1229, 1)
(137, 1)
(2033, 1)
(226, 1)
(1945, 1)
(217, 1)
(1985, 1)
(221, 1)
(1980, 1)
(221, 1)
(1204, 1)
(134, 1)
(1392, 1)
(155, 1)
(1346, 1)
(150, 1)
(1342, 1)
(150, 1)
(2182, 1)
(243, 1)
(1987, 1)
(221, 1)
