# SETUP

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import statsmodels.api as sm

  from numpy.core.umath_tests import inner1d


---

# load dataset

In [2]:
data_solar = pd.read_pickle('A_site_solar_Data_concat_02.pkl')
data_weather = pd.read_pickle('A_site_weather_Data_concat_02.pkl')
data_forecast = pd.read_csv('data_forecast_2018_02.csv')

In [15]:
data_forecast.tail()

Unnamed: 0.1,Unnamed: 0,time(prediction),time(46H later),temperature_3H,sky,precipitation_form,rainfall_probability,humidity,wind_speed,wind_direction
2915,2915,2018-12-31 11:00:00,2019-01-02 09:00:00,-2.0,1.0,0.0,0.0,70.0,5.1,313.0
2916,2916,2018-12-31 14:00:00,2019-01-02 12:00:00,-3.0,1.0,0.0,0.0,75.0,6.3,312.0
2917,2917,2018-12-31 17:00:00,2019-01-02 15:00:00,-3.0,1.0,0.0,0.0,85.0,6.9,326.0
2918,2918,2018-12-31 20:00:00,2019-01-02 18:00:00,-5.0,1.0,0.0,0.0,80.0,6.5,328.0
2919,2919,2018-12-31 23:00:00,2019-01-02 21:00:00,-6.0,1.0,0.0,0.0,80.0,5.7,333.0


In [26]:
# make forecast
data_forecast = pd.read_csv('data_forecast_2018_02.csv')
data_forecast.drop(['Unnamed: 0'], axis=1, inplace=True)

# make index
data_forecast.index = data_forecast['time(46H later)']
data_forecast.index = pd.to_datetime(data_forecast.index).values

# resample
data_forecast = data_forecast.resample('15min').interpolate(method='quadratic')

# drop columns
data_forecast.drop(['time(prediction)', 'time(46H later)', 'precipitation_form'], axis=1, inplace=True)
data_forecast.reset_index(inplace=True)

# change columns
data_forecast.columns = ['time', 'temperature_3H', 'sky', 
                         'rainfall_probability', 'humidity', 'wind_speed', 'wind_direction']

data_forecast.tail()

Unnamed: 0,time,temperature_3H,sky,rainfall_probability,humidity,wind_speed,wind_direction
35024,2019-01-02 20:00:00,-5.843447,1.000022,0.00022,78.984041,6.000419,330.615064
35025,2019-01-02 20:15:00,-5.899159,1.000019,0.000186,79.142784,5.928478,331.14396
35026,2019-01-02 20:30:00,-5.943821,1.000014,0.000138,79.365025,5.854428,331.717748
35027,2019-01-02 20:45:00,-5.977435,1.000008,7.6e-05,79.650764,5.778269,332.336428
35028,2019-01-02 21:00:00,-6.0,1.0,0.0,80.0,5.7,333.0


In [27]:
# data_forecast.to_pickle('data_forecast_2018_04.pkl')

---

In [31]:
data_forecast = pd.read_pickle('data_forecast_2018_8AM_01.pkl')

# make index
data_forecast.index = data_forecast['time(predicted)']

data_forecast.index = pd.to_datetime(data_forecast.index).values
data_forecast = data_forecast.resample('15min').interpolate(method='quadratic')

# drop columns
data_forecast.drop(['time(predicted)', 'day', 'forecast'], axis=1, inplace=True)
data_forecast.reset_index(inplace=True)

# change columns
data_forecast.columns = ['time', 'temperature_3H', 'sky', 
                         'rainfall_probability', 'humidity', 'wind_speed', 'wind_direction']

data_forecast.tail()

Unnamed: 0,time,temperature_3H,sky,rainfall_probability,humidity,wind_speed,wind_direction
35024,2019-01-01 08:00:00,-1.482848,2.646518,16.465177,59.970882,9.80571,304.590926
35025,2019-01-01 08:15:00,-1.813653,2.514249,15.142493,61.069181,9.836068,304.467344
35026,2019-01-01 08:30:00,-2.17678,2.362407,13.624069,62.273468,9.861902,304.327662
35027,2019-01-01 08:45:00,-2.572229,2.19099,11.909905,63.583741,9.883213,304.171881
35028,2019-01-01 09:00:00,-3.0,2.0,10.0,65.0,9.9,304.0


In [32]:
# data_forecast.to_pickle('data_forecast_2018_8AM_02.pkl')

### insolartion

In [3]:
data_solar.head()

Unnamed: 0,time,경사일사량,수평일사량
0,2018-03-01 00:15:00,0,0
1,2018-03-01 00:30:00,0,0
2,2018-03-01 00:45:00,0,0
3,2018-03-01 01:00:00,0,0
4,2018-03-01 01:15:00,0,0


### weather(actual)

In [4]:
data_weather.head()

Unnamed: 0,index,기온(°C),강수량(mm),풍속(m/s),풍향(16방위),습도(%),현지기압(hPa),일조(hr),일사(MJ/m2),적설(cm),전운량(10분위)
0,2018-03-01 00:00:00,3.1,0.5,3.6,340,96.0,1001.3,,,,
1,2018-03-01 01:00:00,2.8,,0.7,140,97.0,1001.9,,,,
2,2018-03-01 02:00:00,2.6,,3.2,320,95.0,1002.6,,,,
3,2018-03-01 03:00:00,2.0,,1.9,230,97.0,1002.8,,,,
4,2018-03-01 04:00:00,2.2,,2.1,180,97.0,1003.0,,,,


In [5]:
data_weather.index = data_weather['index']
data_weather.drop(['index'], axis=1, inplace=True)
data_weather.reset_index(inplace=True)
data_weather = data_weather[['index', '전운량(10분위)']]

data_weather.columns = ['time', 'cloud']
data_weather.dropna(inplace=True)

### weather (forecast)

In [194]:
data_weather = pd.read_pickle('C:/Users/ewpadmin/JIHYUN/df_temp_0208.pkl')
data_weather.reset_index(inplace=True, drop=True)
data_weather.columns = ['time', 'temperature', 'rainfall', 'wind_direction', 'wind_speed', 'humidity', 
                        '현지기압(hPa)', '해면기압(hPa)', '일누적일사(MJ/m^2)', '일누적일조(Sec)']
data_weather.tail()

Unnamed: 0,time,temperature,rainfall,wind_direction,wind_speed,humidity,현지기압(hPa),해면기압(hPa),일누적일사(MJ/m^2),일누적일조(Sec)
558158,2018-12-31 23:55:00,-1.5,0.0,북풍,2.1,64.2,1032.5,1035.9,11.32,31920.0
558159,2018-12-31 23:56:00,-1.5,0.0,북풍,2.7,64.1,1032.6,1036.0,11.32,31920.0
558160,2018-12-31 23:57:00,-1.5,0.0,북풍,2.6,64.0,1032.6,1036.0,11.32,31920.0
558161,2018-12-31 23:58:00,-1.4,0.0,북풍,2.2,63.8,1032.6,1035.9,11.32,31920.0
558162,2018-12-31 23:59:00,-1.4,0.0,북풍,2.6,63.6,1032.6,1035.9,11.32,31920.0


In [7]:
method_name = ['nearest', 'zero', 'quadratic', 'cubic', 'spline', 'barycentric']

method_name_two = ['slinear', 'polynomial']

In [2]:
def make_dataset(method='linear', order=2, weather=2, AM=False):
    
    # make solar
    data_solar = pd.read_pickle('A_site_solar_Data_concat_02.pkl')
    
    # make actual(1min)
    if weather == 0:
        data_weather = pd.read_pickle('C:/Users/ewpadmin/JDY/df_temp_dropna_0208.pkl')

        # reset index
        data_weather.reset_index(inplace=True, drop=True)
        
        # drop feature - 해면기압, 일누적일사, 일누적일조
        data_weather.drop(['해면기압(hPa)', '일누적일사(MJ/m^2)', '일누적일조(Sec)'], axis=1, inplace=True)

        # change column name
        data_weather.columns = ['time', 'temperature', 'rainfall', 'wind_direction', 'wind_speed', 'humidity', 
                                'airpressure']

        # change wind_dirction standard
        imputer = {'북풍': 0, '북동풍': 1, '동풍': 2, '남동풍': 3, '남풍': 4, '남서풍': 5, '서풍': 6, '북서풍': 7}
        data_weather.wind_direction = data_weather.wind_direction.apply(lambda x : imputer[x])

        # merge dataset
        data_merge = pd.merge(data_solar, data_weather, on='time')
        
    # make actual (1H)
    elif weather == 1:
        # load dataset
        data_weather = pd.read_pickle('A_site_weather_Data_concat_02.pkl')
        data_weather.index = data_weather['index']

        # resampling
        if (method == 'slinear') or (method == 'polynomial'):
            data_weather = data_weather.resample('15min').interpolate(method=method, order=order)
        else:
            data_weather = data_weather.resample('15min').interpolate(method=method)

        data_weather.drop(['index'], axis=1, inplace=True)
        data_weather.reset_index(inplace=True)
        data_weather.dropna(inplace=True)
        
        # drop feature - 현지기압, 일조, 일사, 적설
        data_weather.drop(['현지기압(hPa)', '일조(hr)', '일사(MJ/m2)', '적설(cm)'], axis=1, inplace=True)
        
        # change columns
        data_weather.columns = ['time', 'temperature', 'rainfall', 'wind_speed', 'wind_direction', 'humidity', 'cloud']
        
        # merge dataset
        data_merge = pd.merge(data_solar, data_weather, on='time')
        
    # make forecast
    elif weather == 2:
        if AM:
            data_forecast = pd.read_pickle('data_forecast_2018_8AM_01.pkl')

            # make index
            data_forecast.index = data_forecast['time(predicted)']
            data_forecast.index = pd.to_datetime(data_forecast.index).values

            # resample
            if (method == 'slinear') or (method == 'polynomial'):
                data_forecast = data_forecast.resample('15min').interpolate(method=method, order=order)
            else:
                data_forecast = data_forecast.resample('15min').interpolate(method=method)

            # drop columns
            data_forecast.drop(['time(predicted)', 'day', 'forecast'], axis=1, inplace=True)
            data_forecast.reset_index(inplace=True)

            # change columns
            data_forecast.columns = ['time', 'temperature_3H', 'sky', 
                                     'rainfall_probability', 'humidity', 'wind_speed', 'wind_direction']
        else:
            # make forecast
            data_forecast = pd.read_csv('data_forecast_2018_02.csv')
            data_forecast.drop(['Unnamed: 0'], axis=1, inplace=True)

            # make index
            data_forecast.index = data_forecast['time(46H later)']
            data_forecast.index = pd.to_datetime(data_forecast.index).values

            # resample
            if (method == 'slinear') or (method == 'polynomial'):
                data_forecast = data_forecast.resample('15min').interpolate(method=method, order=order)
            else:
                data_forecast = data_forecast.resample('15min').interpolate(method=method)

            # drop columns
            data_forecast.drop(['time(prediction)', 'time(46H later)', 'precipitation_form'], axis=1, inplace=True)
            data_forecast.reset_index(inplace=True)

            # change columns
            data_forecast.columns = ['time', 'temperature_3H', 'sky', 
                                     'rainfall_probability', 'humidity', 'wind_speed', 'wind_direction']
        # merge dataset
        data_merge = pd.merge(data_solar, data_forecast, on='time')
    else:
        print("No Option.")
    
    print(data_merge.shape)
    return data_merge

def preprocessing_1(data_raw):

    # change dtypes
    data_raw['경사일사량'] = data_raw['경사일사량'].astype(np.float64)
    data_raw['수평일사량'] = data_raw['수평일사량'].astype(np.float64)
    data_raw.time = data_raw.time.astype(str)

    # make variable
    data_raw['date'] = data_raw.time.apply(lambda x : x.split(' ')[0])
    data_raw['exact_time'] = data_raw.time.apply(lambda x : x.split(' ')[1])

    data_raw['month'] = data_raw.date.apply(lambda x : x.split('-')[1])
    data_raw['day'] = data_raw.date.apply(lambda x : x.split('-')[2])
    data_raw['hour'] = data_raw.exact_time.apply(lambda x : x.split(':')[0])
    data_raw['minute'] = data_raw.exact_time.apply(lambda x : x.split(':')[1])

    # delete variable - time, date, exact_time
    data_raw.drop(['time', 'date', 'exact_time'], axis=1, inplace=True)

    # change dtypes
    data_raw.month = data_raw.month.astype(np.float64)
    data_raw.day = data_raw.day.astype(np.float64)
    data_raw.hour = data_raw.hour.astype(np.float64)
    data_raw.minute = data_raw.minute.astype(np.float64)

    # change columns name
    data_raw['insolation_vertical'] = data_raw['경사일사량']
    data_raw['insolation_horizontal'] = data_raw['수평일사량']
    data_raw.drop(['경사일사량', '수평일사량'], axis=1, inplace=True)
    
    return data_raw

# preprocessing II : make X, y // train, text // scaling
def preprocessing_2(data_raw, horizontal=True, scaling=True, test_size=0.25):
    
    # make X, y
    X = data_raw.drop(['insolation_vertical', 'insolation_horizontal'], axis=1)
    
    if horizontal:
        y = data_raw[['insolation_horizontal']]
    else:
        y = data_raw[['insolation_vertical']]
    
    # train, test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    
    # scaling
    mms = MinMaxScaler()
    X_sc = mms.fit_transform(X)
    y_sc = mms.fit_transform(y)
    
    X_train_sc = mms.fit_transform(X_train)
    X_test_sc = mms.transform(X_test)
    y_train_sc = mms.fit_transform(y_train)
    y_test_sc = mms.transform(y_test)    
          
    # reshape y_train, y_test
    n = y_train_sc.shape[0]
    y_train_sc = y_train_sc.reshape(n,)
    n = y_test_sc.shape[0]
    y_test_sc = y_test_sc.reshape(n,)

    return X_train_sc, X_test_sc, y_train_sc, y_test_sc, X_sc, y_sc

---

### make dataset(with preprocessing)

In [265]:
rmse_list = []
naive_list = []
cv_list = []
weather_names = ['actual(1min)', 'actual(1H)', 'forecast']

for n, weather_name in enumerate(weather_names):

    print(weather_name)
    # make dataset
    data_raw = make_dataset(method='zero', weather=n, AM=True)

    # preprocessing 1
    data_pre_1 = preprocessing_1(data_raw)

    # preprocessing 2
    X_train_sc, X_test_sc, y_train_sc, y_test_sc, X_sc, y_sc = preprocessing_2(data_pre_1, horizontal=False, 
                                                                               scaling=True, test_size=0.25)

    # modeling
    rf = RandomForestRegressor(n_estimators=15)
    model_rf = rf.fit(X_train_sc, y_train_sc)
    y_pred = model_rf.predict(X_test_sc)

    rmse = np.sqrt(np.mean(np.power(y_pred - y_test_sc, 2)))
    naive_r2 = r2_score(y_test_sc, y_pred)
    cv_r2 = np.mean(cross_val_score(rf, X_sc, y_sc, cv=4, scoring='r2'))

    rmse_list.append(rmse)
    naive_list.append(naive_r2)
    cv_list.append(cv_r2)
    
score_df = pd.DataFrame(columns=['weather_type', 'rmse', 'naive', 'cv'])
score_df['weather_type'] = weather_names
score_df['rmse'] = rmse_list
score_df['naive'] = naive_list
score_df['cv'] = cv_list

actual(1min)
(23783, 9)


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


actual(1H)
(1957, 9)


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


forecast
(23901, 9)


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


In [266]:
score_df

Unnamed: 0,weather_type,rmse,naive,cv
0,actual(1min),0.080044,0.935596,0.746384
1,actual(1H),0.05027,0.969831,0.620725
2,forecast,0.069549,0.952054,0.743073


- 46H later : 0.8111751627453345
- 8AM : 0.7862941886005279

### 노트

- forecast 데이터를 사용하는게 괜찮을 듯

In [4]:
# make dataset
data_raw = make_dataset(method='linear', weather=2, AM=True)

# preprocessing 1
data_pre_1 = preprocessing_1(data_raw)

# preprocessing 2
X_train_sc, X_test_sc, y_train_sc, y_test_sc, X_sc, y_sc = preprocessing_2(data_pre_1, horizontal=False, 
                                                                           scaling=True, test_size=0.25)

# modeling
rf = RandomForestRegressor(n_estimators=15)
model_rf = rf.fit(X_train_sc, y_train_sc)
y_pred = model_rf.predict(X_test_sc)

rmse = np.sqrt(np.mean(np.power(y_pred - y_test_sc, 2)))
naive_r2 = r2_score(y_test_sc, y_pred)
cv_r2 = np.mean(cross_val_score(rf, X_sc, y_sc, cv=4, scoring='r2'))

rmse, naive_r2, cv_r2

(23901, 9)


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


(0.07379881858409411, 0.9464629683453728, 0.7460243661437591)

---

### model test

### statsmodels

In [234]:
# modeling
model_sm = sm.OLS(y_train_sc, X_train_sc)
result = model_sm.fit()
y_pred = result.predict(X_test_sc)

mse = mean_squared_error(y_test_sc, y_pred)
r2_scoring = r2_score(y_test_sc, y_pred)

print(mse, r2_scoring)
print(result.summary())

0.049092160329876657 0.35663215628743805
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.560
Model:                            OLS   Adj. R-squared:                  0.557
Method:                 Least Squares   F-statistic:                     206.1
Date:                Tue, 12 Feb 2019   Prob (F-statistic):          1.61e-252
Time:                        15:18:47   Log-Likelihood:                 69.001
No. Observations:                1467   AIC:                            -120.0
Df Residuals:                    1458   BIC:                            -72.38
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1         

### sklearn linear regression

In [233]:
lr = LinearRegression()
model_lr = lr.fit(X_train_sc, y_train_sc)
y_pred = model_lr.predict(X_test_sc)

mse = mean_squared_error(y_test_sc, y_pred)
naive_r2 = r2_score(y_test_sc, y_pred)
cv_r2 = np.mean(cross_val_score(lr, X_sc, y_sc, cv=4, scoring='r2'))

mse, naive_r2, cv_r2

(0.04689199016310876, 0.38546606228144076, 0.27815723796225655)

### SVM

In [232]:
model_svm = SVR(C=100, kernel='rbf')
result = model_svm.fit(X_train_sc, y_train_sc)
y_pred = result.predict(X_test_sc)

mse = mean_squared_error(y_test_sc, y_pred)
naive_r2 = r2_score(y_test_sc, y_pred)
cv_r2 = np.mean(cross_val_score(model_svm, X_sc, y_sc, cv=4, scoring='r2'))

mse, naive_r2, cv_r2

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(0.023372145804730456, 0.6937008486022189, 0.4045454585861782)

### random forest

In [236]:
rf = RandomForestRegressor(n_estimators=15)
model_rf = rf.fit(X_train_sc, y_train_sc)
y_pred = model_rf.predict(X_test_sc)

mse = mean_squared_error(y_test_sc, y_pred)
naive_r2 = r2_score(y_test_sc, y_pred)
cv_r2 = np.mean(cross_val_score(rf, X_sc, y_sc, cv=4, scoring='r2'))

mse, naive_r2, cv_r2

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


(0.0045063631851144014, 0.9409427259686454, 0.6333709500063072)

### lightgbm

In [237]:
import lightgbm as lgb

In [242]:
# create dataset for lightgbm

lgb_train = lgb.Dataset(X_train_sc, y_train_sc)
lgb_test = lgb.Dataset(X_test_sc, y_test_sc, reference=lgb_train)

params = {
    'objective': 'regression',
    'boosting_type': 'gbdt',
    'verbose': 0
}

gbm = lgb.train(params, lgb_train, valid_sets=lgb_test)
y_pred = gbm.predict(X_test_sc)

mse = mean_squared_error(y_test_sc, y_pred), 
naive_r2 = r2_score(y_test_sc, y_pred)
cv_r2 = np.mean(cross_val_score(lr, X_sc, y_sc, cv=4, scoring='r2'))

mse, naive_r2, cv_r2

[1]	valid_0's l2: 0.0642625
[2]	valid_0's l2: 0.0543292
[3]	valid_0's l2: 0.0458051
[4]	valid_0's l2: 0.0388702
[5]	valid_0's l2: 0.0337894
[6]	valid_0's l2: 0.029127
[7]	valid_0's l2: 0.0251221
[8]	valid_0's l2: 0.0218788
[9]	valid_0's l2: 0.0194337
[10]	valid_0's l2: 0.017275
[11]	valid_0's l2: 0.0154477
[12]	valid_0's l2: 0.0139613
[13]	valid_0's l2: 0.0127755
[14]	valid_0's l2: 0.0117299
[15]	valid_0's l2: 0.0109257
[16]	valid_0's l2: 0.0103322
[17]	valid_0's l2: 0.00958886
[18]	valid_0's l2: 0.00898088
[19]	valid_0's l2: 0.00852411
[20]	valid_0's l2: 0.00826378
[21]	valid_0's l2: 0.00792117
[22]	valid_0's l2: 0.0076983
[23]	valid_0's l2: 0.00738118
[24]	valid_0's l2: 0.00721847
[25]	valid_0's l2: 0.00697752
[26]	valid_0's l2: 0.0068503
[27]	valid_0's l2: 0.00666516
[28]	valid_0's l2: 0.00648336
[29]	valid_0's l2: 0.00641326
[30]	valid_0's l2: 0.00624224
[31]	valid_0's l2: 0.0061289
[32]	valid_0's l2: 0.00603678
[33]	valid_0's l2: 0.00594862
[34]	valid_0's l2: 0.00584692
[35]	valid

((0.004251444908539622,), 0.9442835083017265, 0.27815723796225655)

---

### parameter test

In [271]:
method_name_v1 = ['linear', 'nearest', 'zero', 'quadratic', 'cubic']
method_name_v2 = ['slinear', 'polynomial']
order_name = [2, 3, 5]

mse_list = []
naive_list = []
cv_list = []

method = method_name_v2[1]
for method in method_name_v1:
    print(method)

    # make dataset
    data_raw = make_dataset(method=method, order=order, weather=2, AM=True)

    # preprocessing 1
    data_pre_1 = preprocessing_1(data_raw)

    # preprocessing 2
    X_train_sc, X_test_sc, y_train_sc, y_test_sc, X_sc, y_sc = preprocessing_2(data_pre_1, horizontal=True, scaling=True)

    # modeling
    rf = RandomForestRegressor(n_estimators=15)
    model_rf = rf.fit(X_train_sc, y_train_sc)
    y_pred = model_rf.predict(X_test_sc)

    # evaluate
    mse = mean_squared_error(y_test_sc, y_pred)
    naive_r2 = r2_score(y_test_sc, y_pred)
    cv_r2 = np.mean(cross_val_score(rf, X_sc, y_sc, cv=5, scoring='r2'))

    mse_list.append(mse)
    naive_list.append(naive_r2)
    cv_list.append(cv_r2)

linear
(23901, 9)


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


nearest
(23901, 9)


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


zero
(23901, 9)


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


quadratic
(23901, 9)


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


cubic
(23901, 9)


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


### 7. weather=2(forecast), AM=True, method=...

In [272]:
# cloud=True, AM=True
result_df = pd.DataFrame(columns=['method', 'mse', 'naive', 'cv'])
result_df['method'] = method_name
result_df['mse'] = mse_list
result_df['naive'] = naive_list
result_df['cv'] = cv_list
result_df.sort_values(by='cv', ascending=False)

Unnamed: 0,method,mse,naive,cv
0,linear,0.002771,0.964443,0.714722
4,cubic,0.003111,0.960399,0.704051
2,zero,0.002798,0.963868,0.698583
3,quadratic,0.003315,0.957823,0.698312
1,nearest,0.003117,0.960848,0.640071


### 6

In [176]:
# cloud=False, AM=True, method='slinear', order=2,3,4,5
result_df = pd.DataFrame(columns=['order', 'mse', 'naive', 'cv'])
result_df['order'] = order_name
result_df['mse'] = mse_list
result_df['naive'] = naive_list
result_df['cv'] = cv_list
result_df.sort_values(by='cv', ascending=False)

Unnamed: 0,order,mse,naive,cv
2,5,0.002157,0.969735,0.721687
1,3,0.002114,0.970262,0.714512
0,2,0.002043,0.97083,0.695171


### 5

In [173]:
# cloud=False, AM=True, method='slinear', order=2,3,4,5
result_df = pd.DataFrame(columns=['order', 'mse', 'naive', 'cv'])
result_df['order'] = order_name
result_df['mse'] = mse_list
result_df['naive'] = naive_list
result_df['cv'] = cv_list
result_df.sort_values(by='cv', ascending=False)

Unnamed: 0,order,mse,naive,cv
1,3,0.002078,0.969591,0.711728
0,2,0.001989,0.971519,0.709254
2,4,0.001946,0.971752,0.700308
3,5,0.002158,0.968713,0.693984


### 4

In [171]:
# cloud=True, AM=True
result_df = pd.DataFrame(columns=['method', 'mse', 'naive', 'cv'])
result_df['method'] = method_name
result_df['mse'] = mse_list
result_df['naive'] = naive_list
result_df['cv'] = cv_list
result_df.sort_values(by='cv', ascending=False)

Unnamed: 0,method,mse,naive,cv
4,cubic,0.002472,0.962665,0.724744
0,linear,0.001912,0.972991,0.70509
3,quadratic,0.002503,0.963701,0.702769
2,zero,0.001973,0.970144,0.69173
1,nearest,0.002416,0.965722,0.689889


### 3

In [166]:
# cloud=False, AM=True
result_df = pd.DataFrame(columns=['method', 'naive', 'cv'])
result_df['method'] = method_name
result_df['naive'] = naive_list
result_df['cv'] = cv_list
result_df.sort_values(by='cv', ascending=False)

Unnamed: 0,method,naive,cv
2,zero,0.96575,0.727507
0,linear,0.964111,0.715339
3,quadratic,0.958566,0.700682
4,cubic,0.965389,0.672776
1,nearest,0.963836,0.669648


### 2

In [161]:
# cloud=True, AM=False
result_df = pd.DataFrame(columns=['method', 'naive', 'cv'])
result_df['method'] = method_name
result_df['naive'] = naive_list
result_df['cv'] = cv_list
result_df.sort_values(by='cv', ascending=False)

Unnamed: 0,method,naive,cv
2,zero,0.962399,0.698306
1,nearest,0.957911,0.664694
0,linear,0.960857,0.664501
3,quadratic,0.956691,0.642589
4,cubic,0.957604,0.642309


### 1

In [159]:
# cloud=False, AM=False
result_df = pd.DataFrame(columns=['method', 'naive', 'cv'])
result_df['method'] = method_name
result_df['naive'] = naive_list
result_df['cv'] = cv_list
result_df.sort_values(by='cv', ascending=False)

Unnamed: 0,method,naive,cv
0,linear,0.966972,0.690853
3,quadratic,0.963849,0.668595
1,nearest,0.971346,0.662977
2,zero,0.971209,0.651262
4,cubic,0.966875,0.646213


### note

- 성능이 높지 않게 나온다.
- interpolate method=zero, cloud=False(기상실측데이터 - 운량 사용x), AM=True(오전 8시 데이터)가 높게 나온다.