In [None]:
import numpy as np
import pandas as pd
import pandas_datareader.data as pdr
import matplotlib.pyplot as plt

import datetime

from sklearn.metrics import mean_absolute_error

from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold

import lightgbm as lgb

import copy
import warnings
warnings.simplefilter('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
# Directory holding the competition CSVs and the encoding they are stored in.
# Kept as constants so the notebook can be pointed at another location by
# editing a single line instead of three.
DATA_DIR = '/content/drive/MyDrive/energy'
CSV_ENCODING = 'cp949'

train_df = pd.read_csv(f'{DATA_DIR}/train.csv', encoding=CSV_ENCODING)
test_df = pd.read_csv(f'{DATA_DIR}/test.csv', encoding=CSV_ENCODING)
submission_df = pd.read_csv(f'{DATA_DIR}/sample_submission.csv', encoding=CSV_ENCODING)

In [None]:
train_df.dtypes

In [None]:
train_df.isna().sum()

In [None]:
test_df.isna().sum()

In [None]:
# Build per-building lookup tables {num -> flag} for the two building-level
# columns, taken from the first training row of each building.
# Keying by the actual `num` value (instead of a running counter over a fixed
# stride) means this no longer assumes exactly 60 equally sized, contiguous,
# sequentially numbered buildings.
temp_df = train_df[['num', '비전기냉방설비운영', '태양광보유']]
first_rows = temp_df.drop_duplicates(subset='num').set_index('num')
ice = first_rows['비전기냉방설비운영'].to_dict()
hot = first_rows['태양광보유'].to_dict()
# Kept for backward compatibility: number of buildings found.
count = len(ice)

In [None]:
# Independent working copy of the raw test frame; test_df itself stays untouched.
test_EDA = test_df.copy(deep=True)

In [None]:
# Copy each building's flags from the training-derived lookups into the test
# frame. Done as a vectorized map over `num`, so it does not depend on the
# frame having a 0..n-1 index and avoids one .loc write per row.
building_ids = test_df['num']

# Fail loudly (as the per-row dict lookup did) if the test set references a
# building that the lookups do not know about.
unknown_ids = set(building_ids.unique()) - (set(ice) & set(hot))
if unknown_ids:
    raise KeyError(f"building ids missing from ice/hot lookups: {sorted(unknown_ids)}")

test_EDA['비전기냉방설비운영'] = building_ids.map(ice)
test_EDA['태양광보유'] = building_ids.map(hot)

In [None]:
# Fill remaining gaps in the numeric columns by interpolation.
# Only numeric columns are passed to interpolate(): the string `date_time`
# column cannot be interpolated, and including object columns is deprecated
# in newer pandas releases.
# NOTE(review): interpolation runs down each column over the whole frame, so a
# gap at the boundary between two `num` groups is filled using rows from both
# buildings — confirm this is intended.
numeric_cols = test_EDA.select_dtypes(include='number').columns
test_EDA[numeric_cols] = test_EDA[numeric_cols].interpolate(method='values')

In [None]:
def change_date(date_one_time):
    """Parse a 'YYYY-MM-DD HH' string into a ``datetime.datetime``.

    Raises whatever ``strptime`` raises when the text does not match the format.
    """
    return datetime.datetime.strptime(date_one_time, "%Y-%m-%d %H")

def insert_dateindex(df):
    """Replace ``df``'s index with a DatetimeIndex parsed from its 'date_time' column.

    The 'date_time' strings are expected in 'YYYY-MM-DD HH' form. The column
    itself is left as-is; only the index changes.

    Note: ``df`` is modified in place and the same object is returned.
    """
    # Parse the whole column in one vectorized call rather than one strptime per row.
    parsed = pd.to_datetime(df['date_time'], format="%Y-%m-%d %H")
    df.index = pd.DatetimeIndex(parsed)
    return df

def change_date_df(df):
    """Convert ``df``'s 'date_time' column from 'YYYY-MM-DD HH' strings to datetimes.

    Note: ``df`` is modified in place and the same object is returned.
    """
    # Vectorized parse of the whole column rather than one strptime per row.
    df['date_time'] = pd.to_datetime(df['date_time'], format="%Y-%m-%d %H")
    return df

In [None]:
test_EDA

In [None]:
# Independent working copy of the raw training frame; train_df itself stays untouched.
train_EDA = train_df.copy(deep=True)

In [None]:
def time(x):
    """Return the last two characters of ``x`` as an int (the hour of a 'YYYY-MM-DD HH' string)."""
    hour_text = x[-2:]
    return int(hour_text)

# Hour-of-day feature for both frames, derived from the 'date_time' string.
train_EDA['time'] = train_EDA['date_time'].apply(time)
test_EDA['time'] = test_EDA['date_time'].apply(time)

def weekday(x):
    """Return the day of week (Monday=0 ... Sunday=6) for the date in the first 10 characters of ``x``."""
    date_part = x[:10]
    parsed = pd.to_datetime(date_part)
    return parsed.weekday()

# Day-of-week feature for both frames, derived from the 'date_time' string.
train_EDA['weekday'] = train_EDA['date_time'].apply(weekday)
test_EDA['weekday'] = test_EDA['date_time'].apply(weekday)

In [None]:
train_EDA.weekday.unique()

In [None]:
train_EDA=insert_dateindex(train_EDA)

In [None]:
train_EDA

In [None]:
# Power usage over time for building 1.
is_building_1 = train_EDA['num'] == 1
train_EDA.loc[is_building_1, '전력사용량(kWh)'].plot()
plt.show()

In [None]:
# One power-usage figure per building, for building ids 1..n.
n = 60
for building_id in range(1, n + 1):
    # Figure numbers stay 0-based, as before.
    plt.figure(building_id - 1, figsize=(10, 4))
    train_EDA.loc[train_EDA['num'] == building_id, '전력사용량(kWh)'].plot()
    # Title with the building actually plotted (previously one lower than the data shown).
    plt.title(f"num {building_id}")

plt.show()

In [None]:
train_EDA.columns.tolist()

In [None]:
# Drop the raw timestamp string before modelling.
drop_list = ['date_time']
new_train = train_EDA.drop(columns=drop_list)

In [None]:
new_train

In [None]:
# Separate the features from the target; the target is kept as a one-column DataFrame.
target_columns = ['전력사용량(kWh)']
train_x = new_train.drop(columns=target_columns)
train_y = new_train[target_columns]

In [None]:
from sklearn.metrics import r2_score

In [None]:
from matplotlib import font_manager, rc

# Font file used for plot text (the column names plotted here are Korean).
# A local Windows alternative used earlier: "C:/Windows/Fonts/NanumPen.ttf"
font_path = "/content/drive/MyDrive/energy/NanumPen.ttf"

# Register the file with matplotlib's font manager so the family name set below
# can be resolved without relying on a font-cache rebuild. Guarded because
# older matplotlib releases do not provide addfont().
if hasattr(font_manager.fontManager, 'addfont'):
    font_manager.fontManager.addfont(font_path)

font = font_manager.FontProperties(fname=font_path).get_name()
rc('font', family=font)



In [None]:
def Simple_lightGBM(x,y):
    """Fit a single LightGBM regressor on a 75/25 split and plot its feature importances.

    Parameters
    ----------
    x : feature table passed to ``train_test_split``.
    y : target passed to ``train_test_split``.

    Returns nothing; the importance plot is the only output.
    """
    X_train, X_valid, y_train, y_valid = train_test_split(x, y, test_size=0.25, random_state=21)

    # Both splits are monitored during training.
    eval_set = [(X_train, y_train), (X_valid, y_valid)]
    lgbm_model = LGBMRegressor(n_estimators=1000, max_depth=20)

    # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords are not
    # accepted by every LightGBM release — confirm against the installed version.
    lgbm_model.fit(X_train, y_train, eval_set=eval_set, early_stopping_rounds=30, verbose=100)

    lgb.plot_importance(lgbm_model)

In [None]:
import matplotlib
import matplotlib.font_manager

# `_rebuild` is a private matplotlib helper that is not present in every
# release; call it only when it exists so this cell does not fail with
# AttributeError on newer versions.
_rebuild_font_cache = getattr(matplotlib.font_manager, '_rebuild', None)
if _rebuild_font_cache is not None:
    _rebuild_font_cache()

Simple_lightGBM(train_x, train_y)

In [None]:
def run_model(train_x, train_y, n_splits=5, params=None):
    """Train one LightGBM regressor per K-fold split.

    Parameters
    ----------
    train_x : DataFrame of features (rows are selected with ``.iloc``).
    train_y : DataFrame holding the target (rows are selected with ``.iloc``).
    n_splits : number of folds; defaults to 5 as before.
    params : keyword arguments for ``LGBMRegressor``; defaults to
        ``{'n_estimators': 100}`` as before.

    Returns
    -------
    dict mapping fold index (0-based) to the fitted model.
    """
    if params is None:
        params = {'n_estimators': 100}

    cross = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    models = {}
    # Iterate the splitter directly so the number of models always matches
    # n_splits (previously the fold count was hard-coded in two places).
    for fold, (train_idx, valid_idx) in enumerate(cross.split(train_x, train_y)):
        print(f'===================={fold+1}=======================')
        X_train = train_x.iloc[train_idx, :]
        y_train = train_y.iloc[train_idx, :]
        X_valid = train_x.iloc[valid_idx, :]
        y_valid = train_y.iloc[valid_idx, :]

        model = LGBMRegressor(**params)
        # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords are
        # not accepted by every LightGBM release — confirm the installed version.
        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)],
                  early_stopping_rounds=30, verbose=100)
        models[fold] = model

        print(f'================================================\n\n')

    return models

In [None]:
models=run_model(train_x,train_y)

In [None]:
train_EDA

In [None]:
# Second feature set: also leave out rainfall, sunshine and the two building flags.
drop_list = ['date_time', '강수량(mm)', '일조(hr)', '비전기냉방설비운영', '태양광보유']
new_train = train_EDA.drop(columns=drop_list)
target_columns = ['전력사용량(kWh)']
train_x = new_train.drop(columns=target_columns)
train_y = new_train[target_columns]

In [None]:
models2=run_model(train_x,train_y)

In [None]:
# Same column removal for the test frame; its rainfall and sunshine columns
# carry different names than the training ones.
drop_list = ['date_time', '강수량(mm, 6시간)', '일조(hr, 3시간)', '비전기냉방설비운영', '태양광보유']
new_test = test_EDA.drop(columns=drop_list)

In [None]:
# Fresh copy of the sample submission to hold the first set of predictions.
submission_1 = submission_df.copy(deep=True)

In [None]:
# Average the per-fold predictions and write them into the submission.
# Assigning the mean (instead of accumulating with +=) keeps this cell
# idempotent on re-run, does not rely on the sample file's 'answer' column
# starting at zero, and works for any number of fold models.
fold_predictions = [model.predict(new_test) for model in models2.values()]
submission_1['answer'] = np.mean(fold_predictions, axis=0)

In [None]:
submission_1.to_csv('lgbm_submission1.csv', index=False) # score : 	18.2238002804

In [None]:
!pip install optuna

In [None]:
import optuna
def objective(trial):
    """Optuna objective: train LightGBM with sampled hyperparameters and return hold-out R^2.

    Reads the notebook-level ``train_x`` / ``train_y``, holds out 25% of the
    rows, trains with the trial's parameters and scores the held-out rows.

    Parameters
    ----------
    trial : the Optuna trial used to sample hyperparameters.

    Returns
    -------
    float : R^2 on the held-out rows (the study maximizes this).
    """
    # Fixed seed so every trial is scored on the same hold-out rows; otherwise
    # differences between trials mix parameter effects with split noise.
    x_train, test_x, y_train, test_y = train_test_split(
        train_x, train_y, test_size=0.25, random_state=42
    )
    dtrain = lgb.Dataset(x_train, label=y_train)

    param = {
        'objective': 'regression',
        'metric': "mape",
        # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform.
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }

    gbm = lgb.train(param, dtrain)
    preds = gbm.predict(test_x)
    # Score the raw predictions: rounding them to integers only makes sense for
    # class labels and distorts a regression score.
    return r2_score(test_y, preds)
 
# Maximize the objective over 100 trials. The sampler is seeded so the search
# can be reproduced on a fresh run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
def new_model(train_x, train_y, n_splits=5, lgbm_params=None):
    """Train one LightGBM regressor per K-fold split using tuned hyperparameters.

    Parameters
    ----------
    train_x : DataFrame of features (rows are selected with ``.iloc``).
    train_y : DataFrame holding the target (rows are selected with ``.iloc``).
    n_splits : number of folds; defaults to 5 as before.
    lgbm_params : keyword arguments for ``LGBMRegressor``; defaults to the
        hard-coded parameter set this function has always used.

    Returns
    -------
    dict mapping fold index (0-based) to the fitted model.
    """
    if lgbm_params is None:
        lgbm_params = {'lambda_l1': 0.0004618352643903076, 'lambda_l2': 0.0025320311842192953, 'num_leaves': 231, 'feature_fraction': 0.9663031052460144, 'bagging_fraction': 0.9298440283087137, 'bagging_freq': 4, 'min_child_samples': 19}

    cross = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    models = {}
    # Iterate the splitter directly so the number of models always matches
    # n_splits (previously the fold count was hard-coded in two places).
    for fold, (train_idx, valid_idx) in enumerate(cross.split(train_x, train_y)):
        print(f'===================={fold+1}=======================')
        X_train = train_x.iloc[train_idx, :]
        y_train = train_y.iloc[train_idx, :]
        X_valid = train_x.iloc[valid_idx, :]
        y_valid = train_y.iloc[valid_idx, :]

        model = LGBMRegressor(**lgbm_params)
        # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords are
        # not accepted by every LightGBM release — confirm the installed version.
        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)],
                  early_stopping_rounds=30, verbose=100)
        models[fold] = model

        print(f'================================================\n\n')

    return models

In [None]:

# Rebuild the reduced training feature set (same columns removed as for the
# earlier K-fold run) so this cell does not depend on which split ran last.
drop_list = ['date_time', '강수량(mm)', '일조(hr)', '비전기냉방설비운영', '태양광보유']
new_train = train_EDA.drop(columns=drop_list)
target_columns = ['전력사용량(kWh)']
train_x = new_train.drop(columns=target_columns)
train_y = new_train[target_columns]

In [None]:
models3=new_model(train_x,train_y)

In [None]:
# Average the per-fold predictions of the tuned models and save the submission.
# Assigning the mean (instead of accumulating with +=) does not rely on the
# sample file's 'answer' column starting at zero and works for any number of
# fold models.
submission_2 = copy.deepcopy(submission_df)
fold_predictions = [model.predict(new_test) for model in models3.values()]
submission_2['answer'] = np.mean(fold_predictions, axis=0)

submission_2.to_csv('optuna_lgbm_submission1.csv', index=False)
# score recorded by the author for this file: 9.4211862456
#score 	9.4211862456