In [150]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import platform

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

import seaborn as sns
from sklearn.model_selection import GridSearchCV

# 운영체제별 한글 폰트 설정
if platform.system() == 'Windows': # Windows 환경 폰트 설정
    plt.rc('font', family='Malgun Gothic')

plt.rc('axes', unicode_minus=False) # 마이너스 폰트 설정


# 글씨 선명하게 출력하는 설정
%config InlineBackend.figure_format = 'retina'

# Warning 제거
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) 
warnings.simplefilter(action='ignore', category=UserWarning)  
pd.set_option('mode.chained_assignment',  None)   
#pd.set_option('mode.chained_assignment', 'warn')

In [151]:
sample_df = pd.read_csv(r'data\sample_submission.csv')
test_df = pd.read_csv(r'data\test.csv')
train_df = pd.read_csv(r'data\train.csv')

In [152]:
Y = train_df['착과량(int)']
X = train_df.drop(['ID','착과량(int)'],axis = 1)
test = test_df.drop('ID',axis=1)

In [153]:
cols=[]
for i in X.columns:
    if '엽록소' not in i:
        cols.append(i)
x = X[cols]
test = test_df[cols]

In [156]:
x_tr, x_te, y_tr, y_te = train_test_split(x,Y, test_size = 0.3, random_state=3)

params ={
    'n_estimators':[100,110,120,130],
    'max_depth':[4,6,8,10],
    'min_samples_leaf':[8,12,18],
    'min_samples_split':[8,12,18],
    'learning_rate' : [0.05,0.1,0.15]
}

gbm = GradientBoostingRegressor()
grid_cv = GridSearchCV(gbm, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(x_tr,y_tr)

print(f"Best Param: {grid_cv.best_params_}")
depth = grid_cv.best_params_['max_depth']
leaf = grid_cv.best_params_['min_samples_leaf']
split = grid_cv.best_params_['min_samples_split']
estimators = grid_cv.best_params_['n_estimators']
learning = grid_cv.best_params_['learning_rate']

model = GradientBoostingRegressor(max_depth=depth,min_samples_leaf=leaf,min_samples_split=split,n_estimators=estimators,learning_rate=learning)

model.fit(x_tr, y_tr)

train_accuracy = model.score(x_tr, y_tr)
test_accuracy = model.score(x_te, y_te)

print(f'훈련 정확도 : {train_accuracy}\ntest 정확도 : {test_accuracy}')


Best Param: {'learning_rate': 0.05, 'max_depth': 4, 'min_samples_leaf': 8, 'min_samples_split': 12, 'n_estimators': 100}
훈련 정확도 : 0.978832825102613
test 정확도 : 0.9702893099488676


In [157]:
model = GradientBoostingRegressor(max_depth=depth,min_samples_leaf=leaf,min_samples_split=split,n_estimators=estimators)
model.fit(x, Y)

train_accuracy = model.score(x, Y)
print(f'훈련 정확도는 : {train_accuracy} 입니다.')

훈련 정확도는 : 0.9803726889294049 입니다.


In [159]:
test_pred = model.predict(test)
sample_df['착과량(int)'] = test_pred.astype(int)
sample_df.to_csv('result.csv',index=False)