In [None]:
# ! pip install tensorflow
# ! pip install xgboost
# ! pip install lightgbm


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

In [None]:
# Load the cleaned dataset from the Excel workbook and print its schema summary.
tourlist = pd.read_excel('dataset_clean.xlsx')
tourlist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205381 entries, 0 to 205380
Data columns (total 12 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   관광지명       205381 non-null  object        
 1   검색건수       205381 non-null  int64         
 2   area_code  205381 non-null  int64         
 3   date       205381 non-null  object        
 4   avgTemp    205381 non-null  float64       
 5   highTemp   205381 non-null  float64       
 6   lowTemp    205381 non-null  float64       
 7   weather    205381 non-null  int64         
 8   rainProb   205381 non-null  int64         
 9   datetime   205381 non-null  datetime64[ns]
 10  month      205381 non-null  object        
 11  weekday    205381 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(5), object(3)
memory usage: 18.8+ MB


In [None]:
# Label-encode the categorical columns (overwrites them in place on `tourlist`).
from sklearn.preprocessing import LabelEncoder

# Use one encoder per column. Refitting a single shared encoder on the second
# column discards the place-name classes, so the integer codes could no longer
# be mapped back with inverse_transform.
place_encoder = LabelEncoder()
tourlist['관광지명'] = place_encoder.fit_transform(tourlist['관광지명'])

weather_encoder = LabelEncoder()
tourlist['weather'] = weather_encoder.fit_transform(tourlist['weather'])

# Keep the original name bound to the last-fitted encoder, exactly as before,
# in case a later cell still refers to `encoder`.
encoder = weather_encoder

In [None]:
# Features: place name, weather, average / high / low temperature, rain probability.
# Target: search count.
train = tourlist[['관광지명', 'weather', 'avgTemp', 'highTemp', 'lowTemp', 'rainProb']]
target = tourlist['검색건수']

# Hold out 30% of the rows. random_state is fixed so the split (and therefore
# every score computed from it) is reproducible across kernel restarts.
train_input, test_input, train_target, test_target = train_test_split(
    train, target, test_size=0.3, random_state=42)

In [None]:
# Label encoding + random forest: cross-validate on the training split and
# print the mean train score followed by the mean validation score.
from sklearn.ensemble import RandomForestRegressor

# random_state pins the bootstrap / feature sampling so the printed scores are
# reproducible from run to run.
model = RandomForestRegressor(random_state=42)
scores = cross_validate(
    model,
    train_input,
    train_target,
    return_train_score=True,
    n_jobs=-1,  # use all available cores for the folds
)
print(np.mean(scores['train_score']), np.mean(scores['test_score']))

0.9817491928787285 0.8750109321038908


In [None]:
# Label encoding + XGBoost (histogram tree method): cross-validate on the
# training split and print mean train score, then mean validation score.
from xgboost import XGBRegressor

model = XGBRegressor(tree_method='hist')
scores = cross_validate(
    model,
    train_input,
    train_target,
    return_train_score=True,
    n_jobs=-1,
)
print(scores['train_score'].mean(), scores['test_score'].mean())

0.5402500925265556 0.4104408053741991


In [None]:
# Label encoding + LightGBM: cross-validate on the training split and print
# the mean train score followed by the mean validation score.
from lightgbm import LGBMRegressor

# `tree_method` is an XGBoost option that LGBMRegressor does not define; it was
# only forwarded as an unknown extra parameter, so it is dropped here.
model = LGBMRegressor()
scores = cross_validate(
    model,
    train_input,
    train_target,
    return_train_score=True,
    n_jobs=-1,  # use all available cores for the folds
)
print(np.mean(scores['train_score']), np.mean(scores['test_score']))

0.4891478886624728 0.4368813715455852


In [None]:
# Re-read the workbook so `tourlist` is rebound to a freshly loaded frame
# before the one-hot experiments that follow.
tourlist = pd.read_excel('dataset_clean.xlsx')

In [None]:
# One-hot encode the categorical columns.
# (The LabelEncoder import that used to sit here was unused in this cell.)

# A prefix keeps every generated column name a string. `weather` is an integer
# column, so without a prefix its dummy columns get integer names, which end up
# mixed with string names in the feature frame; scikit-learn estimators reject
# feature names of mixed types.
tourlist_oh = pd.get_dummies(tourlist['관광지명'], prefix='관광지명')
weather_oh = pd.get_dummies(tourlist['weather'], prefix='weather')

In [None]:
# Features: one-hot place name, one-hot weather, average / high / low
# temperature, rain probability. Target: search count.
train = pd.concat(
    [tourlist_oh, weather_oh, tourlist[['avgTemp', 'highTemp', 'lowTemp', 'rainProb']]],
    axis=1,
)
target = tourlist['검색건수']

# Hold out 30% of the rows. random_state is fixed so the split (and therefore
# every score computed from it) is reproducible across kernel restarts.
train_input, test_input, train_target, test_target = train_test_split(
    train, target, test_size=0.3, random_state=42)

In [None]:
# One-hot encoding + random forest >> disabled because of a resource-usage problem
# from sklearn.ensemble import RandomForestRegressor
# model = RandomForestRegressor()
# scores = cross_validate(model, train_input, train_target, return_train_score=True, n_jobs=-1)
# print(np.mean(scores['train_score']),np.mean(scores['test_score']))

In [None]:
# One-hot encoding + XGBoost (histogram tree method): cross-validate on the
# training split and print mean train score, then mean validation score.
from xgboost import XGBRegressor

model = XGBRegressor(tree_method='hist')
scores = cross_validate(
    model,
    train_input,
    train_target,
    return_train_score=True,
    n_jobs=-1,
)
print(scores['train_score'].mean(), scores['test_score'].mean())

0.9034450052242253 0.8645478301099976


In [None]:
# One-hot encoding + LightGBM >> disabled: fails with the error "Do not support special JSON characters in feature name"
# from lightgbm import LGBMRegressor
# model = LGBMRegressor(tree_method = 'hist')
# scores = cross_validate(model, train_input, train_target, return_train_score=True, n_jobs=-1)
# print(np.mean(scores['train_score']),np.mean(scores['test_score']))

In [None]:
# Min-max normalisation of the numeric weather features.
from sklearn.preprocessing import MinMaxScaler

tourlist4sc = tourlist[['avgTemp','highTemp','lowTemp','rainProb']]
scaler = MinMaxScaler()
# NOTE(review): the scaler is fitted on every row, before the train/test split
# in the next cell, so held-out rows influence the scaling parameters.
tourlist_sc = pd.DataFrame(
    scaler.fit_transform(tourlist4sc),
    columns=tourlist4sc.columns,
    # Preserve the source index: the next cell concatenates along axis=1, which
    # aligns on index, so a fresh 0..n-1 index would misalign rows whenever
    # `tourlist` does not carry a default RangeIndex.
    index=tourlist4sc.index,
)

In [None]:
# Features: one-hot place name, one-hot weather, and the min-max scaled
# temperature / rain-probability columns. Target: search count.
train = pd.concat([tourlist_oh, weather_oh, tourlist_sc], axis=1)
target = tourlist['검색건수']

# Hold out 30% of the rows. random_state is fixed so the split (and therefore
# every score computed from it) is reproducible across kernel restarts.
train_input, test_input, train_target, test_target = train_test_split(
    train, target, test_size=0.3, random_state=42)

In [None]:
# One-hot encoding + min-max normalisation + XGBoost (histogram tree method):
# cross-validate on the training split and print mean train score, then mean
# validation score.
from xgboost import XGBRegressor

model = XGBRegressor(tree_method='hist')
scores = cross_validate(
    model,
    train_input,
    train_target,
    return_train_score=True,
    n_jobs=-1,
)
print(scores['train_score'].mean(), scores['test_score'].mean())

0.9025527725218716 0.8644964489579223


In [None]:
# Standardisation of the numeric weather features.
from sklearn.preprocessing import StandardScaler

tourlist4sc = tourlist[['avgTemp','highTemp','lowTemp','rainProb']]
scaler = StandardScaler()
# NOTE(review): the scaler is fitted on every row, before the train/test split
# in the next cell, so held-out rows influence the scaling parameters.
tourlist_sc = pd.DataFrame(
    scaler.fit_transform(tourlist4sc),
    columns=tourlist4sc.columns,
    # Preserve the source index: the next cell concatenates along axis=1, which
    # aligns on index, so a fresh 0..n-1 index would misalign rows whenever
    # `tourlist` does not carry a default RangeIndex.
    index=tourlist4sc.index,
)

In [None]:
# Features: one-hot place name, one-hot weather, and the standardised
# temperature / rain-probability columns. Target: search count.
train = pd.concat([tourlist_oh, weather_oh, tourlist_sc], axis=1)
target = tourlist['검색건수']

# Hold out 30% of the rows. random_state is fixed so the split (and therefore
# every score computed from it) is reproducible across kernel restarts.
train_input, test_input, train_target, test_target = train_test_split(
    train, target, test_size=0.3, random_state=42)

In [None]:
# One-hot encoding + standardisation + XGBoost (histogram tree method):
# cross-validate on the training split and print mean train score, then mean
# validation score.
from xgboost import XGBRegressor

model = XGBRegressor(tree_method='hist')
scores = cross_validate(
    model,
    train_input,
    train_target,
    return_train_score=True,
    n_jobs=-1,
)
print(scores['train_score'].mean(), scores['test_score'].mean())

0.9033847594710351 0.8636149740801968
