# 2주차 과제 제출에 포함되어야 하는 형태 (최소)
1. 모델링에 대한 결과 .ipynb 파일 (team1_onboarding_week2_analysis.ipynb)
2. np.random.seed(42) 설정
3. Input : X데이터 : [user_id, +@]
4. Output : Count column 예측값
5. 2020.01.01~2020.09.30 : Training Data
6. 2020.10.01~2020.12.31 : Test Data
7. 2020.10.01~2020.12.31 구간에 대한 MSE, MAE 계산값
8. 요일별 이용량에 대한 분석 - EDA
9. 유저별 이용 count 값에 대한 분석 - EDA

In [16]:
import os 
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'Malgun Gothic'  # (Windows only) font used so Korean text renders in plots

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")  # silence every warning for the rest of the session

# Use one fixed seed for NumPy's global random number generator
np.random.seed(42)

In [17]:
def get_font_family():
    """Return the name of a Korean-capable font family for the current OS.

    * macOS ("Darwin")  -> "AppleGothic"
    * Windows           -> "Malgun Gothic"
    * anything else     -> "NanumBarunGothic" (Colab reports "Linux")

    On the fallback (Linux) branch the Nanum font package is installed with
    ``apt-get`` when the font file is not present yet, and the font file is
    then registered with matplotlib's font manager.

    Returns:
        str: the font family name to hand to matplotlib.
    """
    import platform

    system_name = platform.system()
    if system_name == "Darwin":
        return "AppleGothic"
    if system_name == "Windows":
        return "Malgun Gothic"

    # Linux (including Colab)
    import os
    import subprocess

    import matplotlib.font_manager as fm

    fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'

    if not os.path.exists(fontpath):
        # Best-effort install: a missing apt-get or a failed install must not
        # abort plotting setup, so errors are ignored (check=False / OSError).
        try:
            subprocess.run(["apt-get", "update", "-qq"], check=False)
            subprocess.run(
                ["apt-get", "install", "fonts-nanum", "-qq"],
                stdout=subprocess.DEVNULL,
                check=False,
            )
        except OSError:
            pass

    # The original called the private fm._rebuild(), which newer matplotlib
    # releases no longer provide. Registering the file directly is the public
    # replacement; _rebuild is kept only as a fallback for old versions.
    # NOTE(review): with addfont the runtime restart the original comment
    # demanded should no longer be needed -- confirm on Colab.
    if hasattr(fm.fontManager, "addfont"):
        if os.path.exists(fontpath):
            fm.fontManager.addfont(fontpath)
    elif hasattr(fm, "_rebuild"):
        fm._rebuild()

    return "NanumBarunGothic"

In [18]:
# Plot styling.
# Ask get_font_family() for the font that suits this system and keep it in `a`.
a = get_font_family()
import matplotlib.pyplot as plt 
# Use that font for all text
plt.rcParams["font.family"] = a
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph
plt.rcParams["axes.unicode_minus"] = False
# ggplot look for the charts (a matter of personal taste)
plt.style.use("ggplot")

In [19]:
# Inspect the data: list the entries of the local "data" directory
os.listdir('data')

['2020교통량통합.xlsx',
 'check.csv',
 'final.csv',
 'holiday.csv',
 'metro.csv',
 '교통량정리.csv',
 '국가공휴일.xlsx',
 '기상청.csv',
 '디지털 스킬셋 기술과제.docx',
 '서울시_기상데이터.csv',
 '실전db.csv',
 '실전db_holiday.csv',
 '지하철노선위경도정보3.xlsx',
 '최종.csv']

In [20]:
# Load the dataset, discard the "Unnamed: 0" column and order the rows by DATE
df = (
    pd.read_csv("./data/최종.csv")
    .drop(columns="Unnamed: 0")
    .sort_values("DATE")
)

In [21]:
# Positional split of the DATE-sorted frame: first 623305 rows vs. the rest.
# NOTE(review): 623305 is presumably the position of the first 2020-10-01 row
# (train = Jan-Sep, test = Oct-Dec per the assignment header) -- confirm.
X_train = df.iloc[:623305]
X_test = df.iloc[623305:]

In [22]:
# Keep COUNT as the training target, then remove DATE and COUNT from the features
y_train = X_train.loc[:, "COUNT"]
X_train = X_train.drop(columns=["DATE", "COUNT"])


In [23]:
# Keep the observed COUNT of the test rows, then remove DATE and COUNT from the features
real_count = X_test.loc[:, "COUNT"]
X_test = X_test.drop(columns=["DATE", "COUNT"])

In [24]:
# Validation data set: random 80/20 split of the training rows

random_state = 42
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=random_state,
)

# Report the shape of each piece
for label, part in (
    ("X_train_split : ", X_train_split),
    ("X_val_split : ", X_val_split),
    ("y_train_split : ", y_train_split),
    ("y_val_split : ", y_val_split),
):
    print(label, part.shape)

X_train_split :  (498644, 19)
X_val_split :  (124661, 19)
y_train_split :  (498644,)
y_val_split :  (124661,)


In [25]:
def cnt_predict(model, X_train, y_train, kind="default", print_len=7):
    """Fit ``model`` on a random 80/20 split and print a short hold-out report.

    The split is made with ``train_test_split(test_size=0.2)`` using the
    module-level ``random_state``. For both parts of the split the first
    ``print_len`` true and predicted values are printed (rounded to one
    decimal), followed by the RMSE on the held-out part.

    Args:
        model: estimator exposing ``fit`` and ``predict``.
        X_train: features to split.
        y_train: targets to split; the resulting pieces must expose
            ``.values`` (e.g. a pandas Series).
        kind: label inserted into the printed header.
        print_len: number of leading values shown per preview line.

    Returns:
        The same ``model`` object, fitted on the training part of the split.
    """
    X_fit, X_hold, y_fit, y_hold = train_test_split(
        X_train, y_train, random_state=random_state, test_size=0.2
    )
    model.fit(X_fit, y_fit)

    # Preview on the rows the model was fitted on
    fit_pred = model.predict(X_fit)
    print("#### " + model.__class__.__name__ + " month prediction(" + kind + ") ####")
    print("y_train[:{}] : {}".format(print_len, np.round(y_fit[:print_len].values, 1)))
    print("y_pred_train[:{}] : {}".format(print_len, np.round(fit_pred[:print_len], 1)))
    print()

    # Preview and RMSE on the held-out rows
    hold_pred = model.predict(X_hold)
    print("y_val[:{}] : {}".format(print_len, np.round(y_hold[:print_len].values, 1)))
    print("y_pred_val[:{}] : {}".format(print_len, np.round(hold_pred[:print_len], 1)))
    rmse = np.sqrt(mean_squared_error(hold_pred, y_hold))
    print("rmse : {:.6f}".format(rmse))
    print("----------\n")
    return model

In [26]:
# Model evaluation
def average_model_evaluation(models, X_train, y_train, kind="default", print_len=7):
    """Fit each model on a random 80/20 split and report hold-out RMSE.

    The data is split with ``train_test_split(test_size=0.2)`` using the
    module-level ``random_state``. Every model is fitted in place on the
    training part; its RMSE on the held-out part is printed, and finally the
    RMSE of the element-wise mean of all models' hold-out predictions.

    Args:
        models: iterable of estimators exposing ``fit`` and ``predict``.
        X_train: features (pandas DataFrame or any array-like).
        y_train: targets (pandas Series or any array-like).
        kind: label inserted into the printed headers.
        print_len: number of leading values shown per preview line.

    Returns:
        tuple: ``(models, pred_vals)`` -- the ``models`` object that was passed
        in (its estimators now fitted) and the list of per-model hold-out
        prediction arrays, in model order.

    Raises:
        ValueError: if ``models`` yields no model, since there is nothing to
            average.
    """
    # np.asarray gives the same arrays as `.values` for pandas inputs and
    # additionally accepts plain arrays / lists.
    features = np.asarray(X_train)
    targets = np.asarray(y_train)
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        features, targets, random_state=random_state, test_size=0.2
    )
    pred_vals = []
    print("####### average model #######\n\n")
    for model in models:
        model.fit(X_train_split, y_train_split)
        y_pred_train = model.predict(X_train_split)
        model_name = model.__class__.__name__
        print("#### " + model_name + "  prediction(" + kind + ") ####")
        print("y_train[:{}] : {}".format(print_len, np.round(y_train_split[:print_len], 1)))
        print("y_pred_train[:{}] : {}".format(print_len, np.round(y_pred_train[:print_len], 1)))
        print()
        y_pred_val = model.predict(X_val_split)
        pred_vals.append(y_pred_val)
        print("y_val[:{}] : {}".format(print_len, np.round(y_val_split[:print_len], 1)))
        print("y_pred_val[:{}] : {}".format(print_len, np.round(y_pred_val[:print_len], 1)))
        # scikit-learn's documented argument order is (y_true, y_pred)
        rmse = np.sqrt(mean_squared_error(y_val_split, y_pred_val))
        print("rmse : {:.6f}".format(rmse))
        print("----------\n")

    # Without this check an empty `models` would average an empty list to NaN,
    # and the file's global warnings filter would hide the only hint.
    if not pred_vals:
        raise ValueError("models must contain at least one model")

    print("seperated model evaluation ended\n")
    print("### average model evaluation ###")
    y_pred = np.mean(pred_vals, axis=0)
    print("average model validation predictions : ", y_pred)
    rmse = np.sqrt(mean_squared_error(y_val_split, y_pred))
    print("average model rmse : {:.6f}".format(rmse))
    print("-------------------------------")
    return models, pred_vals

# Average of the models' predictions
def average_model_prediction(models, X_test, kind="default", print_len=20):
    """Predict with every fitted model and return the element-wise mean.

    For each model a header, the feature matrix shape and the first
    ``print_len`` predictions (rounded to one decimal) are printed.

    Args:
        models: iterable of already-fitted estimators exposing ``predict``.
        X_test: features (pandas DataFrame or any array-like).
        kind: label inserted into the printed headers.
        print_len: number of leading predictions shown per model.

    Returns:
        numpy.ndarray: mean over models of the per-row predictions.

    Raises:
        ValueError: if ``models`` yields no model, since there is nothing to
            average.
    """
    # Convert once instead of once per model; np.asarray gives the same array
    # as `.values` for a DataFrame and also accepts plain arrays / lists.
    features = np.asarray(X_test)
    print()
    pred_tests = []
    print("####### average model #######\n\n")
    for model in models:
        model_name = model.__class__.__name__
        print("#### " + model_name + " prediction(" + kind + ") ####")
        print(features.shape)
        y_pred_test = model.predict(features)
        print("y_pred_test[:{}] : {}".format(print_len, np.round(y_pred_test[:print_len], 1)))
        pred_tests.append(y_pred_test)

    # Averaging an empty list would silently return NaN
    if not pred_tests:
        raise ValueError("models must contain at least one model")

    y_pred = np.mean(pred_tests, axis=0)
    return y_pred

In [27]:
# One estimator of each kind, all sharing the same random_state
rf_regressor, lgbm_regressor, xgb_regressor, gb_regressor = (
    estimator_cls(random_state=random_state)
    for estimator_cls in (
        RandomForestRegressor,
        LGBMRegressor,
        XGBRegressor,
        GradientBoostingRegressor,
    )
)

# Fit and evaluate every estimator on the validation split, plus their average
default_models = [rf_regressor, lgbm_regressor, xgb_regressor, gb_regressor]
default_models, pred_vals = average_model_evaluation(default_models, X_train, y_train)

####### average model #######


#### RandomForestRegressor  prediction(default) ####
y_train[:7] : [1 1 1 1 1 1 1]
y_pred_train[:7] : [1. 1. 1. 1. 1. 1. 1.]

y_val[:7] : [1 1 1 1 1 1 1]
y_pred_val[:7] : [1. 1. 1. 1. 1. 1. 1.]
rmse : 0.171817
----------

#### LGBMRegressor  prediction(default) ####
y_train[:7] : [1 1 1 1 1 1 1]
y_pred_train[:7] : [1.1 1.  1.  1.1 1.  1.  1. ]

y_val[:7] : [1 1 1 1 1 1 1]
y_pred_val[:7] : [1.  1.1 1.  1.  1.  1.  1.1]
rmse : 0.160891
----------

#### XGBRegressor  prediction(default) ####
y_train[:7] : [1 1 1 1 1 1 1]
y_pred_train[:7] : [1.1 1.  1.  1.1 1.  1.  1. ]

y_val[:7] : [1 1 1 1 1 1 1]
y_pred_val[:7] : [1.  1.1 1.  1.  1.  1.  1.1]
rmse : 0.165250
----------

#### GradientBoostingRegressor  prediction(default) ####
y_train[:7] : [1 1 1 1 1 1 1]
y_pred_train[:7] : [1.2 1.  1.  1.1 1.  1.  1. ]

y_val[:7] : [1 1 1 1 1 1 1]
y_pred_val[:7] : [1.  1.1 1.  1.  1.  1.  1.1]
rmse : 0.161544
----------

seperated model evaluation ended

### average model

In [28]:
# Predict the test rows with every fitted model and keep the averaged prediction
y_pred = average_model_prediction(default_models, X_test)
print(y_pred)


####### average model #######


#### RandomForestRegressor prediction(default) ####
(255600, 19)
y_pred_test[:20] : [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
#### LGBMRegressor prediction(default) ####
(255600, 19)
y_pred_test[:20] : [1.  1.  1.  1.  1.  1.2 1.  1.  1.  1.  1.  1.  1.  1.1 1.  1.  1.  1.
 1.  1. ]
#### XGBRegressor prediction(default) ####
(255600, 19)
y_pred_test[:20] : [1.  1.  1.  1.  1.  1.2 1.  1.  1.  1.  1.  1.  1.  1.1 1.  1.  1.  1.
 1.  1. ]
#### GradientBoostingRegressor prediction(default) ####
(255600, 19)
y_pred_test[:20] : [1.  1.  1.  1.  1.  1.1 1.  1.  1.  1.  1.  1.  1.  1.1 1.  1.  1.  1.
 1.  1. ]
[1.00007483 0.99996857 1.00012543 ... 0.93866545 0.99964852 0.93866545]


In [29]:
# Error of the averaged prediction against the observed test COUNT values.
# The assignment header asks for both MSE and MAE over the test period, so the
# MAE is reported as well.
from sklearn.metrics import mean_absolute_error

print("MAE : {:.6f}".format(mean_absolute_error(real_count, y_pred)))
# MSE stays the last expression so it is still shown as the cell's result
mean_squared_error(real_count, y_pred)

0.03948857802555759