# 2주차 과제 제출에 포함되어야 하는 형태 (최소)
1. 모델링에 대한 결과 .ipynb 파일 (team1_onboarding_week2_analysis.ipynb)
2. np.random.seed(42) 설정
3. Input : X데이터 : [user_id, +@]
4. Output : Count column 예측값
5. 2020.01.01~2020.09.30 : Training Data
6. 2020.10.01~2020.12.31 : Test Data
7. 2020.10.01~2020.12.31 구간에 대한 MSE, MAE 계산값
8. 요일별 이용량에 대한 분석 - EDA
9. 유저별 이용 count 값에 대한 분석 - EDA

In [1]:
import os 
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'Malgun Gothic'  # (Windows 용) 한글 출력을 위한 글꼴 설정

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

# 랜덤시드 통일
np.random.seed(42)

In [2]:
def get_font_family():
    """
    시스템 환경에 따른 기본 폰트명을 반환하는 함수
    """
    import platform
    system_name = platform.system()
    # colab 사용자는 system_name이 'Linux'로 확인

    if system_name == "Darwin" :
        font_family = "AppleGothic"
    elif system_name == "Windows":
        font_family = "Malgun Gothic"
    else:
        # Linux
        # colab에서는 runtime을 <꼭> 재시작 해야함.
        # 런타임을 재시작 하지 않고 폰트 설치를 하면 기본 설정 폰트가 로드되어 한글이 깨짐.
        !apt-get update -qq
        !apt-get install fonts-nanum -qq  > /dev/null

        import matplotlib.font_manager as fm

        fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
        font = fm.FontProperties(fname=fontpath, size=9)
        fm._rebuild()
        font_family = "NanumBarunGothic"
    return font_family

In [3]:
# 시각화를 위한 폰트설정
# 위에서 만든 함수를 통해 시스템 폰트를 불러와서 font_family 라는 변수에 할당.
a = get_font_family()
# 폰트설정
import matplotlib.pyplot as plt 
plt.rc("font", family = a)
# 마이너스폰트 설정
plt.rc("axes", unicode_minus=False)
# ggplot으로 그래프 스타일 설정 / 개인 자유
plt.style.use("ggplot")

In [4]:
# 데이터 확인
os.listdir('data')

['2020교통량통합.xlsx',
 'check.csv',
 'final.csv',
 'holiday.csv',
 'metro.csv',
 '교통량정리.csv',
 '국가공휴일.xlsx',
 '기상청.csv',
 '디지털 스킬셋 기술과제.docx',
 '서울시_기상데이터.csv',
 '실전db.csv',
 '실전db_holiday.csv',
 '지하철노선위경도정보3.xlsx',
 '최종.csv']

In [5]:
# 데이터프레임 불러오기
df = pd.read_csv("./data/최종.csv")
df = df.drop("Unnamed: 0",axis=1)
df = df.sort_values(by="DATE")

In [6]:
display(df)

Unnamed: 0,DATE,USER_ID,JOIN_DATE_1970,JOIN_DATE_2013,JOIN_DATE_2014,JOIN_DATE_2015,JOIN_DATE_2016,JOIN_DATE_2017,JOIN_DATE_2018,JOIN_DATE_2019,...,AD1_TYPE_SEOUL,DAY_TYPE_주중,DAY_TYPE_주말,DAY_TYPE_공휴일,강수여부,이상기온,USER_ID_TYPE_A,USER_ID_TYPE_B,USER_ID_TYPE_C,COUNT
0,2020-01-01,2858,0,0,1,0,0,0,0,0,...,1,0.0,0.0,1.0,1.0,0.0,1,0,0,1
10072,2020-01-01,1407470,0,0,0,0,0,0,0,1,...,1,0.0,0.0,1.0,1.0,0.0,0,0,1,1
10084,2020-01-01,1409746,0,0,0,0,0,0,0,1,...,1,0.0,0.0,1.0,1.0,0.0,1,0,0,1
10085,2020-01-01,1411616,0,0,0,0,0,0,0,1,...,1,0.0,0.0,1.0,1.0,0.0,1,0,0,1
10088,2020-01-01,1412226,0,0,0,0,0,0,0,1,...,1,0.0,0.0,1.0,1.0,0.0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
780020,2020-12-31,1685369,0,0,0,0,0,0,0,0,...,1,1.0,0.0,0.0,0.0,1.0,1,0,0,1
818749,2020-12-31,527822,0,0,0,0,0,1,0,0,...,1,1.0,0.0,0.0,0.0,1.0,1,0,0,1
839489,2020-12-31,1752193,0,0,0,0,0,0,0,0,...,1,1.0,0.0,0.0,0.0,1.0,1,0,0,1
356649,2020-12-31,900857,0,0,0,0,0,0,1,0,...,1,1.0,0.0,0.0,0.0,1.0,1,0,0,1


In [7]:
df = df.drop(["JOIN_DATE_1970", "JOIN_DATE_2013", "JOIN_DATE_2014", "JOIN_DATE_2015", "JOIN_DATE_2016", "JOIN_DATE_2017", "JOIN_DATE_2018", "JOIN_DATE_2019", "JOIN_DATE_2020"], axis=1)

In [8]:
X_train = df[:623305]
X_test = df[623305:]

In [9]:
train_x = X_train.drop(["DATE", "COUNT"], axis=1)
train_y = X_train['COUNT']

In [10]:
test_x = X_test.drop(["DATE", "COUNT"], axis=1)
real_count = X_test['COUNT']

In [11]:
lgb = lgb.LGBMRegressor(
    learning_rate = 0.05, 
    max_depth = 20, 
    num_iterations = 200,
    n_estimators=10,
    random_state=42
)

In [12]:
lgb.fit(train_x, train_y)

LGBMRegressor(learning_rate=0.05, max_depth=20, n_estimators=10,
              num_iterations=200, random_state=42)

In [13]:
pred = lgb.predict(test_x)

In [14]:
from sklearn.metrics import mean_squared_error

mean_squared_error(real_count, pred)


# n_estimators=500, random_state=42 // 0.0400809582729114

0.03598492097136129