In [None]:
import pandas as pd
import matplotlib.pyplot as plt 
plt.rc('font',family='AppleGothic')
import warnings
warnings.filterwarnings(action='ignore')

# Read the two raw CSV files first and keep them untouched ...
df_raw = pd.read_csv('../data/raw_data/train.csv')
sample_submission_raw = pd.read_csv('../data/raw_data/sample_submission.csv')

# ... then derive working copies, so later in-place edits never alter the
# frames exactly as they were loaded from disk.
df = df_raw.copy()
sample_submission = sample_submission_raw.copy()

In [None]:
def add_calendar_features(frame):
    """Index ``frame`` by date and add calendar columns, in place.

    The '일시' column is parsed with the ``%Y%m%d`` format and becomes the
    index. The columns '일' (day), '월' (month), '년' (year), '요일'
    (``DatetimeIndex.day_of_week``) and '주말평일' are then derived from it.
    '주말평일' is '평일' when '요일' is below 5 and '주말' otherwise.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding a '일시' column, or already indexed by the parsed
        dates when the cell is executed a second time.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in (it is mutated, not copied).
    """
    # Once '일시' has been moved to the index it is no longer a column, so
    # guard the conversion to keep this cell safe to re-run.
    if '일시' in frame.columns:
        frame['일시'] = pd.to_datetime(frame['일시'], format="%Y%m%d")
        frame.set_index('일시', inplace=True)

    frame['일'] = frame.index.day
    frame['월'] = frame.index.month
    frame['년'] = frame.index.year

    frame['요일'] = frame.index.day_of_week
    # Vectorised replacement for a per-row lambda: True -> weekday label,
    # False -> weekend label.
    frame['주말평일'] = (frame['요일'] < 5).map({True: '평일', False: '주말'})
    return frame


# Apply the identical feature pipeline to the training and submission frames.
add_calendar_features(df)
add_calendar_features(sample_submission)

In [None]:
# Show the first rows of the training frame (cell output).
df.head()

In [None]:
# Show the first rows of the submission frame (cell output).
sample_submission.head()

In [None]:
from pytimekr import pytimekr
def get_holiday(_year):
    """Return the holiday dates of ``_year`` as ``"YYYY-MM-DD"`` strings.

    Every date yielded by ``pytimekr.holidays(year=_year)`` is passed to
    ``pytimekr.red_days``. When that call returns something other than
    ``None``, all dates it returns are used in place of the single holiday;
    otherwise the holiday itself is used.
    NOTE(review): red_days presumably expands multi-day holidays to the
    surrounding days off -- confirm against the pytimekr documentation.

    Parameters
    ----------
    _year : int
        Year forwarded to ``pytimekr.holidays``.

    Returns
    -------
    list of str
        Unique date strings in ascending order. Sorting makes the result
        deterministic instead of depending on set iteration order.
    """
    holidays = set()
    for holiday in pytimekr.holidays(year=_year):
        # Call red_days once and reuse the result; compare to None by identity.
        red_days = pytimekr.red_days(holiday)
        days = red_days if red_days is not None else [holiday]
        holidays.update(day.strftime("%Y-%m-%d") for day in days)
    return sorted(holidays)

In [None]:
def mark_holidays(frame, years):
    """Set '주말평일' to '주말' on every row of ``frame`` dated on a holiday.

    A boolean mask over the index is used instead of a label lookup, so
    holiday dates that do not occur in ``frame`` are skipped rather than
    raising ``KeyError``. This removes the need to trim holidays that fall
    outside the period a frame covers.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame indexed by date and holding a '주말평일' column; modified in
        place.
    years : iterable of int
        Years whose holidays (from ``get_holiday``) are applied.
    """
    for year in years:
        holiday_dates = pd.to_datetime(get_holiday(year))
        frame.loc[frame.index.isin(holiday_dates), '주말평일'] = '주말'


# Training years, then the single year handled for the submission frame.
mark_holidays(df, [2018, 2019, 2020, 2021])
mark_holidays(sample_submission, [2022])

In [None]:
# Remove every row whose month column equals 12 from the training frame,
# in place.
# NOTE(review): presumably the prediction period contains no December -- confirm.
december_rows = df.loc[df['월'] == 12].index
df.drop(index=december_rows, inplace=True)

In [None]:
from sklearn.preprocessing import LabelEncoder,RobustScaler
# Create the label encoder together with a RobustScaler instance; the scaler
# is only instantiated here and is not applied in this part of the notebook.
lbe, rbs = LabelEncoder(), RobustScaler()

# Learn the label -> integer mapping on the training frame only, then reuse
# the same fitted mapping for the submission frame.
day_type_col = '주말평일'
df[day_type_col] = lbe.fit_transform(df[day_type_col])
sample_submission[day_type_col] = lbe.transform(sample_submission[day_type_col])

In [None]:
# Display the training frame in its current state (cell output).
df

In [None]:
# One sub-frame per calendar year, selected on the year of the date index.
df_2018, df_2019, df_2020, df_2021 = (
    df[df.index.year == target_year]
    for target_year in (2018, 2019, 2020, 2021)
)

In [None]:
# Overlay the second column (position 1) of each yearly frame on one axes.
# The date index is replaced by a 0..n-1 positional index so that the years
# are drawn on top of each other instead of side by side.
fig, ax = plt.subplots(figsize=(14, 7))
for plot_year, plot_frame in [(2018, df_2018), (2019, df_2019),
                              (2020, df_2020), (2021, df_2021)]:
    # label= together with the legend below lets the reader tell the years apart.
    plot_frame.iloc[:, 1].reset_index(drop=True).plot(ax=ax, label=str(plot_year))
ax.set_title(str(df_2018.columns[1]))
ax.set_xlabel('row position within year')
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns

# Daily values of the four district columns for 2018, one panel each,
# coloured by the '주말평일' flag. By this point the flag holds the integer
# codes produced by the label encoder (see lbe.classes_ for the mapping).
scatter_data = df_2018.reset_index()  # computed once; exposes '일시' as a column
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
for ax, district in zip(axes.flat, ['광진구', '동대문구', '성동구', '중랑구']):
    sns.scatterplot(data=scatter_data,
                    x='일시',
                    y=district,
                    hue='주말평일',
                    ax=ax)
    ax.set_title(f'2018 {district}')
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns

# Daily values of the four district columns for 2019, one panel each,
# coloured by the '주말평일' flag. By this point the flag holds the integer
# codes produced by the label encoder (see lbe.classes_ for the mapping).
scatter_data = df_2019.reset_index()  # computed once; exposes '일시' as a column
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
for ax, district in zip(axes.flat, ['광진구', '동대문구', '성동구', '중랑구']):
    sns.scatterplot(data=scatter_data,
                    x='일시',
                    y=district,
                    hue='주말평일',
                    ax=ax)
    ax.set_title(f'2019 {district}')
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns

# Daily values of the four district columns for 2020, one panel each,
# coloured by the '주말평일' flag. By this point the flag holds the integer
# codes produced by the label encoder (see lbe.classes_ for the mapping).
scatter_data = df_2020.reset_index()  # computed once; exposes '일시' as a column
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
for ax, district in zip(axes.flat, ['광진구', '동대문구', '성동구', '중랑구']):
    sns.scatterplot(data=scatter_data,
                    x='일시',
                    y=district,
                    hue='주말평일',
                    ax=ax)
    ax.set_title(f'2020 {district}')
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns

# Daily values of the four district columns for 2021, one panel each,
# coloured by the '주말평일' flag. By this point the flag holds the integer
# codes produced by the label encoder (see lbe.classes_ for the mapping).
scatter_data = df_2021.reset_index()  # computed once; exposes '일시' as a column
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
for ax, district in zip(axes.flat, ['광진구', '동대문구', '성동구', '중랑구']):
    sns.scatterplot(data=scatter_data,
                    x='일시',
                    y=district,
                    hue='주말평일',
                    ax=ax)
    ax.set_title(f'2021 {district}')
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns

# Mean of every column per day-of-week value ('요일') for 2018.
temp = df_2018.groupby('요일').mean()

# One bar chart per district column; '요일' is restored as a regular column
# once and shared by all four panels.
weekday_means = temp.reset_index()
fig, axes = plt.subplots(2, 2, figsize=(10, 6))
for ax, district in zip(axes.flat, ['광진구', '동대문구', '성동구', '중랑구']):
    sns.barplot(data=weekday_means,
                x='요일',
                y=district,
                ax=ax)
    ax.set_title(f'2018 {district}')
plt.tight_layout()
plt.show()

In [None]:
import seaborn as sns

# Mean of every column per day-of-week value ('요일') for 2021.
temp = df_2021.groupby('요일').mean()

# One bar chart per district column; '요일' is restored as a regular column
# once and shared by all four panels.
weekday_means = temp.reset_index()
fig, axes = plt.subplots(2, 2, figsize=(10, 6))
for ax, district in zip(axes.flat, ['광진구', '동대문구', '성동구', '중랑구']):
    sns.barplot(data=weekday_means,
                x='요일',
                y=district,
                ax=ax)
    ax.set_title(f'2021 {district}')
plt.tight_layout()
plt.show()

In [None]:
# Write the processed training frame to disk; the date index is written too,
# since to_csv includes the index unless told otherwise.
df.to_csv(path_or_buf='../data/pps/train_data.csv')

In [None]:
# Write the processed submission frame to disk; the date index is written
# too, since to_csv includes the index unless told otherwise.
sample_submission.to_csv(path_or_buf='../data/pps/test_data.csv')