In [264]:
# 데이터 처리 및 분석
import numpy as np
import pandas as pd
from datetime import datetime

pd.set_option('display.max_columns', None)  # show every column when a DataFrame is displayed (no column truncation)

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
import scipy.stats as stats
from scipy.stats import shapiro

# 지리 라이브러리
from geopy.distance import geodesic

# 전처리용
from itertools import combinations

# 시각화
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.font_manager as fm

# Korean font setup for Windows
plt.rc('font', family='Malgun Gothic')  # Malgun Gothic: the default Korean font on Windows
# Keep the minus sign from rendering as a broken glyph
plt.rcParams['axes.unicode_minus'] = False

# 머신러닝 - 전처리, 모델, 평가
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# 머신러닝 알고리즘 (필요에 따라 선택적으로 추가)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier, XGBRegressor

# 텍스트 처리
from rapidfuzz import process, fuzz

# Silence all warning messages.
# NOTE(review): this blanket filter also hides pandas warnings raised by
# later cells, so problems there will not be visible in the output.
import warnings
warnings.filterwarnings('ignore')

# # # 딥러닝 (선택사항)
# import tensorflow as tf
# from tensorflow import keras
# from keras.models import Sequential
# from keras.layers import Dense, Dropout

# 모델 해석
import shap



In [267]:
# Train timetable, indexed by the '고유번호' column.
df = pd.read_csv('./data/schedule.csv', index_col='고유번호')
# Convert the line column to numbers; values that cannot be parsed become NaN.
df['호선'] = pd.to_numeric(df['호선'], errors='coerce')

# Station table (CP949-encoded), indexed by '연번'; rows without a station
# name ('역명') are discarded.
df2 = pd.read_csv(
    './data/train_subway.csv',
    encoding='cp949',
    index_col='연번',
).dropna(subset=['역명'])

In [268]:
# Build the distinct (line, station) pairs from the station table:
#   1. keep one row per (호선, 역명) combination,
#   2. strip the '호선' suffix from the line label and cast it to int,
#   3. rename '역명' to '역사명'.
# Original author's note (translated): "pick out only the conditions".
subway_list = (
    df2[['호선', '역명']]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(호선=lambda pairs: pairs['호선'].str.replace('호선', '').astype(int))
    .rename(columns={'역명': '역사명'})
)

In [269]:
# Spot-check a few timetable rows by position, keeping only the rows where
# 역사명 is '건대입구' and 호선 is 7.
# The mask is built from the sliced rows themselves (via a callable passed to
# .loc) rather than from the full frame, so pandas does not have to reindex a
# full-length boolean Series onto a 7-row slice.
df.iloc[117173:117180].loc[
    lambda rows: (rows['역사명'] == '건대입구') & (rows['호선'] == 7)
]

Unnamed: 0_level_0,호선,역사코드,역사명,주중주말,방향,급행여부,열차코드,열차도착시간,열차출발시간,출발역,도착역
고유번호,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
117174,7.0,2729,건대입구,DAY,UP,0,7006,05:43:20,05:44:00,청담,장암
117175,7.0,2729,건대입구,DAY,UP,0,7008,05:55:10,05:55:50,내방,장암
117176,7.0,2729,건대입구,DAY,UP,0,7010,06:08:10,06:08:40,보라매,장암
117177,7.0,2729,건대입구,DAY,UP,0,7012,06:19:30,06:20:00,온수,장암
117178,7.0,2729,건대입구,DAY,UP,0,7014,06:29:20,06:30:00,온수,장암
117179,7.0,2729,건대입구,DAY,UP,0,7016,06:38:20,06:39:00,석남,도봉산
117180,7.0,2729,건대입구,DAY,UP,0,7018,06:46:20,06:47:00,온수,장암


In [270]:
# Attach timetable rows to every (line, station) pair. A left join keeps all
# pairs from subway_list, including any that have no match in df.
# NOTE(review): this rebinds df2, which previously held the raw station table,
# so the cell that builds subway_list from df2 cannot be re-run after this one.
df2 = subway_list.merge(
    df,
    on=['호선', '역사명'],
    how='left',
)
# Display only: the result of drop() is not assigned, so df2 keeps '역사코드'.
df2.drop(columns='역사코드')

Unnamed: 0,호선,역사명,주중주말,방향,급행여부,열차코드,열차도착시간,열차출발시간,출발역,도착역
0,1,서울역,DAY,UP,0,K802,05:19:30,05:20:00,구로,동두천
1,1,서울역,DAY,UP,0,K804,05:29:30,05:30:00,구로,청량리
2,1,서울역,DAY,UP,0,S902,,05:24:00,서울역,의정부
3,1,서울역,DAY,UP,0,K2,05:47:30,05:48:00,부평,소요산
4,1,서울역,DAY,UP,0,K806,05:39:30,05:40:00,구로,동두천
...,...,...,...,...,...,...,...,...,...,...
12754,7,건대입구,SAT,UP,0,7338,23:19:20,23:20:00,석남,장암
12755,7,건대입구,SAT,UP,0,7340,23:29:20,23:30:00,석남,도봉산
12756,7,건대입구,SAT,UP,0,7342,23:39:20,23:40:00,석남,도봉산
12757,7,건대입구,SAT,UP,0,7344,23:49:20,23:50:00,온수,태릉입구


In [271]:
# Build the per-train schedule table with columns:
#   호선, 역사명, 주중주말, 열차운행시간, 시간
# - 열차운행시간: the arrival time, falling back to the departure time where
#   the arrival time is missing.
# - 시간: the hour component of 열차운행시간, as int.
# The columns are added with .assign() on a freshly selected frame instead of
# being assigned onto a column-subset of df2, which avoids the
# chained-assignment (SettingWithCopyWarning) pattern.
schedule = (
    df2[['호선', '역사명', '주중주말']]
    .assign(열차운행시간=df2['열차도착시간'].fillna(df2['열차출발시간']))
    .assign(시간=lambda rows: pd.to_datetime(rows['열차운행시간']).dt.hour.astype(int))
)


In [272]:
# Display the derived schedule table for inspection.
schedule

Unnamed: 0,호선,역사명,주중주말,열차운행시간,시간
0,1,서울역,DAY,05:19:30,5
1,1,서울역,DAY,05:29:30,5
2,1,서울역,DAY,05:24:00,5
3,1,서울역,DAY,05:47:30,5
4,1,서울역,DAY,05:39:30,5
...,...,...,...,...,...
12754,7,건대입구,SAT,23:19:20,23
12755,7,건대입구,SAT,23:29:20,23
12756,7,건대입구,SAT,23:39:20,23
12757,7,건대입구,SAT,23:49:20,23


In [284]:
# Count the schedule rows for each (호선, 역사명, 주중주말, 시간) combination;
# the count is stored in the '운행횟수' column.
grp = schedule.groupby(['호선', '역사명', '주중주말', '시간']).agg(
    운행횟수=pd.NamedAgg(column='시간', aggfunc='count'),
)

# Write the grouped counts to disk (the group keys are written as index columns).
# NOTE(review): a later cell writes grp_df to this same path, replacing this file.
grp.to_csv('./data2/subway_schedule.csv')

grp_df = pd.DataFrame(grp)

In [290]:
# Turn the group keys (호선, 역사명, 주중주말, 시간) back into regular columns.
# Derived from `grp` rather than from `grp_df` itself so the cell is
# idempotent: re-running `grp_df = grp_df.reset_index()` would add a spurious
# 'index' column on every extra execution.
grp_df = grp.reset_index()

In [291]:
# Step 1: for every (line, station) pair, check whether an 'END' entry exists.
has_end = (
    grp_df.reset_index()
    .groupby(['호선', '역사명'])['주중주말']
    .apply(lambda day_types: 'END' in day_types.values)
)
# Step 2: keep only the pairs that have no 'END' entry.
no_end_groups = has_end[~has_end].index

# Step 3: for each such pair, copy its 'SAT' rows and relabel them as 'END'.
new_rows = [
    grp_df[
        (grp_df['호선'] == line)
        & (grp_df['역사명'] == station)
        & (grp_df['주중주말'] == 'SAT')
    ].assign(주중주말='END')
    for line, station in no_end_groups
]

# Step 4: append the relabelled rows to the original frame (skipped when
# nothing is missing).
if new_rows:
    grp_df = pd.concat([grp_df, *new_rows], ignore_index=True)

# Check the number of rows per day type.
grp_df['주중주말'].value_counts()

주중주말
SAT    196
END    196
DAY    194
Name: count, dtype: int64

In [296]:
# Write the final table to CSV without the row index.
grp_df.to_csv('./data2/subway_schedule.csv', index=False)