## 데이터 확인하기
컬럼명이 원래 위치에서 오른쪽으로 하나씩 밀려 있는 데이터셋 편집하기  
-> `index_col=False`를 지정하지 않아도 정상적으로 불러올 수 있도록 수정하는 것이 목표!

In [1]:
import pandas as pd

In [37]:
# Load the raw CSV with index_col=False so pandas does not use the first
# column as the index; with this option the header lines up with the data.
df = pd.read_csv('./202308_서울시_지하철호선별_역별_승하자_인원정보.csv', index_col=False)
df

Unnamed: 0,사용일자,노선명,역명,승차총승객수,하차총승객수,등록일자
0,20230801,1호선,서울역,52929,50606,20230804
1,20230801,2호선,선릉,62329,54557,20230804
2,20230801,중앙선,지평,71,85,20230804
3,20230801,중앙선,용문,2393,2407,20230804
4,20230801,중앙선,원덕,318,295,20230804
...,...,...,...,...,...,...
18908,20230831,일산선,화정,18606,18883,20230903
18909,20230831,일산선,원당,11858,11312,20230903
18910,20230831,일산선,삼송,21185,20290,20230903
18911,20230831,일산선,지축,1,0,20230903


In [23]:
# Reload the same file with default options to reproduce the problem: the
# first field of each row (the date) ends up as the index, so every column
# name sits one position off from its data.
df = pd.read_csv('./202308_서울시_지하철호선별_역별_승하자_인원정보.csv')
df

Unnamed: 0,사용일자,노선명,역명,승차총승객수,하차총승객수,등록일자
20230801,1호선,서울역,52929,50606,20230804,
20230801,2호선,선릉,62329,54557,20230804,
20230801,중앙선,지평,71,85,20230804,
20230801,중앙선,용문,2393,2407,20230804,
20230801,중앙선,원덕,318,295,20230804,
...,...,...,...,...,...,...
20230831,일산선,화정,18606,18883,20230903,
20230831,일산선,원당,11858,11312,20230903,
20230831,일산선,삼송,21185,20290,20230903,
20230831,일산선,지축,1,0,20230903,


데이터와 컬럼이 불일치하므로 수정이 필요함

In [24]:
# Capture the misaligned column names as read from the file; they are reused
# further down to relabel the columns once the data has been realigned.
old_columns = df.columns.to_list()
old_columns

['사용일자', '노선명', '역명', '승차총승객수', '하차총승객수', '등록일자']

In [25]:
# Drop the wrongly added column (a column, not a row): the last one, which
# carries the label '등록일자' after the shift.
df.drop(columns=['등록일자'], inplace=True)
df

Unnamed: 0,사용일자,노선명,역명,승차총승객수,하차총승객수
20230801,1호선,서울역,52929,50606,20230804
20230801,2호선,선릉,62329,54557,20230804
20230801,중앙선,지평,71,85,20230804
20230801,중앙선,용문,2393,2407,20230804
20230801,중앙선,원덕,318,295,20230804
...,...,...,...,...,...
20230831,일산선,화정,18606,18883,20230903
20230831,일산선,원당,11858,11312,20230903
20230831,일산선,삼송,21185,20290,20230903
20230831,일산선,지축,1,0,20230903


In [26]:
# Inspect the index to see which values it currently holds.
df.index

Index([20230801, 20230801, 20230801, 20230801, 20230801, 20230801, 20230801,
       20230801, 20230801, 20230801,
       ...
       20230831, 20230831, 20230831, 20230831, 20230831, 20230831, 20230831,
       20230831, 20230831, 20230831],
      dtype='int64', length=18913)

사용일자로 활용되어야 하는 컬럼이 인덱스로 잘못 설정된 점을 발견함

In [27]:
# Reset the index: the old index values move back into the frame as an
# ordinary column (labelled 'index') and a default integer index replaces them.
df.reset_index(inplace=True)
df

Unnamed: 0,index,사용일자,노선명,역명,승차총승객수,하차총승객수
0,20230801,1호선,서울역,52929,50606,20230804
1,20230801,2호선,선릉,62329,54557,20230804
2,20230801,중앙선,지평,71,85,20230804
3,20230801,중앙선,용문,2393,2407,20230804
4,20230801,중앙선,원덕,318,295,20230804
...,...,...,...,...,...,...
18908,20230831,일산선,화정,18606,18883,20230903
18909,20230831,일산선,원당,11858,11312,20230903
18910,20230831,일산선,삼송,21185,20290,20230903
18911,20230831,일산선,지축,1,0,20230903


In [28]:
# Show the column labels after the reset, before they are relabelled.
df.columns

Index(['index', '사용일자', '노선명', '역명', '승차총승객수', '하차총승객수'], dtype='object')

In [29]:
# Build the rename mapping by position: each current label is paired with the
# header name that belongs to the data in that position.
new_column = dict(zip(df.columns, old_columns))
new_column

{'index': '사용일자',
 '사용일자': '노선명',
 '노선명': '역명',
 '역명': '승차총승객수',
 '승차총승객수': '하차총승객수',
 '하차총승객수': '등록일자'}

In [30]:
# Apply the mapping in place so every column carries the name that matches
# its data.
df.rename(columns=new_column, inplace=True)
df

Unnamed: 0,사용일자,노선명,역명,승차총승객수,하차총승객수,등록일자
0,20230801,1호선,서울역,52929,50606,20230804
1,20230801,2호선,선릉,62329,54557,20230804
2,20230801,중앙선,지평,71,85,20230804
3,20230801,중앙선,용문,2393,2407,20230804
4,20230801,중앙선,원덕,318,295,20230804
...,...,...,...,...,...,...
18908,20230831,일산선,화정,18606,18883,20230903
18909,20230831,일산선,원당,11858,11312,20230903
18910,20230831,일산선,삼송,21185,20290,20230903
18911,20230831,일산선,지축,1,0,20230903


In [31]:
# Save the corrected table without the index (index=False) so the new file
# has exactly one header name per data field.
df.to_csv('./edited_202308_서울시_지하철호선별_역별_승하자_인원정보.csv', index=False)