In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the input CSV (relative to the notebook's working directory)
data_path = "preprocessed_course_data.csv"

# Read the whole file into the working DataFrame used by every later cell
df = pd.read_csv(filepath_or_buffer=data_path)

In [3]:
# Split course_id ("<university>/<course_code>/<year>[_<semester>]") into its
# parts and attach a human-readable course title. Skipped when the column is absent.
if 'course_id' in df.columns:
    print("\n=== Course ID 파싱 시작 ===")

    # Lookup table: full course ID -> course title
    course_dict = {
        "HarvardX/CB22x/2013_Spring": "The Ancient Greek Hero",
        "HarvardX/CS50x/2012": "Introduction to Computer Science",
        "HarvardX/ER22x/2013_Spring": "Justice",
        "HarvardX/PH207x/2012_Fall": "Health in Numbers: Quantitative Methods in Clinical & Public Health Research",
        "HarvardX/PH278x/2013_Spring": "Human Health and Global Environmental Change",
        "MITx/6.002x/2012_Fall": "Circuits and Electronics",
        "MITx/6.002x/2013_Spring": "Circuits and Electronics",
        "MITx/14.73x/2013_Spring": "The Challenges of Global Poverty",
        "MITx/2.01x/2013_Spring": "Elements of Structures",
        "MITx/3.091x/2012_Fall": "Introduction to Solid State Chemistry",
        "MITx/3.091x/2013_Spring": "Introduction to Solid State Chemistry",
        "MITx/6.00x/2012_Fall": "Introduction to Computer Science and Programming",
        "MITx/6.00x/2013_Spring": "Introduction to Computer Science and Programming",
        "MITx/7.00x/2013_Spring": "Introduction to Biology - The Secret of Life",
        "MITx/8.02x/2013_Spring": "Physics II: Electricity and Magnetism",
        "MITx/8.MReV/2013_Summer": "Mechanics ReView"
    }

    # A single anchored pattern with named groups always yields exactly the four
    # columns below, whatever the data looks like. The "_<semester>" suffix is
    # optional (e.g. "HarvardX/CS50x/2012"); a missing suffix, or an ID that does
    # not match at all, becomes NaN instead of raising a column-length mismatch.
    course_id_pattern = (
        r'^(?P<university>[^/]+)/(?P<course_code>[^/]+)/'
        r'(?P<year>[^_/]+)(?:_(?P<semester>.+))?$'
    )
    parsed = df['course_id'].str.extract(course_id_pattern)

    # assign() overwrites same-named columns, so re-running this cell does not
    # append duplicate university/course_code/year/semester columns.
    # IDs absent from course_dict get a NaN title.
    df = df.assign(
        **{col: parsed[col] for col in parsed.columns},
        course_title=df['course_id'].map(course_dict),
    )

    print("Course ID 파싱 완료 ✅")
    print(f"University 분포:\n{df['university'].value_counts()}")
    print(f"Year 분포:\n{df['year'].value_counts()}")

else:
    print("\ncourse_id 컬럼이 없습니다. Course 파싱을 건너뜁니다.")

# Preview the resulting frame
df.head()


=== Course ID 파싱 시작 ===
Course ID 파싱 완료 ✅
University 분포:
university
HarvardX    337183
MITx        301894
Name: count, dtype: int64
Year 분포:
year
2012    331814
2013    307263
Name: count, dtype: int64


Unnamed: 0,index,course_id,userid_DI,registered,viewed,explored,certified,final_cc_cname_DI,LoE_DI,YoB,...,ndays_act,nplay_video,nchapters,nforum_posts,incomplete_flag,university,course_code,year,semester,course_title
0,0,HarvardX/CB22x/2013_Spring,MHxPC130442623,True,False,False,False,United States,Unknown,,...,0.0,,,0,True,HarvardX,CB22x,2013,Spring,The Ancient Greek Hero
1,1,HarvardX/CS50x/2012,MHxPC130442623,True,True,False,False,United States,Unknown,,...,9.0,,1.0,0,True,HarvardX,CS50x,2012,,Introduction to Computer Science
2,2,HarvardX/CB22x/2013_Spring,MHxPC130275857,True,False,False,False,United States,Unknown,,...,0.0,,,0,True,HarvardX,CB22x,2013,Spring,The Ancient Greek Hero
3,3,HarvardX/CS50x/2012,MHxPC130275857,True,False,False,False,United States,Unknown,,...,0.0,,,0,True,HarvardX,CS50x,2012,,Introduction to Computer Science
4,4,HarvardX/ER22x/2013_Spring,MHxPC130275857,True,False,False,False,United States,Unknown,,...,0.0,,,0,True,HarvardX,ER22x,2013,Spring,Justice


In [4]:
# Remove the leftover row-index column and the raw course_id column.
# errors='ignore' keeps this cell consistent with the parsing cell, which
# tolerates a missing course_id, and makes it safe to re-run.
df = df.drop(columns=['index', 'course_id'], errors='ignore')

In [5]:
# Convert the timestamp columns to datetime when they are present;
# values that cannot be parsed are coerced to NaT rather than raising.
for time_col in ('start_time_DI', 'last_event_DI'):
    if time_col in df.columns:
        df[time_col] = pd.to_datetime(df[time_col], errors='coerce')

In [7]:
# Where last_event_DI is missing, fall back to start_time_DI so the span is zero
last_seen = df['last_event_DI'].fillna(df['start_time_DI'])

# Whole days between the start time and the last event; missing spans count as 0
elapsed_days = (last_seen - df['start_time_DI']).dt.days.fillna(0).astype(int)

# Negative spans (last event before the start time) are floored at 0
df['study_days'] = elapsed_days.clip(lower=0)

In [8]:
# Inspect column dtypes and non-null counts after the transformations above
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 639077 entries, 0 to 639076
Data columns (total 24 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   userid_DI          639077 non-null  object        
 1   registered         639077 non-null  bool          
 2   viewed             639077 non-null  bool          
 3   explored           639077 non-null  bool          
 4   certified          639077 non-null  bool          
 5   final_cc_cname_DI  639077 non-null  object        
 6   LoE_DI             639077 non-null  object        
 7   YoB                542781 non-null  float64       
 8   gender             639077 non-null  object        
 9   grade              582339 non-null  float64       
 10  start_time_DI      639077 non-null  datetime64[ns]
 11  last_event_DI      460454 non-null  datetime64[ns]
 12  nevents            440365 non-null  float64       
 13  ndays_act          476647 non-null  float64 

In [9]:
# Derive age from year of birth. A missing YoB stays NaN because NaN
# propagates through the subtraction.
# NOTE(review): 2013 is hard-coded, presumably the dataset's reference year; confirm.
df['age'] = 2013 - df['YoB']

In [10]:
# YoB is dropped here; the derived 'age' column is kept
df = df.drop('YoB', axis=1)

In [11]:
# Summary statistics of the derived age column (NaN ages are excluded by describe)
df['age'].describe()

count    542781.000000
mean         27.778351
std           8.853174
min          10.000000
25%          22.000000
50%          25.000000
75%          31.000000
max          82.000000
Name: age, dtype: float64

In [12]:
# Write the processed frame to the current folder, without the row index.
# The 'utf-8-sig' codec prepends a BOM (presumably so spreadsheet tools detect UTF-8).
# NOTE(review): this appears to be the same file the load cell reads
# ("preprocessed_course_data.csv"), so the input is overwritten and a later
# Restart & Run All would load the already-processed data. Confirm whether a
# separate output file name is wanted.
output_path = "./preprocessed_course_data.csv"
df.to_csv(output_path, encoding='utf-8-sig', index=False)
print(f"전처리 완료 데이터 저장: {output_path}")

전처리 완료 데이터 저장: ./preprocessed_course_data.csv
