In [9]:
import pandas as pd
import numpy as np

# 데이터 불러오기
df = pd.read_csv('/Users/t2023-m0069/Desktop/Github/Omakase/Courses.csv')

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 641138 entries, 0 to 641137
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   index              641138 non-null  int64  
 1   course_id          641138 non-null  object 
 2   userid_DI          641138 non-null  object 
 3   registered         641138 non-null  int64  
 4   viewed             641138 non-null  int64  
 5   explored           641138 non-null  int64  
 6   certified          641138 non-null  int64  
 7   final_cc_cname_DI  641138 non-null  object 
 8   LoE_DI             535130 non-null  object 
 9   YoB                544533 non-null  float64
 10  gender             554332 non-null  object 
 11  grade              592766 non-null  object 
 12  start_time_DI      641138 non-null  object 
 13  last_event_DI      462184 non-null  object 
 14  nevents            441987 non-null  float64
 15  ndays_act          478395 non-null  float64
 16  np

In [11]:
# grade 공백 → NaN, 숫자 변환 1.01 → 1.0
df['grade'] = pd.to_numeric(df['grade'].replace(' ', np.nan), errors='coerce')
df['grade'] = df['grade'].replace(1.01, 1.0)

# NaN 유지하면서 정수형 변환 (결측치 있을 수 있으므로 object로 남을 수 있음)
df['grade'] = df['grade'].apply(lambda x: int(x) if pd.notnull(x) else np.nan)

# viewed가 False인데 활동지표가 모두 채워진 경우 제거
conditions = ['nevents', 'ndays_act', 'nchapters', 'nforum_posts']
cond_all_filled = (df['viewed'] == False) & df[conditions].notnull().all(axis=1)
df = df[~cond_all_filled]

# viewed가 False인데 일부 지표만 채워진 경우 → 해당 값들 0으로
cond_partial_filled = (df['viewed'] == False) & df[conditions].notnull().any(axis=1)
df.loc[cond_partial_filled, conditions] = df.loc[cond_partial_filled, conditions].fillna(0)

# 날짜 변환 및 오류 제거
df['start_time_DI'] = pd.to_datetime(df['start_time_DI'], errors='coerce')
df['last_event_DI'] = pd.to_datetime(df['last_event_DI'], errors='coerce')
df = df[df['last_event_DI'] >= df['start_time_DI']]

# YoB 이상치 제거 (IQR 기준 상위만)
q3 = df['YoB'].quantile(0.75)
iqr = q3 - df['YoB'].quantile(0.25)
upper_bound = q3 + 1.5 * iqr
df = df[df['YoB'] <= upper_bound]

# 나이 파생 변수 생성
df['age'] = 2025 - df['YoB']
df['age'] = df['age'].where(df['YoB'].notnull(), 'Unknown')

# 결측치 Unknown 처리
df['LoE_DI'] = df['LoE_DI'].fillna('Unknown')
df['gender'] = df['gender'].fillna('Unknown')

# roles 컬럼 제거
df = df.drop(columns=['roles'])


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 383879 entries, 19332 to 641136
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   index              383879 non-null  int64         
 1   course_id          383879 non-null  object        
 2   userid_DI          383879 non-null  object        
 3   registered         383879 non-null  int64         
 4   viewed             383879 non-null  int64         
 5   explored           383879 non-null  int64         
 6   certified          383879 non-null  int64         
 7   final_cc_cname_DI  383879 non-null  object        
 8   LoE_DI             383879 non-null  object        
 9   YoB                383879 non-null  float64       
 10  gender             383879 non-null  object        
 11  grade              345636 non-null  float64       
 12  start_time_DI      383879 non-null  datetime64[ns]
 13  last_event_DI      383879 non-null  datetime6

In [13]:
print(df.shape)
print(df.columns)
df.head()

(383879, 21)
Index(['index', 'course_id', 'userid_DI', 'registered', 'viewed', 'explored',
       'certified', 'final_cc_cname_DI', 'LoE_DI', 'YoB', 'gender', 'grade',
       'start_time_DI', 'last_event_DI', 'nevents', 'ndays_act', 'nplay_video',
       'nchapters', 'nforum_posts', 'incomplete_flag', 'age'],
      dtype='object')


Unnamed: 0,index,course_id,userid_DI,registered,viewed,explored,certified,final_cc_cname_DI,LoE_DI,YoB,...,grade,start_time_DI,last_event_DI,nevents,ndays_act,nplay_video,nchapters,nforum_posts,incomplete_flag,age
19332,19332,HarvardX/PH207x/2012_Fall,MHxPC130313697,1,0,0,0,India,Bachelor's,1989.0,...,0.0,2012-07-24,2013-07-27,6.0,3.0,,0.0,0,,36.0
19334,19334,HarvardX/PH207x/2012_Fall,MHxPC130237753,1,1,0,0,United States,Secondary,1993.0,...,0.0,2012-07-24,2012-12-24,107.0,8.0,7.0,2.0,0,,32.0
19335,19335,HarvardX/CS50x/2012,MHxPC130202970,1,1,0,0,United States,Bachelor's,1988.0,...,0.0,2012-07-24,2013-03-28,8.0,1.0,,1.0,0,,37.0
19348,19348,HarvardX/CS50x/2012,MHxPC130223941,1,1,0,0,Other Middle East/Central Asia,Secondary,1992.0,...,0.0,2012-07-24,2013-07-15,25.0,2.0,,4.0,0,,33.0
19350,19350,HarvardX/PH207x/2012_Fall,MHxPC130317399,1,0,0,0,Australia,Master's,1980.0,...,0.0,2012-07-24,2012-08-25,3.0,2.0,,0.0,0,,45.0
