In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Silence every Python warning for the rest of the session, presumably to keep
# cell output readable.
# NOTE(review): this also hides warnings that may matter (e.g. pandas dtype or
# deprecation warnings); consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Korean-capable font families in priority order, so Hangul labels render on
# Windows (Malgun Gothic), macOS (AppleGothic) and Linux (NanumGothic).
# matplotlib uses the first family it can find and otherwise falls back to the
# generic sans-serif family.
plt.rcParams['font.family'] = ['Malgun Gothic', 'AppleGothic', 'NanumGothic', 'sans-serif']
# Draw minus signs with the ASCII hyphen-minus instead of U+2212, which the
# fonts above may not provide (it would otherwise show up as an empty box).
plt.rcParams['axes.unicode_minus'] = False

# 1. 데이터 로드

In [7]:
# Path to the input CSV, relative to the notebook's working directory.
data_path = "../Courses.csv"

# Read the whole file into a DataFrame.
df = pd.read_csv(data_path)

# Report the dimensions of the loaded table.
n_rows, n_cols = df.shape
print("✅ 데이터 로드 완료!")
print(f"   - 행 수: {n_rows:,}")
print(f"   - 열 수: {n_cols}")


✅ 데이터 로드 완료!
   - 행 수: 641,138
   - 열 수: 21


# 2. 데이터 기본 정보 확인

In [8]:
# Column names, non-null counts and dtypes.
print("컬럼 정보:")
df.info()

# Preview of the first five rows.
print("\n첫 5행 데이터:")
display(df.head())

# Summary statistics (pandas describes the numeric columns by default).
print("\n기술통계:")
display(df.describe())

# Missing-value overview: absolute count and percentage per column.
print("\n🔍 결측치 현황:")
missing_data = df.isna().sum()
# Reuse the counts computed above rather than scanning the frame a second time.
missing_percent = (missing_data / len(df) * 100).round(2)

missing_df = pd.concat(
    [missing_data.rename('결측치 수'), missing_percent.rename('결측치 비율(%)')],
    axis=1,
)
# Show only the columns that actually contain missing values.
display(missing_df.loc[missing_df['결측치 수'].gt(0)])

# Count rows that are exact duplicates of an earlier row.
duplicate_count = df.duplicated().sum()
print(f"\n중복행: {duplicate_count}개")

컬럼 정보:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 641138 entries, 0 to 641137
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   index              641138 non-null  int64  
 1   course_id          641138 non-null  object 
 2   userid_DI          641138 non-null  object 
 3   registered         641138 non-null  int64  
 4   viewed             641138 non-null  int64  
 5   explored           641138 non-null  int64  
 6   certified          641138 non-null  int64  
 7   final_cc_cname_DI  641138 non-null  object 
 8   LoE_DI             535130 non-null  object 
 9   YoB                544533 non-null  float64
 10  gender             554332 non-null  object 
 11  grade              592766 non-null  object 
 12  start_time_DI      641138 non-null  object 
 13  last_event_DI      462184 non-null  object 
 14  nevents            441987 non-null  float64
 15  ndays_act          478395 non-null  float64


Unnamed: 0,index,course_id,userid_DI,registered,viewed,explored,certified,final_cc_cname_DI,LoE_DI,YoB,...,grade,start_time_DI,last_event_DI,nevents,ndays_act,nplay_video,nchapters,nforum_posts,roles,incomplete_flag
0,0,HarvardX/CB22x/2013_Spring,MHxPC130442623,1,0,0,0,United States,,,...,0,2012-12-19,2013-11-17,,9.0,,,0,,1.0
1,1,HarvardX/CS50x/2012,MHxPC130442623,1,1,0,0,United States,,,...,0,2012-10-15,,,9.0,,1.0,0,,1.0
2,2,HarvardX/CB22x/2013_Spring,MHxPC130275857,1,0,0,0,United States,,,...,0,2013-02-08,2013-11-17,,16.0,,,0,,1.0
3,3,HarvardX/CS50x/2012,MHxPC130275857,1,0,0,0,United States,,,...,0,2012-09-17,,,16.0,,,0,,1.0
4,4,HarvardX/ER22x/2013_Spring,MHxPC130275857,1,0,0,0,United States,,,...,0,2012-12-19,,,16.0,,,0,,1.0



기술통계:


Unnamed: 0,index,registered,viewed,explored,certified,YoB,nevents,ndays_act,nplay_video,nchapters,nforum_posts,roles,incomplete_flag
count,641138.0,641138.0,641138.0,641138.0,641138.0,544533.0,441987.0,478395.0,183608.0,382385.0,641138.0,0.0,100161.0
mean,320568.5,1.0,0.624299,0.061899,0.027587,1985.253279,431.008018,5.710254,114.844173,3.634423,0.018968,,1.0
std,185080.742781,0.0,0.484304,0.240973,0.163786,8.891814,1516.116057,11.866471,426.996844,4.490987,0.229539,,0.0
min,0.0,1.0,0.0,0.0,0.0,1931.0,1.0,1.0,1.0,1.0,0.0,,1.0
25%,160284.25,1.0,0.0,0.0,0.0,1982.0,3.0,1.0,5.0,1.0,0.0,,1.0
50%,320568.5,1.0,1.0,0.0,0.0,1988.0,24.0,2.0,18.0,2.0,0.0,,1.0
75%,480852.75,1.0,1.0,0.0,0.0,1991.0,158.0,4.0,73.0,4.0,0.0,,1.0
max,641137.0,1.0,1.0,1.0,1.0,2013.0,197757.0,205.0,98517.0,48.0,20.0,,1.0



🔍 결측치 현황:


Unnamed: 0,결측치 수,결측치 비율(%)
LoE_DI,106008,16.53
YoB,96605,15.07
gender,86806,13.54
grade,48372,7.54
last_event_DI,178954,27.91
nevents,199151,31.06
ndays_act,162743,25.38
nplay_video,457530,71.36
nchapters,258753,40.36
roles,641138,100.0



중복행: 0개


# 3. 데이터 전처리

In [9]:
# Work on an independent deep copy so the raw frame `df` stays untouched by
# the preprocessing steps that follow.
df_clean = df.copy(deep=True)

In [13]:
# 3-1. Parse the date columns
date_columns = ['start_time_DI', 'last_event_DI']

# Only the columns that actually exist in the frame are converted. The same
# filtered list is reused for the dtype check below, so a missing column can
# no longer cause a KeyError on the final line.
present_date_columns = [col for col in date_columns if col in df_clean.columns]

for col in present_date_columns:
    # errors='coerce' turns unparseable values into NaT instead of raising.
    df_clean[col] = pd.to_datetime(df_clean[col], errors='coerce')

# Display the resulting dtypes to confirm the conversion.
df_clean[present_date_columns].dtypes

start_time_DI    datetime64[ns]
last_event_DI    datetime64[ns]
dtype: object

In [None]:
# 3-2. Compute learner age and age group
print("\n2️⃣ 연령 계산...")
if 'YoB' in df_clean.columns:
    # Fallback reference year, used only when the course start date column is
    # not available.
    current_year = 2025

    if 'start_time_DI' in df_clean.columns:
        # Measure age in the year the learner started the course. Subtracting
        # from a fixed 2025 would shift every learner upwards by the number of
        # years between enrollment and 2025 and put them in the wrong
        # age_group. pd.to_datetime is a no-op on an already-parsed column, so
        # this cell does not depend on the date-parsing cell having run.
        reference_year = pd.to_datetime(
            df_clean['start_time_DI'], errors='coerce'
        ).dt.year
    else:
        reference_year = current_year

    # Rows with a missing YoB (or an unparseable start date) yield NaN.
    df_clean['age'] = reference_year - df_clean['YoB']

    # Age-group buckets. Bins are right-inclusive: (0, 25], (25, 35], ...,
    # (55, 100]. Ages outside (0, 100] and missing ages get a NaN age_group.
    df_clean['age_group'] = pd.cut(
        df_clean['age'],
        bins=[0, 25, 35, 45, 55, 100],
        labels=['25세 이하', '26-35세', '36-45세', '46-55세', '56세 이상'],
    )

    print("   ✅ 연령 계산 완료")
    print(f"   - 평균 연령: {df_clean['age'].mean():.1f}세")
    print(f"   - 연령 범위: {df_clean['age'].min()}세 ~ {df_clean['age'].max()}세")
