In [30]:
import numpy as np 
import pandas as pd 
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler

user_id → 각 사용자를 식별하는 고유 식별자
gender → 사용자 성별 (남성/여성/기타)
age → 사용자 나이
country → 사용자 위치 (국가)
subscription_type → Spotify 구독 유형 (무료, 프리미엄, 가족, 학생)
listening_time → 하루 청취 시간(분)
songs_played_per_day → 하루에 재생된 노래 수
skip_rate → 건너뛴 노래 비율
device_type → 사용된 기기 (모바일, 데스크톱, 웹)
ads_listened_per_week → 주당 청취한 광고 수
offline_listening → 오프라인 모드 사용
is_churned → 목표 변수 (0 = 활성, 1 = 이탈)

In [31]:
# Load the Spotify churn dataset (path is relative to the notebook's working directory).
# Original author's note: the data has no missing values and no outliers.
df = pd.read_csv("data/spotify_churn_dataset.csv")

# Print column dtypes and non-null counts.
df.info()

# Make the "no missing values" claim executable: fail fast if a refreshed CSV
# ever contains NaNs instead of relying on a manual read of the info() output.
missing_per_column = df.isna().sum()
assert missing_per_column.sum() == 0, (
    f"unexpected missing values:\n{missing_per_column[missing_per_column > 0]}"
)

# Summary statistics of the numeric columns; kept as the last expression so it
# is rendered as the cell output.
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   user_id                8000 non-null   int64  
 1   gender                 8000 non-null   object 
 2   age                    8000 non-null   int64  
 3   country                8000 non-null   object 
 4   subscription_type      8000 non-null   object 
 5   listening_time         8000 non-null   int64  
 6   songs_played_per_day   8000 non-null   int64  
 7   skip_rate              8000 non-null   float64
 8   device_type            8000 non-null   object 
 9   ads_listened_per_week  8000 non-null   int64  
 10  offline_listening      8000 non-null   int64  
 11  is_churned             8000 non-null   int64  
dtypes: float64(1), int64(7), object(4)
memory usage: 750.1+ KB


Unnamed: 0,user_id,age,listening_time,songs_played_per_day,skip_rate,ads_listened_per_week,offline_listening,is_churned
count,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0
mean,4000.5,37.662125,154.06825,50.12725,0.300127,6.943875,0.74775,0.258875
std,2309.54541,12.740359,84.015596,28.449762,0.173594,13.617953,0.434331,0.438044
min,1.0,16.0,10.0,1.0,0.0,0.0,0.0,0.0
25%,2000.75,26.0,81.0,25.0,0.15,0.0,0.0,0.0
50%,4000.5,38.0,154.0,50.0,0.3,0.0,1.0,0.0
75%,6000.25,49.0,227.0,75.0,0.45,5.0,1.0,1.0
max,8000.0,59.0,299.0,99.0,0.6,49.0,1.0,1.0


In [32]:
# Target: the churn flag (0 = active, 1 = churned).
y = df['is_churned']

# Predictors: every column except the target and the user_id identifier.
X = df.drop(['user_id', 'is_churned'], axis=1)

In [33]:
# One-hot encode the categorical columns.
# drop_first=False keeps every level of each category, so each group of dummy
# columns always sums to 1. Review these columns at modelling time (e.g. drop
# one level per group for models that are sensitive to collinear inputs).
cat_cols = ['gender','country','subscription_type','device_type']

# This cell reassigns X from itself, so a second run used to raise KeyError
# because the categorical columns had already been replaced by their dummies.
# Encode only while at least one of them is still present; if some (but not
# all) are missing, get_dummies still raises so a typo is not hidden.
if any(col in X.columns for col in cat_cols):
    X = pd.get_dummies(
        X,
        columns=cat_cols,
        drop_first=False
    )

X.head()

Unnamed: 0,age,listening_time,songs_played_per_day,skip_rate,ads_listened_per_week,offline_listening,gender_Female,gender_Male,gender_Other,country_AU,...,country_PK,country_UK,country_US,subscription_type_Family,subscription_type_Free,subscription_type_Premium,subscription_type_Student,device_type_Desktop,device_type_Mobile,device_type_Web
0,54,26,23,0.2,31,0,True,False,False,False,...,False,False,False,False,True,False,False,True,False,False
1,33,141,62,0.34,0,1,False,False,True,False,...,False,False,False,True,False,False,False,False,False,True
2,38,199,38,0.04,0,1,False,True,False,True,...,False,False,False,False,False,True,False,False,True,False
3,22,36,2,0.31,0,1,True,False,False,False,...,False,False,False,False,False,False,True,False,True,False
4,29,250,57,0.36,0,1,False,False,True,False,...,False,False,True,True,False,False,False,False,True,False


In [34]:
from sklearn.model_selection import train_test_split

# Columns that get standardised; the one-hot dummy columns are left untouched.
num_cols = [
    'age', 'listening_time', 'songs_played_per_day',
    'skip_rate', 'ads_listened_per_week', 'offline_listening',
]

# Hold out 25% of the rows for testing, stratified on the churn label, with a
# fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

# Standardise the numeric features. The scaler is fitted on the training rows
# only and then applied to both splits, so no test-set statistics leak in.
scaler = StandardScaler().fit(X_train[num_cols])
X_train[num_cols] = scaler.transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])