### Import

In [7]:
import pandas as pd
import numpy as np
import random
import os
from datetime import datetime

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Lasso

### Fixed Random-Seed

In [8]:
def seed_everything(seed):
    """Seed the random number sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator,
    and stores the seed as a string in ``os.environ['PYTHONHASHSEED']``.

    Args:
        seed: Seed value passed to both generators; stringified for the
            environment variable.
    """
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here
    # only affects processes launched afterwards, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed

### Load Data

In [9]:
# Read the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [10]:
# Remove the session/user identifier columns from both splits.
id_columns = ['sessionID', 'userID']
train = train.drop(columns=id_columns)
test = test.drop(columns=id_columns)

In [18]:
# Show the first five rows, then print column dtypes and non-null counts.
display(train.head())
train.info()

Unnamed: 0,TARGET,browser,OS,device,new,quality,duration,bounced,transaction,transaction_revenue,continent,subcontinent,country,traffic_source,traffic_medium,keyword,referral_path
0,17.0,Chrome,Macintosh,desktop,0,45.0,839.0,0,0.0,0.0,Americas,Northern America,United States,google,organic,Category8,
1,3.0,Chrome,Windows,desktop,1,1.0,39.0,0,0.0,0.0,Europe,Western Europe,Germany,google,organic,Category8,
2,1.0,Samsung Internet,Android,mobile,1,1.0,0.0,1,0.0,0.0,Asia,Southeast Asia,Malaysia,(direct),(none),,
3,1.0,Chrome,Macintosh,desktop,1,1.0,0.0,1,0.0,0.0,Americas,Northern America,United States,Partners,affiliate,,
4,1.0,Chrome,iOS,mobile,0,1.0,0.0,1,0.0,0.0,Americas,Northern America,United States,groups.google.com,referral,,Category6_Path_0000


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252289 entries, 0 to 252288
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   TARGET               252289 non-null  float64
 1   browser              252289 non-null  object 
 2   OS                   252289 non-null  object 
 3   device               252289 non-null  object 
 4   new                  252289 non-null  int64  
 5   quality              252289 non-null  float64
 6   duration             252289 non-null  float64
 7   bounced              252289 non-null  int64  
 8   transaction          252289 non-null  float64
 9   transaction_revenue  252289 non-null  float64
 10  continent            252289 non-null  object 
 11  subcontinent         252289 non-null  object 
 12  country              252289 non-null  object 
 13  traffic_source       252289 non-null  object 
 14  traffic_medium       252289 non-null  object 
 15  keyword          

In [17]:
# Print how often each browser value occurs in the training split.
print(train['browser'].value_counts())

Chrome                                                  174395
Safari                                                   35689
Android Webview                                          13855
Firefox                                                   8436
Internet Explorer                                         4573
                                                         ...  
eosutpkiahjzvdgcwxlmyfqbrn                                   1
;__CT_JOB_ID__:76fd1acb-e365-43c0-b967-908bcf5d5b59;         1
wvsmagudcqeytijorlhxnfzkbp                                   1
efkaxnbyohqtspzlvcwrjmigdu                                   1
NokiaC7-00                                                   1
Name: browser, Length: 70, dtype: int64


In [19]:
# Print the value frequencies of each listed column, one column at a time.
inspected_columns = [
    'OS', 'device', 'continent', 'subcontinent', 'country',
    'traffic_source', 'traffic_medium', 'keyword', 'referral_path',
]
for col in inspected_columns:
    print(train[col].value_counts())

Windows             88119
Macintosh           60316
Android             51870
iOS                 29778
Linux                9562
Chrome OS            9222
(not set)            2592
Tizen                 321
Samsung               259
Windows Phone         128
OS/2                   36
Xbox                   31
BlackBerry             29
Playstation Vita        9
Nintendo 3DS            6
Firefox OS              4
FreeBSD                 3
SunOS                   1
Nintendo WiiU           1
SymbianOS               1
Nokia                   1
Name: OS, dtype: int64
desktop    167133
mobile      76460
tablet       8696
Name: device, dtype: int64
Americas     126506
Asia          57254
Europe        53748
Africa        10514
Oceania        3931
(not set)       336
Name: continent, dtype: int64
Northern America      111945
Southern Asia          21744
Western Europe         18325
Northern Europe        15432
Southeast Asia         13977
Eastern Asia           13598
Southern Europe        107

### 데이터 전처리 1 : 결측값 대체

In [None]:
# traffic_medium value counts (training split)

# organic      107370
# referral      70047
# (none)        59022
# cpc            9978
# affiliate      5365
# cpm             501
# (not set)         6

# How missingness of keyword / referral_path relates to traffic_medium:
# traffic_medium == organic   : referral_path is missing; keyword is present for some rows
# traffic_medium == referral  : keyword is missing; referral_path is present
# traffic_medium == (none)    : keyword is missing; referral_path is present for some rows / traffic_source is (direct)
# traffic_medium == cpc       : referral_path is missing; keyword is present for some rows
# traffic_medium in affiliate, cpm, (not set) : keyword and referral_path are both missing


 

In [4]:
# Count the missing values in every column of the training split.
train_missing_values = train.isna().sum()
print("Train 데이터셋의 결측값:")
# Only show the columns that actually contain missing values.
print(train_missing_values.loc[train_missing_values.gt(0)])

Train 데이터셋의 결측값:
keyword          137675
referral_path    161107
dtype: int64


In [5]:
# Replace missing `keyword` / `referral_path` values in the training split
# with each column's most frequent (mode) training value.
#
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `train[column]`: that form is chained
# assignment, which raises a FutureWarning on pandas 2.x and, with
# Copy-on-Write enabled (the default from pandas 3.0), modifies only a
# temporary Series and leaves `train` unchanged.
for column in ('keyword', 'referral_path'):
    train[column] = train[column].fillna(train[column].mode()[0])

In [6]:
# Fill missing `keyword` / `referral_path` values in the test split with the
# mode of the corresponding *training* column, so no statistic is computed
# from the test data.
#
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `test[column]`: that form is chained
# assignment, which raises a FutureWarning on pandas 2.x and, with
# Copy-on-Write enabled (the default from pandas 3.0), modifies only a
# temporary Series and leaves `test` unchanged.
for column in ('keyword', 'referral_path'):
    test[column] = test[column].fillna(train[column].mode()[0])

### 데이터 전처리 2: Label 인코딩 및 변수 제거

In [7]:
# Names of the object-dtype columns of the training split.
categorical_features = [
    name for name, dtype in train.dtypes.items() if dtype == "object"
]

# Print the number of distinct values held by each of those columns.
for col in categorical_features:
    print(f"{col}: {train[col].nunique()}")

sessionID: 252289
userID: 206024
browser: 70
OS: 21
device: 3
continent: 6
subcontinent: 23
country: 205
traffic_source: 168
traffic_medium: 7
keyword: 623
referral_path: 1578


In [9]:
# Object-dtype columns are the ones that get integer label codes.
encoding_target = [
    name for name, dtype in train.dtypes.items() if dtype == "object"
]

for col in encoding_target:
    le = LabelEncoder()
    # Learn the classes from the training split only; fitting directly on the
    # test data would be data leakage.
    train[col] = le.fit_transform(train[col])

    # Categories that occur only in the test split are appended to
    # le.classes_, so they receive codes after the training classes.
    known_classes = set(le.classes_)
    for case in np.unique(test[col]):
        if case not in known_classes:
            le.classes_ = np.append(le.classes_, case)

    test[col] = le.transform(test[col])

### x와 y 설정

In [10]:
# Separate the TARGET column from the remaining training columns.
train_y = train['TARGET']
train_x = train.drop(columns=['TARGET'])

# The test frame is used as the feature matrix as-is (same object, no copy).
test_x = test

### 모델학습 및 예측

In [11]:
# Fit a Lasso regression (alpha=0.1) on the training features and target,
# then predict TARGET for the test features.
lasso = Lasso(alpha=0.1)
lasso.fit(train_x, train_y)
preds = lasso.predict(test_x)

### Submission

In [12]:
# Load the sample submission file and display it as the cell output.
submission = pd.read_csv('./sample_submission.csv')
submission

Unnamed: 0,sessionID,TARGET
0,SESSION_252289,0
1,SESSION_252290,0
2,SESSION_252291,0
3,SESSION_252292,0
4,SESSION_252293,0
...,...,...
79781,SESSION_332070,0
79782,SESSION_332071,0
79783,SESSION_332072,0
79784,SESSION_332073,0


In [13]:
# Overwrite the TARGET column of the submission frame with the model
# predictions, then display the result.
# NOTE(review): assumes `preds` is in the same row order as the submission
# file — confirm that test.csv and sample_submission.csv are aligned.
submission['TARGET'] = preds
submission

Unnamed: 0,sessionID,TARGET
0,SESSION_252289,22.826651
1,SESSION_252290,1.362503
2,SESSION_252291,3.512813
3,SESSION_252292,4.196410
4,SESSION_252293,5.924896
...,...,...
79781,SESSION_332070,1.424369
79782,SESSION_332071,1.439631
79783,SESSION_332072,4.137534
79784,SESSION_332073,4.521409


In [14]:
# Write the submission frame to CSV, omitting the DataFrame index.
submission.to_csv('./baseline_submission.csv', index=False)