### 라이브러리 임포트

In [1]:
import pandas as pd
import numpy as np
import random
import os
from catboost import CatBoostRegressor, Pool

### 시드고정

In [2]:
def seed_everything(seed):
    """Seed the random number generators used by this notebook.

    Seeds NumPy's global generator and the standard-library ``random``
    module, and stores the seed in the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: Seed value applied to each generator.
    """
    np.random.seed(seed)
    random.seed(seed)
    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so
    # this assignment presumably only matters for processes launched later.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
# Single seed value shared by the RNG setup here and by the model later on.
seed=42
seed_everything(seed) # Fix the seed

### 데이터 불러오기 & column 제거

In [3]:
# Load both splits and discard the session/user identifier columns
# immediately, so only the remaining columns reach the later steps.
id_columns = ['sessionID', 'userID']
df_train = pd.read_csv('train.csv').drop(columns=id_columns)
df_test = pd.read_csv('test.csv').drop(columns=id_columns)

In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252289 entries, 0 to 252288
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   TARGET               252289 non-null  float64
 1   browser              252289 non-null  object 
 2   OS                   252289 non-null  object 
 3   device               252289 non-null  object 
 4   new                  252289 non-null  int64  
 5   quality              252289 non-null  float64
 6   duration             252289 non-null  float64
 7   bounced              252289 non-null  int64  
 8   transaction          252289 non-null  float64
 9   transaction_revenue  252289 non-null  float64
 10  continent            252289 non-null  object 
 11  subcontinent         252289 non-null  object 
 12  country              252289 non-null  object 
 13  traffic_source       252289 non-null  object 
 14  traffic_medium       252289 non-null  object 
 15  keyword          

### nan값 채우기

In [5]:
# Replace missing values with the literal 'NAN' placeholder, but only in
# non-numeric columns. A frame-wide fill would also write the string into any
# numeric column that has gaps, turning it into a mixed-type object column.
# Numeric gaps, if any, are left as NaN.
for _frame in (df_train, df_test):
    for _col in _frame.select_dtypes(exclude='number').columns:
        _frame[_col] = _frame[_col].fillna('NAN')


### object type을 category type으로 변경 후 label 분리

In [6]:
# Columns passed to CatBoost as categorical features.
categorical_features = [
    "browser", "OS", "device",
    "continent", "subcontinent", "country",
    "traffic_source", "traffic_medium",
    "keyword", "referral_path",
]

# Cast those columns to the pandas 'category' dtype in both splits.
category_dtypes = dict.fromkeys(categorical_features, 'category')
df_train = df_train.astype(category_dtypes)
df_test = df_test.astype(category_dtypes)

# Separate the regression label from the features and wrap both in a Pool.
y_train = df_train['TARGET']
x_train = df_train.drop(columns='TARGET')
train_pool = Pool(
    data=x_train,
    label=y_train,
    cat_features=categorical_features,
)

### 모델 정의 후 학습

In [7]:
# Regressor with library defaults; only the seed and log verbosity are set.
model_params = {'random_state': seed, 'verbose': False}
clf = CatBoostRegressor(**model_params)
# Kept as a bare expression so the notebook still echoes the fitted model.
clf.fit(train_pool)

<catboost.core.CatBoostRegressor at 0x7f7ac11d70d0>

### 테스트 데이터 예측

In [8]:
# Wrap the test frame in a Pool with the same categorical column list that
# was used for training, then predict.
test_pool = Pool(
    data=df_test,
    cat_features=categorical_features,
)
pred = clf.predict(test_pool)
pred  # bare expression: echoes the prediction array as the cell output

array([27.76413837,  1.01639991,  2.67552931, ...,  2.74967538,
        4.95644313,  1.02338569])

### 0보다 작은 값을 0으로 보정

In [9]:
# Clamp negative predictions to 0 in a single vectorized call. This keeps
# `pred` a NumPy array instead of building a Python list of mixed int/float
# values element by element.
pred = np.clip(pred, 0, None)

### csv파일에 예측값 입력 후 제출

In [10]:
# Load the submission template and set its TARGET column to the predictions.
df_submit = pd.read_csv('sample_submission.csv').assign(TARGET=pred)

In [11]:
df_submit.head()

Unnamed: 0,sessionID,TARGET
0,SESSION_252289,27.764138
1,SESSION_252290,1.0164
2,SESSION_252291,2.675529
3,SESSION_252292,4.327607
4,SESSION_252293,10.138248


In [12]:
# Write the filled-in submission frame to disk without the index column.
# NOTE(review): this is the same path the template was read from, so the
# template file is overwritten in place — confirm that is intended.
df_submit.to_csv("sample_submission.csv", index=False)