### 라이브러리 임포트

In [33]:
import pandas as pd
import numpy as np
import random
import os
from catboost import CatBoostRegressor, Pool

### 시드 고정

In [34]:
def seed_everything(seed):
    """Seed the random number sources used in this notebook.

    Seeds Python's ``random`` module and NumPy's global generator with
    ``seed`` and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed``.
    """
    # NOTE(review): the interpreter reads PYTHONHASHSEED at start-up, so
    # setting it here presumably only affects child processes - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed = 42
seed_everything(seed)  # fix the seed

### 데이터 불러오기 & column 제거

In [35]:
# Load the train / test tables and discard the session and user identifier
# columns in the same step.
df_train = pd.read_csv('train.csv').drop(columns=['sessionID', 'userID'])
df_test = pd.read_csv('test.csv').drop(columns=['sessionID', 'userID'])

In [36]:
# Print the column names, non-null counts and dtypes of the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252289 entries, 0 to 252288
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   TARGET               252289 non-null  float64
 1   browser              252289 non-null  object 
 2   OS                   252289 non-null  object 
 3   device               252289 non-null  object 
 4   new                  252289 non-null  int64  
 5   quality              252289 non-null  float64
 6   duration             252289 non-null  float64
 7   bounced              252289 non-null  int64  
 8   transaction          252289 non-null  float64
 9   transaction_revenue  252289 non-null  float64
 10  continent            252289 non-null  object 
 11  subcontinent         252289 non-null  object 
 12  country              252289 non-null  object 
 13  traffic_source       252289 non-null  object 
 14  traffic_medium       252289 non-null  object 
 15  keyword          

### nan값 채우기

In [37]:
# Replace every missing value in both frames with the placeholder string 'NAN'.
df_train = df_train.fillna('NAN')
df_test = df_test.fillna('NAN')


In [38]:
import re  # not used in this cell; kept because the 'from' feature cell needs it


def _leading_category_id(column):
    """Strip ``Category`` prefixes and keep the text before the first ``_``.

    Example: ``'Category12_Category3'`` becomes ``'12'``; a value without
    ``Category<digits>`` or ``_`` (such as ``'NAN'``) is returned unchanged.

    Args:
        column: pandas Series of strings.

    Returns:
        A new Series with the cleaned labels.
    """
    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would silently skip the substitution
    # (older versions only emitted a FutureWarning).
    without_prefix = column.str.replace(r'Category(\d+)', r'\1', regex=True)
    # split() returns the whole string as element 0 when there is no '_',
    # so no separate membership check is needed.
    return without_prefix.apply(lambda label: label.split('_')[0])


df_train['keyword'] = _leading_category_id(df_train['keyword'])
df_test['keyword'] = _leading_category_id(df_test['keyword'])

df_train['referral_path'] = _leading_category_id(df_train['referral_path'])
df_test['referral_path'] = _leading_category_id(df_test['referral_path'])

  df_train['keyword'] = df_train['keyword'].str.replace(r'Category(\d+)', r'\1')
  df_test['keyword'] = df_test['keyword'].str.replace(r'Category(\d+)', r'\1')
  df_train['referral_path'] = df_train['referral_path'].str.replace(r'Category(\d+)', r'\1')
  df_test['referral_path'] = df_test['referral_path'].str.replace(r'Category(\d+)', r'\1')


In [39]:

def _digits_or_nan(text):
    """Return ``text`` with all non-digit runs removed, or 'NAN' if it has no digit."""
    if re.search(r'\d', text):
        return re.sub(r'\D+', '', text)
    return 'NAN'


# Join keyword and referral_path with '_' and reduce the result to its digits.
df_train['from'] = (
    df_train['keyword'] + '_' + df_train['referral_path']
).apply(_digits_or_nan)

df_test['from'] = (
    df_test['keyword'] + '_' + df_test['referral_path']
).apply(_digits_or_nan)


In [40]:
# df_train = df_train.drop(['keyword', 'referral_path'], axis=1)
# df_test = df_test.drop(['keyword', 'referral_path'], axis=1)


### object type을 category type으로 변경 후 label 분리

In [41]:
# Columns passed to CatBoost as categorical features.
categorical_features = [
    'browser',
    'OS',
    'device',
    'continent',
    'subcontinent',
    'country',
    'traffic_source',
    'traffic_medium',
    'from',
    'keyword',
    'referral_path',
]

# Cast each listed column to the pandas 'category' dtype in both frames.
df_train = df_train.astype({name: 'category' for name in categorical_features})
df_test = df_test.astype({name: 'category' for name in categorical_features})

# Separate the label column from the features and wrap them in a CatBoost Pool.
y_train = df_train['TARGET']
x_train = df_train.drop(columns='TARGET')
train_pool = Pool(
    data=x_train,
    label=y_train,
    cat_features=categorical_features,
)

### 모델 정의 후 학습

In [42]:
# Create a CatBoost regressor seeded with `seed` and with training output
# silenced; no other hyperparameters are passed. Then fit it on the
# training pool.
clf = CatBoostRegressor(random_state=seed, verbose=False)
clf.fit(train_pool)

<catboost.core.CatBoostRegressor at 0x7f8dadc8ea60>

### 테스트 데이터 예측

In [43]:
# Wrap the test frame in a Pool using the same categorical column list as
# training, predict with the fitted model, and display the predictions.
test_pool = Pool(data=df_test, cat_features=categorical_features)
pred = clf.predict(test_pool)
pred

array([27.49708697,  1.00639068,  2.60671601, ...,  2.73514667,
        4.90680834,  1.0174196 ])

### 0보다 작은 값을 0으로 보정

In [44]:
# Clamp negative predictions to 0; every other value is kept as is.
pred = [max(value, 0) for value in pred]
# pred = [round(i) if i < 0 else round(i) for i in pred]


### csv파일에 예측값 입력 후 제출

In [45]:
# Read the submission template and set its TARGET column to the predictions.
df_submit = pd.read_csv('sample_submission.csv').assign(TARGET=pred)

In [46]:
# Display the first rows of the submission frame.
df_submit.head()

Unnamed: 0,sessionID,TARGET
0,SESSION_252289,27
1,SESSION_252290,1
2,SESSION_252291,3
3,SESSION_252292,4
4,SESSION_252293,11


In [47]:
# Write the submission frame to CSV without the index column.
# NOTE(review): this is the same path the template was read from, so the
# original sample file gets overwritten - confirm this is intended.
df_submit.to_csv("sample_submission.csv", index=False)