In [1]:
import numpy as np
import pandas as pd
import json

import datetime
import holidays

import catboost
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split

from tqdm import tqdm

from geopy.geocoders import Nominatim
# Nominatim geocoder client (geopy); none of the cells shown here reference it.
geolocator = Nominatim(user_agent="my_request")

In [11]:
# Keep the full DataFrame in ``train``: the cells below index it by column
# name and pass it to preprocess_dataframe, which cannot work on a bare
# column Index. Only the column listing is displayed here.
train = pd.read_csv('../input/ad-tech/train.csv')
train.columns

In [2]:
# Load the city -> coordinates lookup table. The helpers that read it index
# each entry with [0] for latitude and [1] for longitude.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding.
with open('../input/city-coords/city_coords.json', 'r', encoding='utf-8') as f:
    cities_coords = json.load(f)
    
def get_cities_latitide(x):
    """Return the latitude stored for city ``x`` in ``cities_coords``.

    The latitude is element ``[0]`` of the city's entry. ``0`` is returned
    when the lookup fails: unknown city (including NaN), an entry that is too
    short, an entry that is not subscriptable, or an unhashable key.
    """
    try:
        return cities_coords[x][0]
    # Only lookup failures fall back to 0. Anything else (for example a
    # NameError because the table was never loaded, or KeyboardInterrupt)
    # is allowed to propagate instead of being silently turned into 0.
    except (KeyError, IndexError, TypeError):
        return 0
    
def get_cities_longitude(x):
    """Return the longitude stored for city ``x`` in ``cities_coords``.

    The longitude is element ``[1]`` of the city's entry. ``0`` is returned
    when the lookup fails: unknown city (including NaN), an entry that is too
    short, an entry that is not subscriptable, or an unhashable key.
    """
    try:
        return cities_coords[x][1]
    # Only lookup failures fall back to 0. Anything else (for example a
    # NameError because the table was never loaded, or KeyboardInterrupt)
    # is allowed to propagate instead of being silently turned into 0.
    except (KeyError, IndexError, TypeError):
        return 0

In [3]:
def get_coordinates_from_city_str(df, column: str='city'):
    """Add ``latitude`` and ``longitude`` columns derived from a city column.

    Each value of ``df[column]`` is passed through ``get_cities_latitide`` and
    ``get_cities_longitude``. ``df`` is modified in place and returned.
    """
    # Latitude is assigned before longitude so the new columns keep that order.
    lookups = (
        ('latitude', get_cities_latitide),
        ('longitude', get_cities_longitude),
    )
    for target, lookup in lookups:
        df[target] = df[column].apply(lookup)
    return df


def drop_useless_columns(df, columns: list):
    """Return a copy of ``df`` without the listed columns.

    Columns in ``columns`` that are not present in ``df`` are ignored, so a
    DataFrame is always returned. Previously any failure was swallowed and
    the function returned ``None``, which only surfaced as an error in a
    later step.

    ``df`` itself is not modified.
    """
    return df.drop(columns=columns, errors='ignore')


def filling_category_data(df, columns: list, reference=None):
    """Fill missing values in categorical columns with the most frequent value.

    Parameters
    ----------
    df : DataFrame whose missing values are filled. It is not modified; a
        filled copy is returned (``df`` itself is returned when there is
        nothing to fill with).
    columns : names of the columns to fill.
    reference : DataFrame the modes are computed from. Defaults to the
        module-level ``train`` frame, which is what this function always
        used; pass it explicitly to avoid depending on notebook state.

    A column whose reference data has no mode (e.g. it is entirely missing)
    is left untouched instead of raising ``IndexError``.
    """
    if reference is None:
        reference = train

    fill_values = {}
    for column in columns:
        modes = reference[column].mode()
        # mode() is empty when the reference column has no non-missing value.
        if not modes.empty:
            fill_values[column] = modes.iloc[0]

    if not fill_values:
        return df
    return df.fillna(fill_values)


def filling_digital_data(df, column: list, reference=None):
    """Fill missing values in numeric columns with the column mean.

    Parameters
    ----------
    df : DataFrame whose missing values are filled. It is not modified; a
        filled copy is returned (``df`` itself is returned when there is
        nothing to fill with).
    column : name(s) of the numeric columns to fill. A list of names, or a
        single name given as a string.
    reference : DataFrame the means are computed from. Defaults to the
        module-level ``train`` frame, mirroring ``filling_category_data``.

    A column whose reference mean is undefined (all values missing) is left
    untouched.
    """
    if reference is None:
        reference = train

    # The parameter keeps its original (singular) name for compatibility;
    # normalise it to a list of column names here.
    names = [column] if isinstance(column, str) else list(column)

    fill_values = {}
    for name in names:
        # Series.mean() already returns a scalar, so no indexing is needed.
        mean_value = reference[name].mean()
        if pd.notna(mean_value):
            fill_values[name] = mean_value

    if not fill_values:
        return df
    return df.fillna(fill_values)


def get_spreed_time(df, column: str='created'):
    """Parse a timestamp column and derive calendar features from it.

    ``df[column]`` is parsed with the format ``%Y-%m-%d %H:%M:%S`` and
    replaced by the parsed datetimes. The following columns are then added,
    all computed from ``df[column]``:

    - ``year``, ``month``, ``day``, ``hour``, ``minute``, ``second``
    - ``is_weekend``: 1 for Saturday/Sunday, else 0
    - ``is_holiday``: 1 when the timestamp is contained in
      ``holidays.Russia()``, else 0

    ``df`` is modified in place and returned.
    """
    df[column] = pd.to_datetime(df[column], format="%Y-%m-%d %H:%M:%S")
    stamps = df[column].dt

    # Every feature reads the column that was passed in, rather than a
    # hard-coded 'created', so a non-default ``column`` works.
    df['year'] = stamps.year
    df['month'] = stamps.month
    df['day'] = stamps.day

    df['hour'] = stamps.hour
    df['minute'] = stamps.minute
    df['second'] = stamps.second

    # weekday(): Monday == 0 ... Sunday == 6
    df['is_weekend'] = (stamps.weekday >= 5).astype(int)

    # Build the holiday calendar once instead of once per row.
    ru_holidays = holidays.Russia()
    df['is_holiday'] = df[column].apply(lambda ts: 1 if ts in ru_holidays else 0)

    return df


def categorical_data_to_string(df, columns: list):
    """Cast every listed column of ``df`` to ``'str'``.

    The columns are replaced one at a time on ``df`` itself, and the same
    frame is returned.
    """
    for name in columns:
        as_text = df[name].astype('str')
        df[name] = as_text
    return df

In [4]:
def preprocess_dataframe(train: pd.DataFrame, test_mode: bool=False):
    """Run the preprocessing pipeline on a raw frame and return the result.

    Steps, in order (a ``----`` marker is printed after each of the first four):

    1. fill missing values in the categorical columns,
    2. derive the time features,
    3. add the city coordinates,
    4. drop the ``shift`` and ``created`` columns,
    5. cast the columns at positions ``[1:-2]`` to string.

    When ``test_mode`` is true the ``Segment`` column is dropped at the end.
    """
    separator = '----'
    fill_columns = ['gamecategory', 'subgamecategory', 'bundle', 'oblast', 'city']

    frame = filling_category_data(train, fill_columns)
    print(separator)
    frame = get_spreed_time(frame)
    print(separator)
    frame = get_coordinates_from_city_str(frame)
    print(separator)
    frame = drop_useless_columns(frame, ['shift', 'created'])
    print(separator)

    # Positional selection: skip the first column and the last two.
    string_columns = frame.columns[1:-2].tolist()
    frame = categorical_data_to_string(frame, string_columns)

    if not test_mode:
        return frame
    frame.drop('Segment', axis=1, inplace=True)
    return frame

In [23]:
# Run the preprocessing pipeline and rebind ``train`` to its result.
# NOTE: this cell is not idempotent -- the pipeline drops the 'created' column
# that its time-feature step reads, so run it once per fresh load of ``train``.
train = preprocess_dataframe(train)
train

In [None]:
# Categorical feature names are chosen by position: every column of the
# preprocessed frame except the first one and the last two.
cat_features = train.columns[1:-2].tolist()

# 80/20 split with a fixed seed; 'Segment' is the label column.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns=['Segment']),
    train['Segment'],
    test_size=0.2,
    random_state=0,
)

# Wrap both splits as CatBoost pools sharing the same categorical feature list.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)

In [None]:
# Multi-class CatBoost classifier trained on GPU: depth-2 trees, a small
# learning rate and a large iteration budget, scored by accuracy.
model = CatBoostClassifier(
    loss_function='MultiClass',
    eval_metric='Accuracy',
    iterations=600000,
    learning_rate=0.001,
    depth=2,
    task_type='GPU',
    verbose=True,
)

# Fit on the training pool and evaluate on the held-out pool; the integer
# ``verbose`` sets the logging period in iterations.
model.fit(train_pool, eval_set=test_pool, verbose=1000)

In [None]:
# Persist the fitted model to 'catboost_model.cbm' (a relative path) using the 'cbm' format.
model.save_model('catboost_model.cbm', format='cbm')