# Simple LightGBM notebook

## If you find this notebook useful, please upvote it.

### Import libraries

In [None]:
import os
from glob import glob
from pathlib import Path

import pandas as pd
from tqdm.auto import tqdm
from sklearn.model_selection import KFold

In [None]:
# https://github.com/nyk510/vivid/blob/master/vivid/utils.py
from contextlib import contextmanager
from time import time

class Timer:
    """Context manager that reports the wall-clock time spent in its ``with`` body.

    On exit the elapsed seconds are formatted with ``format_str`` and sent to
    ``logger.info`` when a logger was given, otherwise printed.

    Args:
        logger: Optional object with an ``info(str)`` method.
        format_str: ``str.format`` template receiving the elapsed seconds.
        prefix: Optional value prepended to the template, separated by ``sep``.
        suffix: Optional value appended to the template, separated by ``sep``.
        sep: Separator placed between prefix/suffix and the template.
    """

    def __init__(self, logger=None, format_str="{:.3f}[s]", prefix=None, suffix=None, sep=" "):
        if prefix:
            format_str = str(prefix) + sep + format_str
        if suffix:
            format_str = format_str + sep + str(suffix)
        self.format_str = format_str
        self.logger = logger
        self.start = None
        self.end = None

    @property
    def duration(self):
        """Elapsed seconds of the last completed run, or 0 if none has finished."""
        if self.start is None or self.end is None:
            return 0
        return self.end - self.start

    def __enter__(self):
        # Clear any previous end time so `duration` reads 0 while a re-used
        # timer is running instead of mixing the old end with the new start.
        self.end = None
        self.start = time()
        # Return the timer so `with Timer() as t:` gives access to `t.duration`.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end = time()
        out_str = self.format_str.format(self.duration)
        if self.logger:
            self.logger.info(out_str)
        else:
            print(out_str)
        # Implicitly returns None: exceptions raised in the body propagate.

### Read the sample submission CSV

In [None]:
# Directory holding the competition files, relative to the working directory.
BASE_DIR = Path("../input/smartphone-decimeter-2022")

# Sample submission; later cells read its tripId, UnixTimeMillis,
# LatitudeDegrees and LongitudeDegrees columns.
df_sample_submission = pd.read_csv(BASE_DIR.joinpath("sample_submission.csv"))

### Make dataset

In [None]:
# Every directory two levels below train/ is read as one recording.
train_folders = glob(str(BASE_DIR / "train/*/*"))

X = []
y_LatitudeDegrees = []
y_LongitudeDegrees = []

for train_folder in tqdm(train_folders):
    df_device_gnss = pd.read_csv(f"{train_folder}/device_gnss.csv").rename(
        columns={"utcTimeMillis": "UnixTimeMillis"}
    )
    # Collapse rows that share a timestamp into their per-column mean.
    # numeric_only=True is required on pandas >= 2.0, where averaging a
    # non-numeric column raises TypeError instead of silently dropping it.
    df_device_gnss = df_device_gnss.groupby("UnixTimeMillis").mean(numeric_only=True)

    df_ground_truth = pd.read_csv(
        f"{train_folder}/ground_truth.csv",
        usecols=["UnixTimeMillis", "LatitudeDegrees", "LongitudeDegrees"],
    )

    # Left join: keep every GNSS timestamp and attach the ground-truth
    # position recorded at that same timestamp.
    df_merged = pd.merge(df_device_gnss, df_ground_truth, on="UnixTimeMillis", how="left")

    X.append(df_merged.drop(columns=["LatitudeDegrees", "LongitudeDegrees"]))
    y_LatitudeDegrees.append(df_merged["LatitudeDegrees"])
    y_LongitudeDegrees.append(df_merged["LongitudeDegrees"])

# Stack all recordings into plain NumPy arrays for training.
X = pd.concat(X).values
y_LatitudeDegrees = pd.concat(y_LatitudeDegrees).values
y_LongitudeDegrees = pd.concat(y_LongitudeDegrees).values

In [None]:
# Every directory two levels below test/ is read as one recording.
test_folders = glob(str(BASE_DIR / "test/*/*"))

test_index = []
test_X = []
test_y_LatitudeDegrees = []
test_y_LongitudeDegrees = []

for test_folder in tqdm(test_folders):
    df_device_gnss = pd.read_csv(f"{test_folder}/device_gnss.csv").rename(
        columns={"utcTimeMillis": "UnixTimeMillis"}
    )
    # Collapse rows that share a timestamp into their per-column mean.
    # numeric_only=True is required on pandas >= 2.0, where averaging a
    # non-numeric column raises TypeError instead of silently dropping it.
    df_device_gnss = df_device_gnss.groupby("UnixTimeMillis").mean(numeric_only=True)

    # The tripId used by the sample submission is "<parent dir>/<leaf dir>".
    # `drive_id` replaces the former `id`, which shadowed the builtin.
    dir_name, device_name = os.path.split(test_folder)
    _, drive_id = os.path.split(dir_name)
    trip_id = f"{drive_id}/{device_name}"

    df_sample_by_tripId = df_sample_submission[df_sample_submission["tripId"] == trip_id]
    df_merged = pd.merge(
        df_device_gnss,
        df_sample_by_tripId.drop(columns=["tripId"]),
        on="UnixTimeMillis",
        how="left",
    )

    # Copy before adding a column so the write does not target a slice of
    # df_merged (avoids pandas' SettingWithCopyWarning).
    df_index = df_merged[["UnixTimeMillis"]].copy()
    df_index["tripId"] = trip_id

    test_index.append(df_index)
    test_X.append(df_merged.drop(columns=["LatitudeDegrees", "LongitudeDegrees"]))
    test_y_LatitudeDegrees.append(df_merged["LatitudeDegrees"])
    test_y_LongitudeDegrees.append(df_merged["LongitudeDegrees"])

# Stack all recordings; test_index keeps (UnixTimeMillis, tripId) per row of test_X.
test_index = pd.concat(test_index).reset_index(drop=True)
test_X = pd.concat(test_X).values
test_y_LatitudeDegrees = pd.concat(test_y_LatitudeDegrees).values
test_y_LongitudeDegrees = pd.concat(test_y_LongitudeDegrees).values

### Data split

In [None]:
# Shuffled 5-fold split with a fixed seed. The (train, valid) index pairs are
# materialised into a list so the same folds can be iterated more than once.
fold = KFold(n_splits=5, shuffle=True, random_state=510)
cv = list(fold.split(X, y_LatitudeDegrees))

### LightGBM

In [None]:
from sklearn.metrics import mean_squared_error
import lightgbm as lgbm
import numpy as np

def fit_lgbm(X, 
             y, 
             cv, 
             params: dict=None, 
             verbose: int=50):
    """Train one ``LGBMRegressor`` per fold and collect out-of-fold predictions.

    Args:
        X: Feature array, indexable by integer index arrays.
        y: Target array aligned with ``X``.
        cv: Iterable of ``(idx_train, idx_valid)`` index pairs.
        params: Keyword arguments for ``lgbm.LGBMRegressor``; ``None`` means
            library defaults.
        verbose: Evaluation results are logged every ``verbose`` boosting
            rounds; a falsy value disables the logging.

    Returns:
        ``(oof_pred, models)``: ``oof_pred[i]`` is the prediction for row ``i``
        from the model whose validation fold contained it (rows that appear in
        no validation fold stay 0), and ``models`` holds the fitted regressors
        in fold order.
    """
    if params is None:
        params = {}

    models = []
    n_records = len(X)
    # float64 on purpose: float32 keeps only ~7 significant digits, which
    # truncates coordinates expressed in degrees to roughly metre resolution.
    oof_pred = np.zeros((n_records,), dtype=np.float64)

    for i, (idx_train, idx_valid) in enumerate(cv): 
        x_train, y_train = X[idx_train], y[idx_train]
        x_valid, y_valid = X[idx_valid], y[idx_valid]

        clf = lgbm.LGBMRegressor(**params)

        # Early stopping and periodic logging go through callbacks: the
        # `early_stopping_rounds=` / `verbose=` arguments of `fit` were
        # removed in LightGBM 4.0.
        callbacks = [
            lgbm.early_stopping(stopping_rounds=100, verbose=bool(verbose)),
            lgbm.log_evaluation(period=verbose),
        ]

        with Timer(prefix="fit fold={} ".format(i)):
            clf.fit(x_train, y_train, 
                    eval_set=[(x_valid, y_valid)],  
                    callbacks=callbacks)
            
        pred_i = clf.predict(x_valid)
        oof_pred[idx_valid] = pred_i
        models.append(clf)
        score = mean_squared_error(y_valid, pred_i)
        print(f" - fold{i + 1} - {score:.10f}")

    # Mean squared error over all out-of-fold predictions.
    score = mean_squared_error(y, oof_pred)
    print(f"{score:.10f}")
    return oof_pred, models

In [None]:
# Latitude and longitude are fitted as two separate regressions on the same
# features and the same folds.
oof_LatitudeDegrees, models_LatitudeDegrees = fit_lgbm(X, y_LatitudeDegrees, cv)
oof_LongitudeDegrees, models_LongitudeDegrees = fit_lgbm(X, y_LongitudeDegrees, cv)

### Predict

In [None]:
# For each target, predict with every fold model and average across models.
pred_LatitudeDegrees = np.array(
    [model.predict(test_X) for model in models_LatitudeDegrees]
).mean(axis=0)
pred_LongitudeDegrees = np.array(
    [model.predict(test_X) for model in models_LongitudeDegrees]
).mean(axis=0)

# Put the (UnixTimeMillis, tripId) index columns next to the predictions.
df_res = pd.concat(
    [
        test_index,
        pd.DataFrame(
            {
                "LatitudeDegrees": pred_LatitudeDegrees,
                "LongitudeDegrees": pred_LongitudeDegrees,
            }
        ),
    ],
    axis=1,
)

### Save submission.csv

In [None]:
# Arrange the columns in submission order and sort rows by trip, then time.
df_res = (
    df_res.reindex(columns=["tripId", "UnixTimeMillis", "LatitudeDegrees", "LongitudeDegrees"])
    .sort_values(["tripId", "UnixTimeMillis"])
    .reset_index(drop=True)
)
# to_csv returns None when given a path, so its result must not be assigned
# back to df_res (the original did, leaving df_res as None after this cell).
df_res.to_csv("submission.csv", index=False)

# Thanks for reading!