# Delta Prediction

This notebook demonstrates one of our team's solution methods, "Delta Prediction".

## Acknowledgements  
I received a lot of advice on this solution from my teammates [@tomo20180402](https://www.kaggle.com/tomo20180402), [@Masaya](https://www.kaggle.com/irrohas) and [@koji](https://www.kaggle.com/minomonter).  
Thank you for your help in creating this solution.  

## Overview
This is a LightGBM model whose target variable is the difference between the ground truth and the baseline prediction (delta_latDeg, delta_lngDeg); the predicted difference is then added back to the baseline position.  
It was an effective method for downtown and tree areas, but it did not work well for highways.  
Therefore, it was very powerful on the Public leaderboard, but not as effective on the Private leaderboard.

### Score  
* Public : 　6.665 → 5.162　 ▲1.503  
* Private: 　5.098 → 4.687 　▲0.411  
* CV　　: 　4.879 → 4.215 　▲0.664

### Key Points  
* Examples of feature values  
  1. Location information of baseline_data
  2. Difference and statistics between previous and next point
  3. phoneName
* We applied this process twice in the pipeline.  
  1. The first one is before post-processing such as kf and phone_mean.  
  1. The second one is after post-processing.
* We split the data into 5 folds by phone to build the cross-validation (CV) set. We actually wanted to split by "collectionName", but that did not give a good score.  
  This CV scheme may have caused overfitting.


In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import copy
import plotly.express as px
import plotly.graph_objects as go
import pyproj
import json
import bisect
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
from matplotlib.colors import ListedColormap, BoundaryNorm
import pickle
import random
from tqdm.notebook import tqdm
import lightgbm as lgb
import warnings
warnings.simplefilter('ignore')
pd.set_option('display.max_rows',30)
pd.set_option('display.max_columns',None)

In [None]:
def calc_haversine(lat1, lon1, lat2, lon2):
    """Haversine (great-circle) distance between two points given in degrees.

    Scalars and array-likes are both accepted; the arithmetic is done with
    numpy ufuncs, so element-wise broadcasting applies. The result is
    expressed in the same unit as ``RADIUS`` (presumably metres -- the value
    is close to the Earth's radius in metres).
    """
    RADIUS = 6_367_000
    phi1, lam1, phi2, lam2 = (np.radians(angle) for angle in (lat1, lon1, lat2, lon2))
    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1
    # Haversine term: sin^2(dphi/2) + cos(phi1) * cos(phi2) * sin^2(dlam/2)
    chord = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam/2)**2
    return 2 * RADIUS * np.arcsin(chord**0.5)

def percentile50(x):
    """Return the 50th percentile of ``x``.

    Kept as a named function because ``groupby(...).agg`` uses the function
    name ("percentile50") as the resulting column label.
    """
    return np.percentile(x, q=50)
def percentile95(x):
    """Return the 95th percentile of ``x``.

    Kept as a named function because ``groupby(...).agg`` uses the function
    name ("percentile95") as the resulting column label.
    """
    return np.percentile(x, q=95)

def get_train_score(df, gt):
    """Score predictions against ground truth.

    Predictions (``df``) and ground truth (``gt``) are inner-joined on
    (collectionName, phoneName, millisSinceGpsEpoch). For every phone
    (``collectionName + '_' + phoneName``) the 50th and 95th percentile of the
    haversine error are averaged, and the mean of those per-phone values is
    returned.
    """
    join_keys = ['collectionName', 'phoneName', 'millisSinceGpsEpoch']
    truth = gt.rename(columns={'latDeg':'latDeg_gt', 'lngDeg':'lngDeg_gt'})
    paired = df.merge(truth, on=join_keys, how='inner')

    # Distance between prediction and ground truth for every matched epoch.
    paired['err'] = calc_haversine(
        paired['latDeg_gt'], paired['lngDeg_gt'], paired['latDeg'], paired['lngDeg'])

    # Aggregate per phone.
    paired['phone'] = paired['collectionName'] + '_' + paired['phoneName']
    per_phone = paired.groupby('phone')['err'].agg([percentile50, percentile95])
    # NOTE: despite the "p90" in the label, this is the mean of p50 and p95.
    per_phone['p50_p90_mean'] = (per_phone['percentile50'] + per_phone['percentile95']) / 2
    return per_phone['p50_p90_mean'].mean()

In [None]:
def eval_all(df_pred, df_gt):
    """Evaluate predictions per collection and over the whole data set.

    Returns a DataFrame with columns ``collection``, ``haversine`` and
    ``score``: one row per collection found in ``df_gt`` (sorted by name)
    followed by a final row labelled ``'all'``.

    * ``haversine`` is the mean distance between each ground-truth epoch and
      the nearest-in-time prediction of the same phone (``merge_asof`` with a
      tolerance of 100000 on ``millisSinceGpsEpoch``).
    * ``score`` is the value returned by ``get_train_score``.
    """
    compared_cols = ["latDeg_truth","lngDeg_truth","latDeg_pred","lngDeg_pred"]
    phone_keys = ["collectionName", "phoneName"]

    def _mean_distance(pred, truth, reorder):
        # merge_asof needs both sides sorted on the "on" key.
        matched = pd.merge_asof(
            truth.sort_values('millisSinceGpsEpoch'),
            pred.sort_values('millisSinceGpsEpoch'),
            on="millisSinceGpsEpoch", by=phone_keys,
            direction='nearest', tolerance=100000, suffixes=('_truth', '_pred'))
        if reorder:
            matched = matched.sort_values(
                by=phone_keys + ["millisSinceGpsEpoch"], ignore_index=True)
        return calc_haversine(*matched[compared_cols].to_numpy().transpose()).mean()

    rows = []
    for collection in sorted(df_gt['collectionName'].unique()):
        pred_part = df_pred[df_pred['collectionName'] == collection]
        truth_part = df_gt[df_gt['collectionName'] == collection]
        part_score = get_train_score(pred_part, truth_part)
        rows.append([collection, _mean_distance(pred_part, truth_part, reorder=True), part_score])

    # Same two metrics computed over every collection at once.
    overall_score = get_train_score(df_pred, df_gt)
    rows.append(['all', _mean_distance(df_pred, df_gt, reorder=False), overall_score])

    return pd.DataFrame(rows, columns=['collection', 'haversine', 'score'])

In [None]:
def get_features(df):
    """Add lag/lead position-difference features to ``df`` (in place) and return it.

    For every phone (grouped by collectionName and phoneName) and every step
    1..50 the difference between the current latDeg/lngDeg and the value
    ``step`` rows earlier ("lag") or later ("lead") is stored; differences
    that run past either end of a phone's track are set to 0. Row-wise mean
    and median summaries of those differences are then added:

    * ``*_dist_laglead{w}_mean|median`` over lags and leads 1..w, w = 1..25
    * ``*_dist_lag{w}_mean|median`` and ``*_dist_lead{w}_mean|median`` over
      steps 1..w, w = 2..50
    """
    phone_keys = ["collectionName", "phoneName"]
    coords = ("latDeg", "lngDeg")
    # The grouping does not change while columns are appended, so build it once.
    grouped = {coord: df.groupby(phone_keys)[coord] for coord in coords}

    for step in range(1, 51):
        for direction, offset in (("lag", step), ("lead", -step)):
            for coord in coords:
                shifted = grouped[coord].shift(offset)
                df[f"{coord}_dist_{direction}{step}"] = (df[coord] - shifted).fillna(0)

    for window in range(1, 26):
        steps = range(1, window + 1)
        for stat in ("mean", "median"):
            for coord in coords:
                source_cols = ([f"{coord}_dist_lag{num}" for num in steps]
                               + [f"{coord}_dist_lead{num}" for num in steps])
                df[f"{coord}_dist_laglead{window}_{stat}"] = getattr(df[source_cols], stat)(axis=1)

    for window in range(2, 51):
        steps = range(1, window + 1)
        for direction in ("lag", "lead"):
            for stat in ("mean", "median"):
                for coord in coords:
                    source_cols = [f"{coord}_dist_{direction}{num}" for num in steps]
                    df[f"{coord}_dist_{direction}{window}_{stat}"] = getattr(df[source_cols], stat)(axis=1)

    return df


def get_haversine_kf(df_before, df_after):
    """Per-epoch coordinate shift between two versions of the same predictions.

    Both frames are inner-joined on (collectionName, phoneName,
    millisSinceGpsEpoch, phone). The result holds the three key columns plus
    ``delta_lat_kf`` / ``delta_lng_kf`` (after minus before, in degrees);
    missing values are replaced by 0. Despite the function name, no haversine
    distance is computed here.
    """
    key_cols = ['collectionName', 'phoneName', 'millisSinceGpsEpoch']
    feature_cols = ["delta_lat_kf", "delta_lng_kf"]

    merged = pd.merge(
        df_before.rename(columns={"latDeg":"latDeg_before", "lngDeg":"lngDeg_before"}),
        df_after.rename(columns={"latDeg":"latDeg_after", "lngDeg":"lngDeg_after"}),
        on=key_cols + ['phone'],
    )
    merged["delta_lat_kf"] = merged["latDeg_after"] - merged["latDeg_before"]
    merged["delta_lng_kf"] = merged["lngDeg_after"] - merged["lngDeg_before"]

    return merged.fillna(0)[key_cols + feature_cols]

In [None]:
class FeatureGenerator:
    """Builds the model matrix for one region (a set of collections).

    ``generate(..., is_train=True)`` fits the encoders (latitude/longitude bin
    edges and the one-hot column layouts) on the training rows and stores them
    on the instance; ``generate(..., is_train=False)`` re-applies them so that
    the inference frame has exactly the same feature columns, in the same
    order, as the training frame.
    """

    # Number of equal-width bins used to discretise latitude and longitude.
    N_BINS = 1000

    def __init__(self, tomo_features):
        # Names of the pre-computed feature columns copied through unchanged.
        self.tomo_features = tomo_features
        # Bin edges learnt from the training rows.
        self.bins_lat = None
        self.bins_lng = None
        # One-hot column layouts learnt from the training rows.
        self.original_lat_cols = None
        self.original_lng_cols = None
        self.original_phone_cols = None

    @staticmethod
    def _bin_dummies(bin_index, prefix):
        """One-hot encode bin numbers as ``<prefix><bin>`` columns.

        Rows whose bin is NaN (value outside the fitted bin range) are left
        out here; the caller re-aligns the result to the full index with
        zeros.
        """
        valid = bin_index.dropna()
        labels = prefix + valid.astype(np.int64).astype(str)
        return pd.get_dummies(labels, dtype=np.uint8)

    def generate(self, df_input, routes, is_train=True):
        """Return the feature frame for the rows of ``df_input`` in ``routes``.

        Parameters
        ----------
        df_input : DataFrame
            Must contain the key columns, the ``tomo_features`` columns and
            either ``latDeg_basepred``/``lngDeg_basepred``, ``dist``,
            ``delta_lat``, ``delta_lng`` and ``fold`` (training) or
            ``latDeg``/``lngDeg`` (inference).
        routes : list
            Collection names belonging to this region.
        is_train : bool
            When True the encoders are (re)fitted and the target/fold columns
            are appended; rows with ``dist`` > 100 are discarded first.

        Returns
        -------
        DataFrame with key columns, latDeg/lngDeg, phone one-hot columns,
        latitude-bin and longitude-bin one-hot columns, ``tomo_features``,
        (training only) delta_lat/delta_lng/fold, and a ``flg`` column
        ('train' or 'test'). Remaining NaNs are replaced by 0.

        Raises
        ------
        RuntimeError
            If called with ``is_train=False`` before any training call.
        """
        # Work on a private copy so that adding helper columns below never
        # touches (or warns about) the caller's frame.
        df = df_input[df_input['collectionName'].isin(routes)].copy()

        if is_train:
            # Discard rows where the baseline is more than 100 away from the
            # ground truth. Rows with a NaN distance are kept (their NaN
            # targets become 0 in the final fillna).
            df = df[~(df['dist'] > 100)].reset_index(drop=True)
            df = df.rename(columns={'latDeg_basepred': 'latDeg',
                                    'lngDeg_basepred': 'lngDeg'})
            _, self.bins_lat = pd.cut(df['latDeg'], bins=self.N_BINS, retbins=True)
            _, self.bins_lng = pd.cut(df['lngDeg'], bins=self.N_BINS, retbins=True)
        elif self.bins_lat is None or self.bins_lng is None:
            raise RuntimeError(
                "FeatureGenerator.generate(is_train=False) called before the "
                "encoders were fitted with is_train=True")

        df['lat_flg'] = pd.cut(df['latDeg'], self.bins_lat, labels=False)
        df['lng_flg'] = pd.cut(df['lngDeg'], self.bins_lng, labels=False)

        df_lat_flg_categorized = self._bin_dummies(df['lat_flg'], 'lat')
        df_lng_flg_categorized = self._bin_dummies(df['lng_flg'], 'lng')
        df_phone_categorized = pd.get_dummies(df['phoneName'], dtype=np.uint8)

        if is_train:
            self.original_lat_cols = df_lat_flg_categorized.columns
            self.original_lng_cols = df_lng_flg_categorized.columns
            self.original_phone_cols = df_phone_categorized.columns

        # Align every one-hot block to the layout learnt during training:
        # unseen categories are dropped, categories absent from this frame are
        # added as all-zero columns, and -- importantly -- the column ORDER is
        # the training order, since the model is fed columns positionally.
        df_lat_flg_categorized = df_lat_flg_categorized.reindex(
            index=df.index, columns=self.original_lat_cols, fill_value=0)
        df_lng_flg_categorized = df_lng_flg_categorized.reindex(
            index=df.index, columns=self.original_lng_cols, fill_value=0)
        df_phone_categorized = df_phone_categorized.reindex(
            columns=self.original_phone_cols, fill_value=0)

        df_list = [
            df[['collectionName', 'phoneName', 'millisSinceGpsEpoch']],
            df[['latDeg', 'lngDeg']],
            df_phone_categorized,
            df_lat_flg_categorized,
            df_lng_flg_categorized,
            df[self.tomo_features],
        ]
        if is_train:
            df_list.append(df[['delta_lat', 'delta_lng', 'fold']])

        df = pd.concat(df_list, axis=1)
        df['flg'] = 'train' if is_train else 'test'
        df = df.fillna(0)

        return df

In [None]:
class DeltaPredict:
    """Trains and applies per-region LightGBM models that predict the offset
    (delta_lat, delta_lng) between a baseline position and the ground truth.

    One pair of models (latitude, longitude) is trained per region and per
    cross-validation fold. Out-of-fold predictions correct the training
    positions; the average over all fold models corrects the test positions.

    Relies on names defined elsewhere in the notebook: the collection lists
    ``tree_1`` and ``downtowns`` (read in ``__init__``), ``target_set`` (read
    in ``train_and_predict``), and the helpers ``get_features`` and
    ``calc_haversine``.
    """

    # Columns identifying one epoch of one phone.
    KEY_COLS = ['collectionName', 'phoneName', 'millisSinceGpsEpoch']
    # Regions that get their own models, in processing order.
    REGIONS = ['downtown', 'tree_1']

    def __init__(self, lgbm_params, df_gt, df_train, df_test, is_second=False, apply_pseudo_label=False,
                 df_pred_test=None, df_pseudo_label_test=None, early_stopping_rounds=None):
        """
        Parameters
        ----------
        lgbm_params : dict
            Parameters passed to ``lgb.train``.
        df_gt : DataFrame
            Ground-truth positions.
        df_train, df_test : DataFrame
            Pre-computed feature frames; must contain ``latDeg``, ``lngDeg``
            and ``phone`` (these three columns are dropped).
        is_second : bool
            When True, ``get_features`` is applied to both frames first
            (note that it adds its columns to the passed frames in place).
        apply_pseudo_label : bool
            When True, ``df_pred_test`` / ``df_pseudo_label_test`` are
            appended to the training predictions / ground truth.
        early_stopping_rounds : int or None
            Early-stopping patience; None disables early stopping.
        """
        # Non-feature columns removed from the training matrix.
        self.base_features = ['collectionName', 'phoneName', 'millisSinceGpsEpoch', 'fold', 'delta_lat', 'delta_lng', 'flg']

        self.df_gt = df_gt
        self.lgbm_params = lgbm_params
        self.early_stopping_rounds = early_stopping_rounds

        self.apply_pseudo_label = apply_pseudo_label
        self.df_pred_test = df_pred_test
        self.df_pseudo_label_test = df_pseudo_label_test

        dropped_cols = ['latDeg', 'lngDeg', 'phone']
        if is_second:
            self.df_train_tomo = get_features(df_train).drop(dropped_cols, axis=1)
            self.df_test_tomo = get_features(df_test).drop(dropped_cols, axis=1)
        else:
            self.df_train_tomo = df_train.drop(dropped_cols, axis=1)
            self.df_test_tomo = df_test.drop(dropped_cols, axis=1)

        not_features = {
            'collectionName',
            'phoneName',
            'millisSinceGpsEpoch',
            'heightAboveWgs84EllipsoidM',
            'latDeg_lag1',
            'lngDeg_lag1',
        }
        # Keep the frame's own column order (instead of set arithmetic) so the
        # feature order -- and therefore the trained model -- is reproducible
        # from run to run.
        self.tomo_features = list(dict.fromkeys(
            col for col in self.df_train_tomo.columns if col not in not_features))

        # `tree_1` and `downtowns` are module-level lists of collection names.
        self.routes_dict = {
            'tree_1': tree_1,
            'downtown': downtowns
        }
        self.feature_generator_dict = {
            'tree_1': FeatureGenerator(self.tomo_features),
            'downtown': FeatureGenerator(self.tomo_features),
        }
        # region name -> {fold -> {'lat': booster, 'lng': booster}}
        self.models_dict_dict = {}

    def _fit_booster(self, X_train, y_train, X_valid, y_valid):
        """Train one LightGBM booster, evaluated on the train and validation sets.

        Early stopping and log silencing are expressed as callbacks: the
        ``early_stopping_rounds`` / ``verbose_eval`` keyword arguments of
        ``lgb.train`` no longer exist in LightGBM >= 4.0.
        """
        train_set = lgb.Dataset(X_train, y_train)
        valid_set = lgb.Dataset(X_valid, y_valid)

        callbacks = []
        if self.early_stopping_rounds is not None and self.early_stopping_rounds > 0:
            callbacks.append(lgb.early_stopping(self.early_stopping_rounds))
        if hasattr(lgb, 'log_evaluation'):
            # period=0 disables the per-iteration metric log.
            callbacks.append(lgb.log_evaluation(period=0))

        return lgb.train(self.lgbm_params,
                         train_set,
                         valid_sets=[train_set, valid_set],
                         callbacks=callbacks)

    def train_model(self, df_input, routes_name):
        """Train the fold models of one region.

        Returns ``(models_dict, df_valid)`` where ``models_dict`` maps each
        fold value to ``{'lat': booster, 'lng': booster}`` and ``df_valid``
        holds the out-of-fold predictions: key columns, latDeg, lngDeg and the
        predicted ``delta_lat`` / ``delta_lng``.
        """
        routes = self.routes_dict[routes_name]
        df_train = self.feature_generator_dict[routes_name].generate(df_input, routes, is_train=True)

        valid_cols = self.KEY_COLS + ['latDeg', 'lngDeg', 'delta_lat', 'delta_lng']
        valid_parts = []
        models_dict = {}

        for fold in df_train['fold'].unique():
            in_fold = df_train['fold'] == fold

            X_train = df_train[~in_fold].drop(self.base_features, axis=1)
            y_train = df_train.loc[~in_fold, ['delta_lat', 'delta_lng']]

            X_valid = df_train[in_fold].drop(self.base_features, axis=1)
            # Explicit copy: the delta columns are overwritten with predictions below.
            y_valid = df_train.loc[in_fold, valid_cols].copy()

            model_lat = self._fit_booster(X_train, y_train['delta_lat'], X_valid, y_valid['delta_lat'])
            model_lng = self._fit_booster(X_train, y_train['delta_lng'], X_valid, y_valid['delta_lng'])

            y_valid['delta_lat'] = model_lat.predict(X_valid)
            y_valid['delta_lng'] = model_lng.predict(X_valid)
            valid_parts.append(y_valid)

            models_dict[fold] = {'lat': model_lat, 'lng': model_lng}

        df_valid = pd.concat(valid_parts) if valid_parts else pd.DataFrame()
        return models_dict, df_valid

    def predict_test_data(self, df_input, routes_name):
        """Predict the deltas of one region as the mean over all its fold models.

        Returns a frame with the key columns plus ``delta_lat`` and
        ``delta_lng``. Works for any number / naming of folds.

        Raises
        ------
        RuntimeError
            If no model has been trained for ``routes_name``.
        """
        routes = self.routes_dict[routes_name]
        df_test = self.feature_generator_dict[routes_name].generate(df_input, routes, is_train=False)

        models_dict = self.models_dict_dict[routes_name]
        if not models_dict:
            raise RuntimeError(f"no trained models available for region {routes_name!r}")

        # The feature matrix is identical for every fold model.
        X_test = df_test.drop(self.KEY_COLS + ['flg'], axis=1)

        lat_total = np.zeros(len(X_test))
        lng_total = np.zeros(len(X_test))
        for fold_models in models_dict.values():
            lat_total = lat_total + np.asarray(fold_models['lat'].predict(X_test))
            lng_total = lng_total + np.asarray(fold_models['lng'].predict(X_test))

        df_delta_test = df_test[self.KEY_COLS].copy()
        df_delta_test['delta_lat'] = lat_total / len(models_dict)
        df_delta_test['delta_lng'] = lng_total / len(models_dict)
        return df_delta_test

    def train_and_predict(self, df_input):
        """Train all region models and return corrected training positions.

        ``df_input`` holds the baseline predictions (phone, key columns,
        latDeg, lngDeg). The returned frame has one row per row of
        ``df_input`` with latDeg/lngDeg shifted by the out-of-fold predicted
        deltas; rows outside the modelled regions (or dropped from training)
        keep their baseline position.

        Reads the fold assignment from
        ``../input/get-cross-validation-set/fold.csv`` using the module-level
        ``target_set`` to pick the fold column.
        """
        key_cols = self.KEY_COLS
        df = df_input[['phone', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg', 'collectionName', 'phoneName']].copy()
        df_train_epoch = df_input[key_cols]

        if self.apply_pseudo_label:
            df_all = pd.concat([df, self.df_pred_test], axis=0)
            df_gt_all = pd.concat([self.df_gt, self.df_pseudo_label_test], axis=0)
            df_tomo_all = pd.concat([self.df_train_tomo, self.df_test_tomo], axis=0)
        else:
            df_all = df
            df_gt_all = self.df_gt
            df_tomo_all = self.df_train_tomo

        # Attach ground truth; the overlapping latDeg/lngDeg columns become
        # *_basepred (baseline) and *_truth.
        df_all = df_all.merge(df_gt_all[key_cols + ['latDeg', 'lngDeg']],
                              on=key_cols,
                              how='left', suffixes=('_basepred', '_truth'))
        df_all['dist'] = calc_haversine(*df_all[['latDeg_basepred', 'lngDeg_basepred', 'latDeg_truth', 'lngDeg_truth']].values.T)
        df_all['delta_lat'] = df_all['latDeg_truth'] - df_all['latDeg_basepred']
        df_all['delta_lng'] = df_all['lngDeg_truth'] - df_all['lngDeg_basepred']
        df_all = df_all.merge(df_tomo_all, on=key_cols, how='left')
        df_all['phone'] = df_all['collectionName'] + '_' + df_all['phoneName']
        random.seed(0)

        fold_df = pd.read_csv("../input/get-cross-validation-set/fold.csv")[key_cols + [f'fold__{target_set}']]
        fold_df = fold_df.rename(columns={f"fold__{target_set}":"fold"})
        df_all = df_all.merge(fold_df, on=key_cols, how="left")

        valid_list = []
        for region in tqdm(self.REGIONS, desc='LightGBM (Training & Inference)'):
            print("***" + region + "***")
            self.models_dict_dict[region], df_pred_valid_region = self.train_model(df_all, region)
            valid_list.append(df_pred_valid_region)

        # Apply each region's out-of-fold deltas to the baseline positions.
        df_all = df_all.rename(columns={'latDeg_basepred': 'latDeg', 'lngDeg_basepred': 'lngDeg'})
        for df_delta in valid_list:
            df_tmp = df_all.merge(
                df_delta, on=key_cols + ['latDeg', 'lngDeg'],
                how='left', suffixes=('_origin', ''))
            df_tmp['latDeg'] = df_tmp['latDeg'] + df_tmp['delta_lat'].fillna(0)
            df_tmp['lngDeg'] = df_tmp['lngDeg'] + df_tmp['delta_lng'].fillna(0)
            df_all = df_tmp[['phone', 'collectionName', 'phoneName', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg']]

        df_deltapred_train = df_train_epoch.merge(df_all, on=key_cols, how='left')

        return df_deltapred_train

    def predict(self, df_input):
        """Return the test predictions shifted by the predicted deltas.

        ``train_and_predict`` must have been called first so that the region
        models exist. The result has the columns phone, collectionName,
        phoneName, millisSinceGpsEpoch, latDeg, lngDeg.
        """
        key_cols = self.KEY_COLS
        df = df_input[['phone', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg', 'collectionName', 'phoneName']].copy()

        df = df.merge(self.df_test_tomo, on=key_cols, how='left')

        test_list = []
        for region in tqdm(self.REGIONS, desc='LightGBM (Inference)'):
            print("***" + region + "***")
            df_pred_test_region = self.predict_test_data(df, region)
            test_list.append(df_pred_test_region)

        for df_delta in test_list:
            df_tmp = df.merge(df_delta, on=key_cols, how='left')
            df_tmp['latDeg'] = df_tmp['latDeg'] + df_tmp['delta_lat'].fillna(0)
            df_tmp['lngDeg'] = df_tmp['lngDeg'] + df_tmp['delta_lng'].fillna(0)
            df = df_tmp[['phone', 'collectionName', 'phoneName', 'millisSinceGpsEpoch', 'latDeg', 'lngDeg']].copy()

        # Builtin `float` (== float64): the `np.float` alias was removed in NumPy 1.24.
        df['latDeg'] = df['latDeg'].astype(float)
        df['lngDeg'] = df['lngDeg'].astype(float)
        df['millisSinceGpsEpoch'] = df['millisSinceGpsEpoch'].astype(np.int64)

        return df

    def add_features(self, df_add, is_train=True):
        """Inner-join extra columns onto the stored train (or test) feature frame.

        ``df_add`` must have the same number of rows as the stored frame.

        NOTE(review): ``self.tomo_features`` (the list of columns the
        FeatureGenerators select) is computed in ``__init__`` and is not
        updated here, so columns added through this method do not appear to
        reach the models -- confirm whether that is intended.
        """
        if is_train:
            assert len(self.df_train_tomo) == len(df_add), f"mismatch length {len(self.df_train_tomo)} - {len(df_add)}"
            self.df_train_tomo = pd.merge(self.df_train_tomo, df_add, on=self.KEY_COLS)
        else:
            assert len(self.df_test_tomo) == len(df_add), f"mismatch length {len(self.df_test_tomo)} - {len(df_add)}"
            self.df_test_tomo = pd.merge(self.df_test_tomo, df_add, on=self.KEY_COLS)

In [None]:
# Region label of every collection, for the train and test sets.
with open('../input/region-classification/region_type_train.json') as f:
    region_type_train = json.load(f)
with open('../input/region-classification/region_type_test.json') as f:
    region_type_test = json.load(f)


def _collections_where(matches):
    """Collection names (train first, then test) whose region label satisfies ``matches``."""
    return [
        name
        for region_types in (region_type_train, region_type_test)
        for name, label in region_types.items()
        if matches(label)
    ]


downtowns = _collections_where(lambda label: 'downtown' in label)
trees = _collections_where(lambda label: 'tree' in label)
# Everything that is neither a tree nor a downtown collection.
highways = _collections_where(lambda label: 'tree' not in label and 'downtown' not in label)
tree_1 = _collections_where(lambda label: 'tree_1' in label)
downtown_tree_1 = downtowns + tree_1

In [None]:
def _read_phone_csv(csv_path):
    """Read a prediction CSV and split its ``phone`` column ("<collection>_<phone>")
    into ``collectionName`` and ``phoneName``."""
    frame = pd.read_csv(csv_path)
    frame['collectionName'] = frame['phone'].apply(lambda x: x.split('_')[0])
    frame['phoneName'] = frame['phone'].apply(lambda x: x.split('_')[1])
    return frame


# Ground truth: one ground_truth.csv per phone directory under train/.
datapath = Path("../input/google-smartphone-decimeter-challenge/")
ground_truths = (datapath / "train").rglob("ground_truth.csv")
df_gt = pd.concat([pd.read_csv(filepath) for filepath in ground_truths], ignore_index=True)

# Baseline predictions that the delta models will correct.
df_pred_train = _read_phone_csv('../input/gnss-ensembled/train_submission_filtered.csv')
df_pred_test = _read_phone_csv('../input/gnss-ensembled/submission_filtered.csv')

# Pre-computed feature frames.
# NOTE: read_pickle can execute arbitrary code; only load files from a trusted source.
df_train_tomo = pd.read_pickle('../input/210723-get-feature-for-delta-pred-1/train_df.pkl')
df_test_tomo = pd.read_pickle('../input/210723-get-feature-for-delta-pred-1/test_df.pkl')

# Earlier submission, available as pseudo labels for the test set.
df_pseudo_label_test = _read_phone_csv('../input/210711-delta-pred-3-my-get-mean-over-phones/submission.csv')

In [None]:
# Candidate LightGBM parameter sets; the cells below pass `lgbm_params_3`
# (library defaults apart from the seed) to the predictor.
lgbm_params_1 = dict(
    objective="regression",
    metric="rmse",
    max_depth=-1,
    learning_rate=0.1,
    random_state=2021,
    n_estimators=10000,
    verbose=-1,
)
lgbm_params_2 = dict(
    objective="regression",
    metric="rmse",
    random_state=2021,
)
lgbm_params_3 = dict(random_state=2021)

# Selects the fold column (`fold__<target_set>`) read from fold.csv.
target_set = 1

# Training

In [None]:
# First-pass predictor: uses the pre-computed features as they are (is_second=False).
delta_predictor_1 = DeltaPredict(
    lgbm_params=lgbm_params_3,
    df_gt=df_gt,
    df_train=df_train_tomo,
    df_test=df_test_tomo,
    is_second=False,
)

In [None]:
# Train the region models and correct the training predictions with the
# out-of-fold deltas; a copy is passed so `df_pred_train` stays untouched.
df_pred_train_processed = delta_predictor_1.train_and_predict(df_pred_train.copy())

## Before Delta Prediction

In [None]:
# Scores of the baseline predictions, restricted to downtown / tree_1 collections
# (the summary row 'all' is filtered out by the same mask).
df_eval = eval_all(df_pred_train, df_gt)
df_eval.loc[df_eval['collection'].isin(downtown_tree_1)]

## After Delta Prediction

In [None]:
# Scores after delta prediction, restricted to downtown / tree_1 collections
# (the summary row 'all' is filtered out by the same mask).
df_eval = eval_all(df_pred_train_processed, df_gt)
df_eval.loc[df_eval['collection'].isin(downtown_tree_1)]

# Predict and Submission

In [None]:
# Correct the test predictions with the averaged fold models; a copy is passed
# so `df_pred_test` stays untouched.
df_pred_test_processed = delta_predictor_1.predict(df_pred_test.copy())

In [None]:
# Write the corrected test positions in the layout of the sample submission.
sub = pd.read_csv('../input/google-smartphone-decimeter-challenge/sample_submission.csv')

# The coordinates are attached by row position, so first make sure both frames
# list exactly the same (phone, epoch) pairs in the same order -- otherwise the
# file would silently contain positions assigned to the wrong rows.
_submission_keys = ['phone', 'millisSinceGpsEpoch']
assert len(sub) == len(df_pred_test_processed), (
    f"row count mismatch: sample_submission has {len(sub)} rows, "
    f"predictions have {len(df_pred_test_processed)}")
assert (sub[_submission_keys].to_numpy()
        == df_pred_test_processed[_submission_keys].to_numpy()).all(), (
    "predictions are not in the same (phone, millisSinceGpsEpoch) order as sample_submission")

sub = sub.assign(
    latDeg=df_pred_test_processed['latDeg'].to_numpy(),
    lngDeg=df_pred_test_processed['lngDeg'].to_numpy(),
)
sub.to_csv('submission.csv', index=False)