# Investigating outliers
In this notebook, we investigate outliers by analysing the results of the benchmark strategy and of a strategy with an RMSPE of 0.21693. The aim is to identify the outliers that affect both training and prediction.

**Note:** In this notebook, we skip the `row_id`s that have no trade data.

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the strategy results and the benchmark results from their CSV files.
df = pd.read_csv('../input/optiver-005c-result/005c_result.csv')
df_ = pd.read_csv('../input/optiver-005c-result/000-benchmark.csv')

# Attach the benchmark's `pred` to every strategy row via `row_id`.
# validate='one_to_one' makes the merge raise if `row_id` is duplicated on
# either side. When both frames carry a `pred` column pandas suffixes them as
# `pred_x` (left) / `pred_y` (right); the rename maps those to `pred` / `benchmark`.
df = (
    pd.merge(df, df_[['row_id', 'pred']], on='row_id', how='inner', validate='one_to_one')
    .rename(columns={'pred_x': 'pred', 'pred_y': 'benchmark'})
)
df.head()

# Utility functions

In [None]:
def rmspe(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` against ``y_true``.

    Computes ``sqrt(mean(((y_true - y_pred) / y_true) ** 2))``.

    Parameters
    ----------
    y_true, y_pred
        Objects supporting element-wise arithmetic (the notebook passes
        pandas Series taken from the same DataFrame).

    Returns
    -------
    The square root of the mean squared relative error.

    Notes
    -----
    There is no guard against zeros in ``y_true``; the division is
    performed as-is.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

## RMSPE for the entire training set

In [None]:
# Overall RMSPE of the strategy and of the benchmark against the same targets.
strat_score = rmspe(df.target, df.pred)
bench_score = rmspe(df.target, df.benchmark)
# Interpolate the scores into the f-strings (the originals had no placeholders).
# The leading space in the first label keeps both labels the same width so the
# printed values line up.
print(f' Strategy rmspe: {strat_score}')
print(f'Benchmark rmspe: {bench_score}')

## RMSPE for each stock

In [None]:
# Per-stock RMSPE of the strategy and of the benchmark, side by side.
stock_groups = df.groupby('stock_id')

# Benchmark score per stock, as a two-column frame (stock_id, bench_rmspe).
df_rmspe_per_stock_bench = (
    stock_groups
    .apply(lambda grp: rmspe(grp.target, grp.benchmark))
    .to_frame('bench_rmspe')
    .reset_index()
)

# Strategy score per stock, sorted worst-first *before* the merge; the merge
# on stock_id then adds the benchmark column next to it.
df_rmspe_per_stock = (
    stock_groups
    .apply(lambda grp: rmspe(grp.target, grp.pred))
    .sort_values(ascending=False)
    .to_frame('strat_rmspe')
    .reset_index()
    .merge(df_rmspe_per_stock_bench, on='stock_id')
)
df_rmspe_per_stock.head(20)

## What happened for stock_id=31?

In [None]:
# Signed and absolute percentage error relative to the target, first for the
# strategy (`pred`), then for the benchmark.
df['pct_error'] = (df['target'] - df['pred']) / df['target']
df['abs_pct_error'] = df['pct_error'].abs()

df['pct_error_bench'] = (df['target'] - df['benchmark']) / df['target']
df['abs_pct_error_bench'] = df['pct_error_bench'].abs()

# Rows of stock 31 only, largest absolute strategy error first.
df_31 = df[df['stock_id'] == 31].sort_values(by='abs_pct_error', ascending=False)

In [None]:
# Distribution of the strategy's absolute percentage error for stock 31.
# Title and axis labels are set so the figure can be read on its own; the
# trailing semicolon suppresses the textual repr of the last expression.
ax = df_31['abs_pct_error'].hist(bins='auto')
ax.set(
    title='stock_id=31: absolute percentage error (strategy)',
    xlabel='abs_pct_error',
    ylabel='number of rows',
);

In [None]:
# Columns shown when inspecting individual rows.
disp_cols = [
    'stock_id', 'time_id',
    'target', 'pred', 'benchmark',
    'pct_error', 'pct_error_bench',
]
# df_31 is sorted by abs_pct_error in descending order, so the first 20 rows
# are the 20 largest absolute strategy errors for stock 31.
df_31.loc[:, disp_cols].head(20)

## What about other stocks?

In [None]:
# Every stock except 31, largest absolute strategy error first.
df_no_31 = df[df['stock_id'] != 31].sort_values(by='abs_pct_error', ascending=False)

In [None]:
# Distribution of the strategy's absolute percentage error for all stocks
# other than 31. Title and axis labels are set so the figure can be read on
# its own; the trailing semicolon suppresses the textual repr.
ax = df_no_31['abs_pct_error'].hist(bins='auto')
ax.set(
    title='stock_id!=31: absolute percentage error (strategy)',
    xlabel='abs_pct_error',
    ylabel='number of rows',
);

In [None]:
# Columns shown when inspecting individual rows.
disp_cols = [
    'stock_id', 'time_id',
    'target', 'pred', 'benchmark',
    'pct_error', 'pct_error_bench',
]
# df_no_31 is sorted by abs_pct_error in descending order, so the first 20
# rows are the 20 largest absolute strategy errors outside stock 31.
df_no_31.loc[:, disp_cols].head(20)