# JPX Tokyo Stock Exchange Prediction
# End to End Starter Notebook
# By Mohamed Eltayeb

# Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from lightgbm import LGBMRegressor
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None  # default='warn'

# Define Functions

In [None]:
#Evaluation Metric
#The Sharpe Ratio
def SharpeRatio(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """Score ranked predictions by the Sharpe ratio of their daily spread returns.

    For every 'Date' group the top ``portfolio_size`` rows (lowest 'Rank')
    are treated as purchases and the bottom ``portfolio_size`` rows
    (highest 'Rank') as shorts. The daily spread return is the weighted
    purchase return minus the weighted short return.

    Args:
        df (pd.DataFrame): predictions with 'Rank' and 'Target' columns that
            can be grouped by 'Date'.
        portfolio_size (int): number of equities on each side of the book.
        toprank_weight_ratio (float): weight given to the most extreme rank
            relative to the least extreme one (which always gets weight 1).
    Returns:
        (float): mean of the daily spread returns divided by their standard
        deviation (pandas' default ``Series.std``).
    """
    def _daily_spread(day_df, size, top_ratio):
        """Return the long-minus-short weighted return for a single date."""
        ranks = day_df['Rank']
        # Ranks are expected to run from 0 up to (number of rows - 1).
        assert ranks.min() == 0
        assert ranks.max() == len(ranks) - 1

        # Linearly decaying weights: `top_ratio` for the first pick, 1 for the last.
        weights = np.linspace(start=top_ratio, stop=1, num=size)
        mean_weight = weights.mean()

        best_first = day_df.sort_values(by='Rank')['Target'][:size]
        worst_first = day_df.sort_values(by='Rank', ascending=False)['Target'][:size]

        long_leg = (best_first * weights).sum() / mean_weight
        short_leg = (worst_first * weights).sum() / mean_weight
        return long_leg - short_leg

    daily_spreads = df.groupby('Date').apply(_daily_spread, portfolio_size, toprank_weight_ratio)
    return daily_spreads.mean() / daily_spreads.std()

In [None]:
#Add Rank To the Submission
def set_rank(df):
    """Attach a 0-based 'Rank' column ordered by descending 'predict'.

    Args:
        df (pd.DataFrame): frame containing a 'predict' column
    Returns:
        (pd.DataFrame): a sorted copy of ``df`` whose row with the largest
        prediction has Rank 0, the next one Rank 1, and so on.
    """
    # Highest prediction first; sort_values returns a new frame, so the
    # caller's frame is left as it was.
    ranked = df.sort_values("predict", ascending=False)
    # Position in the sorted order is the rank.
    ranked.loc[:, "Rank"] = np.arange(len(ranked["predict"]))
    return ranked

In [None]:
#Plot the LGBM Features Importances
def plotImp(model, X , num = 20, fig_size = (40, 20)):
    """Draw a horizontal bar chart of the model's most important features.

    Args:
        model: fitted estimator exposing ``feature_importances_``.
        X (pd.DataFrame): frame whose columns name the model's features.
        num (int): number of top features to show.
        fig_size (tuple): matplotlib figure size.

    The figure is saved to 'lgbm_importances-01.png' and then displayed.
    """
    importances = pd.DataFrame({'Value': model.feature_importances_, 'Feature': X.columns})
    # Keep only the `num` largest importances, biggest first.
    top_features = importances.sort_values(by="Value", ascending=False)[0:num]

    plt.figure(figsize=fig_size)
    # Large font scale to stay readable on the oversized default figure.
    sns.set(font_scale=5)
    sns.barplot(x="Value", y="Feature", data=top_features)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances-01.png')
    plt.show()

# Read the training and testing data


In [None]:
# Train on the primary `stock_prices.csv` file so the training universe
# matches the securities found in the test file below (the original read
# `secondary_stock_prices.csv`, a different file from the one used for testing).
train_df = pd.read_csv('../input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv')
# Example test rows shipped with the competition data.
test_df = pd.read_csv('../input/jpx-tokyo-stock-exchange-prediction/example_test_files/stock_prices.csv')

# Add Some Time Features

In [None]:
# Derive calendar features from 'Date' on both frames, then index by date.
for dataset in (train_df, test_df):
    dataset['Date'] = pd.to_datetime(dataset['Date'])
    dataset['Day'] = dataset.Date.dt.day
    dataset['Month'] = dataset.Date.dt.month
    dataset['Year'] = dataset.Date.dt.year
    dataset['DayOfWeek'] = dataset.Date.dt.dayofweek
    dataset['DayOfYear'] = dataset.Date.dt.dayofyear
    # `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
    # `isocalendar().week` is its replacement. It returns a UInt32 column, so
    # cast back to int64 to keep the dtype `weekofyear` used to produce.
    dataset['WeekOfYear'] = dataset.Date.dt.isocalendar().week.astype('int64')
    # From here on 'Date' is the index rather than a column.
    dataset.set_index("Date", inplace=True)

## Drop 'RowId' as it is not informative. Then drop 'ExpectedDividend' as it contains a lot of missing values

In [None]:
# Keep the test row identifiers before the column is removed below.
ID = test_df['RowId']
for dataset in (train_df, test_df):
    # 'ExpectedDividend' has a lot of missing values; 'RowId' is only an identifier.
    dataset.drop(columns=['RowId', 'ExpectedDividend'], inplace=True)

## Only use Available Target Values for Training

In [None]:
# Rows without a target cannot be used for supervised training.
train_df = train_df[train_df['Target'].notna()]

# Missing Values

In [None]:
# Missing data per column (training set): absolute count and percentage.
total = train_df.isna().sum().sort_values(ascending=False)
percent_1 = train_df.isna().sum().div(train_df.isna().count()).mul(100)
# One decimal place is enough for a quick overview.
percent_2 = percent_1.round(1).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data

In [None]:
# Missing data per column (testing set): absolute count and percentage.
total = test_df.isna().sum().sort_values(ascending=False)
percent_1 = test_df.isna().sum().div(test_df.isna().count()).mul(100)
# One decimal place is enough for a quick overview.
percent_2 = percent_1.round(1).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data

### We will use LGBM as a model which can handle missing values efficiently by itself.

# The Correlation with The Target 

In [None]:
# Pairwise correlation between all training columns.
corr_matrix = train_df.corr()
# Show each column's correlation with the target, strongest positive first.
corr_matrix.loc[:, "Target"].sort_values(ascending=False)

# Modeling

In [None]:
# Hyper-parameters passed to LightGBM; everything else stays at its default.
params = dict(max_depth=7)
# Fixed seed so repeated runs give the same model.
lgbm = LGBMRegressor(random_state=42, **params)

## Validation 

### We will use the last 60 days (about two months) of the training set as a validation set

In [None]:
# Everything from 60 days before the latest training date onwards is held out.
cutoff = train_df.index.max() - pd.Timedelta(days=60)
train = train_df[train_df.index < cutoff].copy()
valid = train_df[train_df.index >= cutoff].copy()

# Separate the validation targets from the validation features.
y_test = valid['Target'].copy()
valid = valid.drop(columns='Target')

# Frame that collects code, true target and (later) the prediction.
result = valid[['SecuritiesCode']]
result.loc[:, "Target"] = y_test.values

In [None]:
# Fit on the training window only, then score the held-out rows.
lgbm.fit(train.drop(columns='Target'), train['Target'])
result['predict'] = lgbm.predict(valid)

In [None]:
# Order rows by date, then by descending prediction within each date.
result = result.sort_values(['Date', "predict"], ascending=[True, False])
# group_keys=False keeps the original single 'Date' index. On pandas >= 2.0
# the default prepends the group key, giving two index levels named 'Date',
# which makes the groupby('Date') inside SharpeRatio fail.
result = result.groupby('Date', group_keys=False).apply(set_rank)
SharpeRatio(result)

## Show the Features Importances 

In [None]:
# Plot importances against the same feature columns the model was fitted on.
plotImp(lgbm, train.drop(columns='Target'))

## Make Your Submission

In [None]:
# Refit on the complete training data (validation window included).
lgbm.fit(train_df.drop(columns='Target'), train_df['Target'])
# Predict on the example test rows and keep the prediction next to them.
test_df['predict'] = lgbm.predict(test_df)

In [None]:
# Order rows by date, then by descending prediction within each date.
test_df = test_df.sort_values(['Date', "predict"], ascending=[True, False])
# group_keys=False keeps the original single 'Date' index instead of the
# duplicated ('Date', 'Date') MultiIndex that pandas >= 2.0 builds by default.
test_df = test_df.groupby('Date', group_keys=False).apply(set_rank)
# NOTE(review): if a SecuritiesCode occurs on several dates, only the rank
# from its last row survives in this mapping - confirm this is intended.
pred_dict = dict(zip(test_df["SecuritiesCode"], test_df["Rank"]))

In [None]:
import jpx_tokyo_market_prediction

# The competition API hands out the test data batch by batch.
env = jpx_tokyo_market_prediction.make_env()
iter_test = env.iter_test()

for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
    # NOTE(review): the ranks come from the precomputed `pred_dict`; the
    # streamed `prices` are not fed to the model here - confirm this is intended.
    mapped_ranks = sample_prediction["SecuritiesCode"].map(pred_dict)
    sample_prediction["Rank"] = mapped_ranks
    env.predict(sample_prediction)

## LB Score: -0.027

## Things to Improve:

#### - Do EDA and try to get useful insights
#### - Feature Engineering (Lags, Rollings, Aggregated Features...)
#### - Try to utilize the feature 'ExpectedDividend' instead of dropping it
#### - Try to use the other data files
#### - Check for Outliers
#### - Check the Missing Values and try to handle them more efficiently
#### - Do Error Analysis