In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import matplotlib.pyplot as plt
import statistics 
import statsmodels.tsa.seasonal as smt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import random
import datetime as dt
from sklearn import linear_model 
from sklearn.metrics import mean_absolute_error
import plotly
from sklearn import preprocessing
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import sklearn.metrics as metrics
import gc
import seaborn as sns
from sklearn.model_selection import train_test_split

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
from subprocess import check_output

# List the input directory with the standard library instead of spawning the
# external `ls` program, so the cell does not depend on that binary existing.
print("\n".join(sorted(os.listdir("../input"))))

In [None]:
def transformer(df, ticker):
    """Build a next-row prediction frame for one ticker column.

    A ``TARGET`` column is added that holds the ``ticker`` column shifted up by
    one row, so each row's target is the value found in the following row.
    Rows without a target are removed, and so are rows whose target equals the
    most frequent target value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a column named ``ticker``. It is left unmodified.
    ticker : str
        Name of the column to predict.

    Returns
    -------
    pandas.DataFrame
        A new, independent frame with the ``TARGET`` column added and the rows
        described above removed.

    Raises
    ------
    KeyError
        If ``ticker`` is not a column of ``df``.
    """
    # assign() returns a new frame, so the caller's frame never gains (or has
    # overwritten) a TARGET column as a side effect of this call.
    framed = df.assign(TARGET=df[ticker].shift(-1))

    # drop na: the last row, and any row followed by a gap, has no next value
    framed = framed[framed['TARGET'].notna()]

    # drop rows before stock start IPO by looking at mode
    # NOTE(review): presumably pre-IPO rows carry one repeated filler value,
    # which is why the most frequent target is treated as "not trading yet" -
    # confirm against the dataset.
    try:
        mode = statistics.mode(framed['TARGET'])
    except statistics.StatisticsError:
        # Raised when no rows are left (and, on Python < 3.8, when there is no
        # unique mode). Nothing can be identified as filler, so keep all rows.
        return framed.copy()

    return framed[framed['TARGET'] != mode].copy()

def kfold_lightgbm(df, num_folds, stratified = False, debug= False, test_rows=120):
    """Cross-validate a LightGBM regressor and score it on a trailing hold-out.

    The last ``test_rows`` rows of ``df`` are held out as the test set; the
    rows before them are split into ``num_folds`` shuffled folds. One model is
    trained per fold, its predictions on the hold-out are averaged over the
    folds, and the R2 of that average against the hold-out ``TARGET`` is
    returned together with the per-fold feature importances.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``TARGET`` column. Every column other than ``TARGET``,
        ``index`` and ``Date`` is used as a feature.
    num_folds : int
        Number of cross-validation folds.
    stratified : bool, default False
        Use ``StratifiedKFold`` instead of ``KFold``.
        NOTE(review): stratification works on class labels while ``TARGET`` is
        used as a regression label here, so this is presumably unusable as is -
        confirm before enabling.
    debug : bool, default False
        Currently unused.
    test_rows : int, default 120
        Number of trailing rows kept out of training and used for the final score.

    Returns
    -------
    (pandas.DataFrame, float)
        Frame with columns ``feature``, ``importance`` (gain) and ``fold``, one
        row per feature per fold, and the hold-out R2 score.

    Raises
    ------
    ValueError
        If ``test_rows`` is smaller than 1, or ``df`` does not leave at least
        ``num_folds`` rows for training once the hold-out is removed.
    """
    if test_rows < 1:
        raise ValueError('test_rows must be at least 1, got %d' % test_rows)

    # Divide in training/validation and test data
    row_split = df.shape[0] - test_rows
    if row_split < num_folds:
        # Without this check a negative row_split would slice from the wrong
        # end of the frame and produce a meaningless split.
        raise ValueError(
            'need at least %d rows (%d hold-out + %d folds), got %d'
            % (test_rows + num_folds, test_rows, num_folds, df.shape[0])
        )
    # The hold-out starts exactly where training ends, so no row is skipped and
    # the hold-out has exactly test_rows rows.
    train_df, test_df = df.iloc[:row_split, :], df.iloc[row_split:, :]

    # Cross validation model
    if stratified:
        folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1001)
    else:
        folds = KFold(n_splits= num_folds, shuffle=True, random_state=1001)

    feats = [f for f in train_df.columns if f not in ['TARGET','index', 'Date']]

    # LightGBM parameters found by Bayesian optimization
    # (identical for every fold, so built once outside the loop)
    params = {
        'objective': 'regression',
        'boosting_type': 'rf',
        'nthread': 4,
        'learning_rate': 0.03,  # 02,
        'num_leaves': 4,
        'colsample_bytree': 0.9497036,
        'subsample': 0.8715623,
        'subsample_freq': 1,
        'max_depth': 8,
        'reg_alpha': 0.041545473,
        'reg_lambda': 0.0735294,
        'min_split_gain': 0.0222415,
        'min_child_weight': 60, # 39.3259775,
        'seed': 0,
        'verbose': -1,
        'metric': 'mse',
    }

    # Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    fold_importances = []  # concatenated once after the loop

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
        # NOTE(review): `silent`, `early_stopping_rounds` and `verbose_eval` are
        # legacy arguments; newer LightGBM releases expect callbacks instead -
        # confirm against the installed version.
        dtrain = lgb.Dataset(data=train_df[feats].iloc[train_idx],
                             label=train_df['TARGET'].iloc[train_idx],
                             free_raw_data=False, silent=True)
        dvalid = lgb.Dataset(data=train_df[feats].iloc[valid_idx],
                             label=train_df['TARGET'].iloc[valid_idx],
                             free_raw_data=False, silent=True)

        clf = lgb.train(
            params=dict(params),  # per-fold copy so the shared dict is never altered
            train_set=dtrain,
            num_boost_round=10000,
            valid_sets=[dtrain, dvalid],
            early_stopping_rounds=200,
            verbose_eval=False
        )

        oof_preds[valid_idx] = clf.predict(dvalid.data)
        # each fold contributes an equal share to the averaged hold-out prediction
        sub_preds += clf.predict(test_df[feats]) / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importance(importance_type='gain')
        fold_importance_df["fold"] = n_fold + 1
        fold_importances.append(fold_importance_df)
        print('Fold %2d R2 : %.6f' % (n_fold + 1,  metrics.r2_score(dvalid.label, oof_preds[valid_idx])))
        del clf, dtrain, dvalid
        gc.collect()

    feature_importance_df = pd.concat(fold_importances, axis=0)
    metric = metrics.r2_score(test_df['TARGET'], sub_preds)
    print('Test  R2 score %.6f' % metric)
    return feature_importance_df, metric

In [None]:
# Load the complete original.csv (no row limit is passed) and report its size.
data = pd.read_csv('/kaggle/input/5years-dailystock-quotes/original.csv', sep=',')
data.dataframeName = 'original.csv'
nRow = data.shape[0]
nCol = data.shape[1]
print(f'There are {nRow} rows and {nCol} columns')

In [None]:
#format date and labels
# All renames go through one mapping; rename() skips names that are no longer
# present, and the timestamp is parsed from the renamed column, so running
# this cell a second time does not fail with KeyError: 'datetime'.
new_format = "%Y-%m-%dT%H:%M:%SZ"
data.rename(
    columns={
        'datetime': 'Date',
        'open_price': 'Open',
        'close_price': 'Close',
        'high_price': 'High',
        'low_price': 'Low',
        'volume': 'Volume',
        'symbol': 'Label',
    },
    inplace=True,  # same object is kept, so attributes set on it earlier survive
)
data['Date'] = pd.to_datetime(data['Date'], format=new_format)

data.head()

In [None]:
#pivot tickers to columns: one row per Date, one column per Label, values are Close
# reset_index() turns the pivoted Date index into a regular column, so every
# row keeps the date it was pivoted on. Assigning data['Date'] onto a
# renumbered index instead pairs pivot row i with the i-th row of the long
# table, which is a different date.
pivot = data.pivot(index='Date', columns='Label', values='Close').reset_index()
pivot.head()

In [None]:
# Train one model per ticker; collect the hold-out R2 and the mean feature importances.
#tickers = data['Label'].unique()
tickers = ['FBNC','TEX','ETO','OMER','SEEL']

score_rows = []         # one {'ticker', 'R2'} record per ticker that trained
importance_frames = []  # one mean-importance frame (indexed by feature) per ticker

for ticker in tickers:
    print(ticker)
    try:
        df = transformer(pivot, ticker)
        feature_importance_df, r2 = kfold_lightgbm(df, 2)
    except Exception as exc:
        # Best effort: one failing ticker must not stop the run, but the reason
        # is reported instead of being silently discarded.
        print(f'  skipped {ticker}: {exc!r}')
        continue

    score_rows.append({'ticker': ticker, 'R2': r2})

    temp = (
        feature_importance_df[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    # keep only features that contributed, and tag them with their ticker
    importance_frames.append(temp[temp.importance != 0].assign(ticker=ticker))

# Build each result frame once (DataFrame.append is not available in pandas 2.x).
overall = pd.DataFrame(score_rows, columns=['ticker', 'R2'])
if importance_frames:
    feat_imp = pd.concat(importance_frames)
else:
    # every ticker failed: keep the expected layout so the sorts below still work
    feat_imp = pd.DataFrame(columns=['importance', 'ticker'],
                            index=pd.Index([], name='feature'))

overall = overall.sort_values(by=['R2'], ascending=False)
feat_imp = feat_imp.sort_values(by=['ticker','importance'], ascending=False)

In [None]:
# Write the per-ticker scores to the notebook output directory, then display them.
overall.to_csv(path_or_buf='/kaggle/working/overall.csv', index=False)
overall

In [None]:
# Move the feature names from the index into a 'feature' column before export.
# The check makes the cell safe to re-run: once 'feature' is a column, another
# reset_index() would only add a meaningless 'index' column to the CSV.
if 'feature' not in feat_imp.columns:
    feat_imp = feat_imp.reset_index()
feat_imp.to_csv('/kaggle/working/feature_importance.csv',index=False) # save to notebook output
feat_imp

In [None]:
# Give feat_imp a clean positional index for display.
# A named index carries data and is kept as a column; an unnamed positional
# index is discarded. A plain reset_index() would instead insert it as an extra
# 'index' column (then 'level_0', then fail) each time the cell is run.
index_holds_data = feat_imp.index.name is not None
feat_imp = feat_imp.reset_index(drop=not index_holds_data)
feat_imp

In [None]:
# Importance rows that belong to the FBNC model only
feat_imp.loc[feat_imp['ticker'].eq('FBNC')]