In [None]:
!pip install pandas_ta

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

from datetime import timedelta, date
import matplotlib.pyplot as plt
import statistics 
import statsmodels.tsa.seasonal as smt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pandas_ta as ta
import random
import datetime as dt
from sklearn import linear_model 
from sklearn.metrics import mean_absolute_error
import plotly
from sklearn import preprocessing
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import sklearn.metrics as metrics
import gc
import seaborn as sns
from sklearn.model_selection import train_test_split

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
from subprocess import check_output

# Show what is available under ../input by shelling out to `ls` and printing
# its decoded output.
print(check_output(["ls", "../input"]).decode("utf8"))

In [None]:
# Custom pandas_ta strategy: 10- and 30-period simple moving averages, RSI with
# no explicit parameters (library defaults), and MACD with fast=8 / slow=21.
CustomStrategy = ta.Strategy(
    name="Momo and Volatility",
    description="ma10, ma30, rsi, macd",
    ta=[
        dict(kind="sma", length=10),
        dict(kind="sma", length=30),
        dict(kind="rsi"),
        dict(kind="macd", fast=8, slow=21),
    ],
)

In [None]:
# Load the raw daily quotes and keep only the price, volume, symbol and
# timestamp columns.
data = pd.read_csv('/kaggle/input/5years-dailystock-quotes/original.csv', delimiter=',')
data = data[['open_price','high_price','low_price','close_price','volume','symbol','datetime']]
data.dataframeName = 'original.csv'

# Parse the timestamp strings into a new 'date' column. pop() removes the source
# 'datetime' column from the same DataFrame object, so 'date' ends up last.
new_format = "%Y-%m-%dT%H:%M:%SZ"
data['date'] = pd.to_datetime(data.pop('datetime'), format=new_format)

# Positional rename: the order must match the column selection above, with the
# parsed date in the final position.
data.columns = ['Open', 'High', 'Low', 'Close', 'Volume', 'Symbol', 'Date']

nRow = len(data.index)
nCol = len(data.columns)
print(f'There are {nRow} rows and {nCol} columns')

In [None]:
# Build the modelling table: for every ticker, label each row with the direction
# of the Close price n_days rows later and add pandas_ta indicator columns.
tick=['FBNC', 'BANF', 'SLCT', 'SCHW', 'WWE', 'NIO', 'PLTR', 'RKT', 'GME', 'AMC']

# Labelling parameters are the same for every ticker, so they are set once.
n_days = 5                  # look-ahead horizon, counted in rows of the per-ticker frame
fraction_movement = 0.037   # minimum move, as a fraction of today's Close, that counts as a trend

# Per-ticker frames are collected here and concatenated once at the end;
# growing a DataFrame with .append() in a loop is quadratic and the method no
# longer exists in pandas 2.x.
ticker_frames = []

for company in tick:
    print(company)
    ticker = data['Symbol']==company
    df = data[ticker].loc[data.first_valid_index():]
    df = df.reset_index(drop=True)
    # NOTE(review): the labels assume each symbol's rows are already in
    # chronological order — confirm against the source CSV.

    # Close n_days rows ahead. The last n_days rows have no future value (NaN),
    # every comparison against NaN is False, and so those rows stay unlabelled.
    # This replaces a row-by-row loop that relied on a bare `except: pass` to
    # skip the out-of-range lookups (and silently hid any other error).
    future_close = df['Close'].shift(-n_days)
    threshold = fraction_movement * df['Close']
    is_down = (df['Close'] - future_close) >= threshold
    is_up = (future_close - df['Close']) >= threshold

    # Trend: 0 = down move, 1 = up move, None = no sufficiently large move.
    # "Down" takes precedence, matching the original if/elif order.
    df['Trend'] = None
    df.loc[is_down, 'Trend'] = 0
    df.loc[is_up & ~is_down, 'Trend'] = 1

    # Passing an Index object keeps the 'Date' column in the frame as well.
    df.set_index(pd.DatetimeIndex(df["Date"]), inplace=True)

    # Called without arguments, i.e. with pandas_ta's default strategy; pass
    # CustomStrategy instead to compute only the reduced indicator set.
    df.ta.strategy()

    ticker_frames.append(df)

final_df = pd.concat(ticker_frames) if ticker_frames else pd.DataFrame()

In [None]:
# Keep only rows that received a trend label, store the label as an integer,
# and remove the DPO_20 and ICS_26 indicator columns.
# NOTE(review): presumably these two are dropped because they look ahead in
# time — confirm against the pandas_ta documentation.
final_df = (
    final_df
    .dropna(subset=['Trend'])
    .astype({'Trend': int})
    .drop(columns=['DPO_20', 'ICS_26'])
)

final_df

In [None]:
# Hold out the last 120 rows of final_df as the test set and run stratified
# K-fold cross-validation with LightGBM on the remaining rows.
# NOTE(review): final_df is the per-ticker frames stacked one after another, so
# the last 120 rows presumably all belong to the last ticker — confirm that
# this is the intended hold-out.
# NOTE(review): shuffled folds on a time series with forward-looking labels can
# leak information between train and validation rows — confirm this is acceptable.

num_folds = 2
row_split = final_df.shape[0] - 120
# The test slice starts at row_split itself (not row_split + 1) so that no row
# is silently dropped between the two sets and the test set has 120 rows.
train_df, test_df = final_df.iloc[:row_split,:] , final_df.iloc[row_split:,:]

# Cross validation model
folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=1001)

# Create arrays and dataframes to store results
oof_preds = np.zeros(train_df.shape[0])   # out-of-fold predictions on the training rows
sub_preds = np.zeros(test_df.shape[0])    # test predictions, averaged over the folds

# Columns that are not model inputs. The date column is named 'Date' after the
# rename at load time; it is datetime64, which LightGBM does not accept as a
# pandas feature dtype, so it has to be excluded. The lowercase spelling is
# kept in the set for backward compatibility.
non_feature_cols = {'Trend', 'Symbol', 'Date', 'date'}
feats = [f for f in train_df.columns if f not in non_feature_cols]

# LightGBM parameters found by Bayesian optimization
# (identical for every fold, so defined once outside the loop)
params = {
    'objective' :'binary',
    'learning_rate' : 0.02,
    'num_leaves' : 76,
    'feature_fraction': 0.64,
    'bagging_fraction': 0.8,
    'bagging_freq':1,
    'boosting_type' : 'gbdt',
    'metric': 'binary_logloss'
}

fold_importances = []

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['Trend'])):
    # free_raw_data=False keeps the raw frame on the Dataset so that
    # dvalid.data / dvalid.label can be read back after training.
    # NOTE(review): `silent`, `verbose_eval` and `early_stopping_rounds` are the
    # pre-4.0 LightGBM spelling; newer releases expect callbacks instead —
    # confirm against the installed version.
    dtrain = lgb.Dataset(data=train_df[feats].iloc[train_idx],
                         label=train_df['Trend'].iloc[train_idx],
                         free_raw_data=False, silent=True)
    dvalid = lgb.Dataset(data=train_df[feats].iloc[valid_idx],
                         label=train_df['Trend'].iloc[valid_idx],
                         free_raw_data=False, silent=True)

    clf = lgb.train(
        params=params,
        train_set=dtrain,
        num_boost_round=10000,
        valid_sets=[dtrain, dvalid],
        verbose_eval=50,
        early_stopping_rounds=100
    )

    oof_preds[valid_idx] = clf.predict(dvalid.data)
    # Each fold contributes an equal share to the averaged test prediction.
    sub_preds += clf.predict(test_df[feats]) / folds.n_splits

    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = feats
    fold_importance_df["importance"] = clf.feature_importance(importance_type='gain')
    fold_importance_df["fold"] = n_fold + 1
    fold_importances.append(fold_importance_df)

    print('Fold %2d ROC : %.6f' % (n_fold + 1,  roc_auc_score(dvalid.label, oof_preds[valid_idx])))
    del clf, dtrain, dvalid
    gc.collect()

# One row per (feature, fold); concatenated once instead of on every iteration.
feature_importance_df = pd.concat(fold_importances, axis=0)

metric = roc_auc_score(test_df['Trend'], sub_preds)
print('Test ROC score %.6f' % metric)

In [None]:
def display_importances(feature_importance_df_):
    """Plot and save the features with the highest mean importance.

    Parameters
    ----------
    feature_importance_df_ : pandas.DataFrame
        Must contain the columns ``feature`` and ``importance``. A feature may
        appear in several rows (e.g. once per fold).

    Returns
    -------
    None

    Side effects
    ------------
    Draws a seaborn bar plot on a new 8x10 matplotlib figure and writes it to
    ``lgbm_importances01.png`` in the current working directory.
    """
    # Rank features by their mean importance and keep the top 40 names.
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    cols = mean_importance[:40].index

    # Keep every row of the selected features so the bar plot can aggregate them.
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    # tight_layout has to be *called*; a bare `plt.tight_layout` only references
    # the function and leaves the layout untouched.
    plt.tight_layout()
    plt.savefig('lgbm_importances01.png')

In [None]:
# Plot and save the feature-importance chart from the per-fold importances
# gathered in feature_importance_df by the training cell.
display_importances(feature_importance_df)