<h1 style="text-align:center;">Quantitative Trading</h1>

<br>

<br>

# Initial Deployment

---

In [None]:
# Import the libraries
import requests, time, datetime, pytz, talib, sns
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
!pip install sns

<br>

# Load the Dataset

---

In [131]:
# Function for fetching the minute level dataset
def fetch_minute_dataset(ticker_name = "TSLA"):
    
    # Client ID
    client_id = "E5AXP16J02E3JTODQW9LW7L1AH726L9X"

    # Get the current time
    current_time_epoch = str(int(time.time()) * 1000)

    # Endpoint
    endpoint = r"https://api.tdameritrade.com/v1/marketdata/{}/pricehistory".format(ticker_name)

    # Define the payload                
    payload = {"apikey": client_id,
               #"periodType": "day",
               #"period": "2",
               "frequencyType": "minute",
               "frequency": "1",
               "endDate": current_time_epoch,        # Use "www.epochconverter.com" for getting the time
               #"startDate": "1546300801000",
               "needExtendedHoursData": "false"}      # Only regular market hour

    # Make a request
    content = requests.get(url = endpoint, params = payload)

    # Convert data into a json dictionary
    data = content.json()

    # Get the dataset in dataframe form
    data = pd.DataFrame(data["candles"])
    
    return data

In [140]:
df = fetch_minute_dataset(ticker_name = "SPY")
print("Dataset Shape: ", df.shape)
df

Dataset Shape:  (3740, 6)


Unnamed: 0,open,high,low,close,volume,datetime
0,411.230,411.71,410.94,411.6600,646169,1620826200000
1,411.670,412.10,411.52,411.5800,555916,1620826260000
2,411.580,411.88,411.28,411.8800,288527,1620826320000
3,411.870,412.05,411.64,411.7600,293702,1620826380000
4,411.770,412.04,411.69,411.9800,246648,1620826440000
...,...,...,...,...,...,...
3735,418.860,418.90,418.81,418.8819,39156,1621962900000
3736,418.880,419.03,418.86,418.9800,92273,1621962960000
3737,418.980,419.05,418.93,418.9999,73543,1621963020000
3738,419.010,419.09,419.01,419.0542,56741,1621963080000


In [141]:
def add_different_date_columns(df):
    """
    Function for adding date columns (i.e. "datetime", "year", "month", "day", "hour", "minute", "day name").
    """
    
    # Create a "date" only column
    def converter(x):
        x = datetime.datetime.fromtimestamp(x/1000, pytz.timezone("US/Eastern"))
        return "{}/{}/{}".format(x.year, x.month, x.day)

    df["date"] = df["datetime"].map(lambda x: converter(x))
    
    # Create the year, month, day, hour, and minute columns
    df["year"] = df["datetime"].map(lambda x: datetime.datetime.fromtimestamp(x/1000).year)
    df["month"] = df["datetime"].map(lambda x: datetime.datetime.fromtimestamp(x/1000).month)
    df["day"] = df["datetime"].map(lambda x: datetime.datetime.fromtimestamp(x/1000).day)
    df["hour"] = df["datetime"].map(lambda x: datetime.datetime.fromtimestamp(x/1000).hour)
    df["minute"] = df["datetime"].map(lambda x: datetime.datetime.fromtimestamp(x/1000).minute)
    df["day name"] = df["datetime"].map(lambda x: datetime.datetime.fromtimestamp(x/1000).strftime("%A"))
    
    return df

In [142]:
df = add_different_date_columns(df)
df

Unnamed: 0,open,high,low,close,volume,datetime,date,year,month,day,hour,minute,day name
0,411.230,411.71,410.94,411.6600,646169,1620826200000,2021/5/12,2021,5,12,17,30,Wednesday
1,411.670,412.10,411.52,411.5800,555916,1620826260000,2021/5/12,2021,5,12,17,31,Wednesday
2,411.580,411.88,411.28,411.8800,288527,1620826320000,2021/5/12,2021,5,12,17,32,Wednesday
3,411.870,412.05,411.64,411.7600,293702,1620826380000,2021/5/12,2021,5,12,17,33,Wednesday
4,411.770,412.04,411.69,411.9800,246648,1620826440000,2021/5/12,2021,5,12,17,34,Wednesday
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3735,418.860,418.90,418.81,418.8819,39156,1621962900000,2021/5/25,2021,5,25,21,15,Tuesday
3736,418.880,419.03,418.86,418.9800,92273,1621962960000,2021/5/25,2021,5,25,21,16,Tuesday
3737,418.980,419.05,418.93,418.9999,73543,1621963020000,2021/5/25,2021,5,25,21,17,Tuesday
3738,419.010,419.09,419.01,419.0542,56741,1621963080000,2021/5/25,2021,5,25,21,18,Tuesday


In [143]:
df["hour"].unique()

array([17, 18, 19, 20, 21, 22, 23])

<br>

# Feature Engineering

---

### Add Features

In [145]:
# Add price change
df["price diff"] = df['close'].pct_change()
df

Unnamed: 0,open,high,low,close,volume,datetime,date,year,month,day,hour,minute,day name,price diff
0,411.230,411.71,410.94,411.6600,646169,1620826200000,2021/5/12,2021,5,12,17,30,Wednesday,
1,411.670,412.10,411.52,411.5800,555916,1620826260000,2021/5/12,2021,5,12,17,31,Wednesday,-0.000194
2,411.580,411.88,411.28,411.8800,288527,1620826320000,2021/5/12,2021,5,12,17,32,Wednesday,0.000729
3,411.870,412.05,411.64,411.7600,293702,1620826380000,2021/5/12,2021,5,12,17,33,Wednesday,-0.000291
4,411.770,412.04,411.69,411.9800,246648,1620826440000,2021/5/12,2021,5,12,17,34,Wednesday,0.000534
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3735,418.860,418.90,418.81,418.8819,39156,1621962900000,2021/5/25,2021,5,25,21,15,Tuesday,0.000052
3736,418.880,419.03,418.86,418.9800,92273,1621962960000,2021/5/25,2021,5,25,21,16,Tuesday,0.000234
3737,418.980,419.05,418.93,418.9999,73543,1621963020000,2021/5/25,2021,5,25,21,17,Tuesday,0.000047
3738,419.010,419.09,419.01,419.0542,56741,1621963080000,2021/5/25,2021,5,25,21,18,Tuesday,0.000130


In [147]:
# Add relative volume (i.e. percentage of change in volume)
df["relative volume"] = df["volume"].pct_change()
df

Unnamed: 0,open,high,low,close,volume,datetime,date,year,month,day,hour,minute,day name,price diff,relative volume
0,411.230,411.71,410.94,411.6600,646169,1620826200000,2021/5/12,2021,5,12,17,30,Wednesday,,
1,411.670,412.10,411.52,411.5800,555916,1620826260000,2021/5/12,2021,5,12,17,31,Wednesday,-0.000194,-0.139674
2,411.580,411.88,411.28,411.8800,288527,1620826320000,2021/5/12,2021,5,12,17,32,Wednesday,0.000729,-0.480988
3,411.870,412.05,411.64,411.7600,293702,1620826380000,2021/5/12,2021,5,12,17,33,Wednesday,-0.000291,0.017936
4,411.770,412.04,411.69,411.9800,246648,1620826440000,2021/5/12,2021,5,12,17,34,Wednesday,0.000534,-0.160210
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3735,418.860,418.90,418.81,418.8819,39156,1621962900000,2021/5/25,2021,5,25,21,15,Tuesday,0.000052,0.467121
3736,418.880,419.03,418.86,418.9800,92273,1621962960000,2021/5/25,2021,5,25,21,16,Tuesday,0.000234,1.356548
3737,418.980,419.05,418.93,418.9999,73543,1621963020000,2021/5/25,2021,5,25,21,17,Tuesday,0.000047,-0.202985
3738,419.010,419.09,419.01,419.0542,56741,1621963080000,2021/5/25,2021,5,25,21,18,Tuesday,0.000130,-0.228465


In [148]:
# Replace infinity with NaN
df = df.replace([np.inf, -np.inf], np.nan)

In [149]:
# Remove rows with NaN values
df.dropna(inplace=True)

### Indicators

In [152]:
# Add RSI
time_periods = [14]
for i in time_periods:
    df.loc[:, f"RSI_{i}"] = talib.RSI(df["close"], timeperiod = i)

In [153]:
# Add EMA
time_periods = [9, 20, 50, 200]
for i in time_periods:
    df.loc[:, f"EMA_{i}"] = talib.EMA(df["close"], timeperiod = i)

In [154]:
# Add MACD
df["macd"], df["macdsignal"], df["macdhist"] = talib.MACD(df["close"], fastperiod=12, slowperiod=26, signalperiod=9)

In [155]:
# TODO: Add more indicator

In [156]:
# Remove rows with NaN values
df.dropna(inplace=True)

### Feature Importance & Feature Selection

In [168]:
# Get the x and y
X=df.drop(["price diff", "date", "year", "month", "day", "hour", "minute", "datetime", "day name"], axis=1)
y=df[["price diff"]]

In [170]:
# Scale the data
sc = StandardScaler()
X_scaled = sc.fit_transform(X)

In [171]:
# Convert to dataframe + Change the column names
df_scaled = pd.DataFrame(X_scaled, columns=list(X.columns.values))

In [173]:
# Add the target output to the dataframe
df_scaled["output"] = y["price diff"].reset_index().iloc[:, 1:]

##### Filter Method

In [175]:
# Get the correlation matrix
correlation_matrix = df_scaled.corr()

In [176]:
# Visualize the correlation map
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap=plt.cm.Reds)
plt.show()

NameError: name 'sns' is not defined

<Figure size 864x720 with 0 Axes>

In [None]:
# Visualize the correlation map related to output only
plt.figure(figsize=(1,8))
sns.heatmap(df_scaled.corr()[["output"]]*100, annot=True, cmap=plt.cm.Reds)
plt.show()

In [None]:
# TODO: Drop columns that have near to zero correlation (i.e. less than 0.5)


In [None]:
# TODO: Drop columns that have high correlation


##### Wrapper Methods - Backward Elimination

In [None]:
# Initialize the model
model = ridge_regressor()

# Fit the model
model.fit(X=x_train, y=y_train)

# Get the p-values
model.pvalues

In [None]:
# Get the columns
cols = list(X.columns)

# Set the maximum p value
pmax = 1

# While length of column is bigger than zero
while(len(cols)>0):
    
    # Initialize a list for p values
    p = []
    
    #
    X_1 = X[cols]
    
    # Initialize the model
    model = ridge_regressor()
    
    # Fit to the model
    model.fit(X=X_1, y=y_train)
    
    # Get the p-values inside a series
    p = pd.Series(model.pvalues.values(), index=cols)
    
    # Get the pmax
    pmax = max(p)
    
    # Get the features with pmax
    feature_with_p_max = p.idxmax()
    
    # If pmax is bigger than 0.05
    if (pmax>0.05):
        
        # Remove the feature from columns
        cols.remove(feature_with_p_max)
        
    # If pmax is less than 0.05
    else:
        
        # Break the loop
        break
        
# Set the selected features
selected_features = cols
print(selected_features)

In [None]:
# Backward Elimination - dropping columns which their pvalue is above 0.05
cols = list(X.columns)
pmax = 1
while (len(cols)>0):
    p= []
    X_1 = X[cols]
    X_1 = sm.add_constant(X_1)
    model = sm.OLS(y,X_1).fit()
    p = pd.Series(model.pvalues.values[1:],index = cols)      
    pmax = max(p)
    feature_with_p_max = p.idxmax()
    if(pmax>0.05):
        cols.remove(feature_with_p_max)
    else:
        break
selected_features_BE = cols
print(selected_features_BE)

##### Wrapper Methods - RFE (Recursive Feature Elimination)

In [None]:
# Rank the columns (1 being the most important one)
model = LinearRegression()
rfe = feature_selection.RFE(model, n_features_to_select=7)
X_rfe = rfe.fit_transform(X_scaled,y)  
model.fit(X_rfe,y)
print(rfe.support_)
print(rfe.ranking_)

In [None]:
pd.DataFrame(rfe.ranking_, index=X.columns, columns=["Rank"])

In [None]:
# Get the optimal number of features
#no of features
nof_list=np.arange(1,13)            
high_score=0
#Variable to store the optimum features
nof=0           
score_list =[]
for n in range(len(nof_list)):
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 0)
    model = LinearRegression()
    rfe = RFE(model,nof_list[n])
    X_train_rfe = rfe.fit_transform(X_train,y_train)
    X_test_rfe = rfe.transform(X_test)
    model.fit(X_train_rfe,y_train)
    score = model.score(X_test_rfe,y_test)
    score_list.append(score)
    if(score>high_score):
        high_score = score
        nof = nof_list[n]
print("Optimum number of features: %d" %nof)
print("Score with %d features: %f" % (nof, high_score))

In [None]:
# Feed the optial number of features as number of features to RFE and get the final set of features
cols = list(X.columns)
model = LinearRegression()
#Initializing RFE model
rfe = RFE(model, 10)             
#Transforming data using RFE
X_rfe = rfe.fit_transform(X,y)  
#Fitting the data to model
model.fit(X_rfe,y)              
temp = pd.Series(rfe.support_,index = cols)
selected_features_rfe = temp[temp==True].index
print(selected_features_rfe)

##### Embedded Method

In [None]:
X = df_scaled.drop("output", axis=1).to_numpy()
y = df_scaled["output"].to_numpy()

In [None]:
n_sample = 100000
X_sample = X[:n_sample]
y_sample = y[:n_sample]

In [None]:
model_tree = GradientBoostingRegressor(n_estimators=100)
model_tree.fit(X_sample, y_sample)
model_tree.feature_importances_ 

In [None]:
model_ridge = Ridge()
model_ridge.fit(X_sample, y_sample)
model_ridge.coef_

In [None]:
plt.figure(figsize=(15,5))
f1 = pd.DataFrame(model_tree.feature_importances_*100, index=list(df_scaled.drop("output", axis=1).columns.values))
plt.bar(x = list(f1.index),
        height = f1[0].values,
        color="green",
        edgecolor="black")
plt.hlines(0, 0, 10, linestyles="dashed", color="gray")
plt.title("Feature Importance - Gradient Boosting Regression (Tree Based)")
plt.show()

plt.figure(figsize=(15,5))
f2 = pd.DataFrame(model_ridge.coef_*100, index=list(df_scaled.drop("output", axis=1).columns.values))
plt.bar(x = list(f2.index),
        height = f2[0].values,
        color="orange",
        edgecolor="black")
plt.hlines(0, 0, 10, linestyles="dashed", color="gray")
plt.title("Feature Importance - Ridge Regression")
plt.show()