In [46]:
import pandas as pd
import numpy as np
import matplotlib as plt
from scipy import stats
import joblib
from sklearn.linear_model import LinearRegression 

In [2]:
df_train = pd.read_csv("train_files/stock_prices.csv")
df_train_secondary = pd.read_csv("train_files/secondary_stock_prices.csv")
df_train_supplement = pd.read_csv("supplemental_files/stock_prices.csv")
df_train_supplement_secondary = pd.read_csv("supplemental_files/secondary_stock_prices.csv")
df_train = df_train.append(df_train_secondary)
df_train = df_train.append(df_train_supplement)
df_train = df_train.append(df_train_supplement_secondary)
df_train.head()

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target
0,20170104_1301,2017-01-04,1301,2734.0,2755.0,2730.0,2742.0,31400,1.0,,False,0.00073
1,20170104_1332,2017-01-04,1332,568.0,576.0,563.0,571.0,2798500,1.0,,False,0.012324
2,20170104_1333,2017-01-04,1333,3150.0,3210.0,3140.0,3210.0,270800,1.0,,False,0.006154
3,20170104_1376,2017-01-04,1376,1510.0,1550.0,1510.0,1550.0,11300,1.0,,False,0.011053
4,20170104_1377,2017-01-04,1377,3270.0,3350.0,3270.0,3330.0,150800,1.0,,False,0.003026


In [25]:
df_test = pd.read_csv("example_test_files/stock_prices.csv")
df_test_secondary = pd.read_csv("example_test_files/secondary_stock_prices.csv")
df_test = df_test.append(df_train_secondary)

## Model

In [11]:
def features_train(data):
    # turning the string into the datetime readable format
    data['Date'] = pd.to_datetime(data['Date'])
    # filling the blank expected dividends and targets to 0
    data['ExpectedDividend'] = data['ExpectedDividend'].fillna(0)
    data['Target'] = data['Target'].fillna(0)
    # changing the boolean into an integer so data is easier to work with
    data["SupervisionFlag"] = data["SupervisionFlag"].astype(int)
    
    # using both forward fill and backworkd fill to ensure there is no missing data
    cols = ['Open', 'High', 'Low', 'Close']
    data.loc[:,cols] = data.loc[:,cols].ffill()
    data.loc[:,cols] = data.loc[:,cols].bfill()
    
    # Calculating the range of the values and then working out the mean
    data['Daily_Range'] = data['Close'] - data['Open']
    data['Mean'] = (data['High']+data['Low']) / 2
    data['Mean'] = data['Mean'].astype(int)
    
    # Now we need to standardize the data as this could produce anomalies in having weird extremes
    data['Open'] = stats.zscore(data['Open'])
    data['High'] = stats.zscore(data['High'])
    data['Low'] = stats.zscore(data['Low'])
    data['Close'] = stats.zscore(data['Close'])
    data['Volume'] = stats.zscore(data['Volume'])
    data['Daily_Range'] = stats.zscore(data['Daily_Range'])
    data['Mean'] = stats.zscore(data['Mean'])
    
    # now we need to drop rowid as it is useless
    data = data.drop(['RowId'], axis=1)
    
    return data

In [12]:
data = features_train(df_train)

# now we need to split the data
data_train = data[data['Date']<'2022-04-01']
data_test = data[data['Date']>'2022-04-01']
data_test = data_test.reset_index(drop=True)
data_train = data_train.drop(['Date'], axis=1)
data_test = data_test.drop(['Date'], axis=1)

X_train = data_train.drop(['Target'], axis=1)
y_train = data_train['Target']
X_test = data_test.drop(['Target'], axis=1)
y_test = data_test['Target']

In [16]:
# using a simple linear regression model
model = LinearRegression()
model.fit(X_train, y_train)
LinearRegression()

LinearRegression()

In [19]:
print(model.coef_) # Partial Regression Coefficients

[-1.89119715e-08 -1.46658118e-02  2.27692986e+00  2.23630177e+00
 -1.46692120e-02 -4.96802598e-05 -6.47025014e-05 -4.06906116e-06
  2.45244447e-04 -1.88378247e-04 -4.48381175e+00]


In [20]:
print(model.intercept_) # Intercept of the regression line

0.0005079483293951115


In [21]:
print(model.predict(X_test)) # Predicted values

[ 4.53423354e-04  3.72772007e-04  4.43608414e-04 ...  2.30410017e-04
  2.96281695e-04 -6.97164493e-05]


In [22]:
print(model.score(X_test, y_test)) # gives a rating in order to comapre models

-0.008264231859545612


In [23]:
joblib.dump(model,"learning model")

['learning model']

In [40]:
#repeat the same for test data

def features_test(data):
    # turning the string into the datetime readable format
    data['Date'] = pd.to_datetime(data['Date'])
    # filling the blank expected dividends and targets to 0
    data['ExpectedDividend'] = data['ExpectedDividend'].fillna(0)
    data['Target'] = data['Target'].fillna(0)
    # changing the boolean into an integer so data is easier to work with
    data["SupervisionFlag"] = data["SupervisionFlag"].astype(int)
    
    # using both forward fill and backworkd fill to ensure there is no missing data
    cols = ['Open', 'High', 'Low', 'Close']
    data.loc[:,cols] = data.loc[:,cols].ffill()
    data.loc[:,cols] = data.loc[:,cols].bfill()
    
    # Calculating the range of the values and then working out the mean
    data['Daily_Range'] = data['Close'] - data['Open']
    data['Mean'] = (data['High']+data['Low']) / 2
    data['Mean'] = data['Mean'].astype(int)
    
    # Now we need to standardize the data as this could produce anomalies in having weird extremes
    data['Open'] = stats.zscore(data['Open'])
    data['High'] = stats.zscore(data['High'])
    data['Low'] = stats.zscore(data['Low'])
    data['Close'] = stats.zscore(data['Close'])
    data['Volume'] = stats.zscore(data['Volume'])
    data['Daily_Range'] = stats.zscore(data['Daily_Range'])
    data['Mean'] = stats.zscore(data['Mean'])
    
    # now we need to drop rowid as it is useless
    data = data.drop(['RowId', 'Date'], axis=1)
    
    return data

In [44]:
#predictions = pd.DataFrame()
x_test = features_test(df_test)
y_pred = model.predict(x_test.drop(['Target'], axis=1))

x_test['Target'] = y_pred
x_test = x_test.sort_values(by = "Target", ascending = False)
x_test['Rank'] = np.arange(len(x_test.index))
x_test = x_test.sort_values(by = "SecuritiesCode", ascending = True)

final_df = x_test[["SecuritiesCode", "Rank"]]
final_df.tail()

Unnamed: 0,SecuritiesCode,Rank
187929,25935,500144
15134,25935,595132
409695,25935,553154
113593,25935,498749
1693706,25935,540465
