<h3> Objectives</h3>
<ul>
<li>Determine whether a continuous (range-scaled) input variable can predict a discrete outcome: a positive or negative trade result.</li>
</ul>

In [30]:
# standard library
import sys, os, re
sys.path.append(os.path.abspath(os.path.join("../..", "src")))
import math

# pip packages
import numpy as np
import pandas as pd
import pickle as p
import matplotlib as mpl
import matplotlib.pyplot as plt

# ml
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# local modules
import d02_processing as preprocessor

# matplotlib visualize in jupyter
%matplotlib inline

In [5]:
# Load the processed trade log into a DataFrame.
# Every path component goes through os.path.join so the separator is chosen by
# the platform instead of being a hard-coded "/" glued onto the joined prefix.
data_path = os.path.join(
    os.path.abspath(os.path.join("../..", "data")),
    "03_processed",
    "01_df_trade_processed.csv",
)
data = pd.read_csv(data_path)
data.head()

Unnamed: 0,trade_num,ticker,profit_loss,cap,time_duration,positions,day,week,date,volume,entry,exit,Market Cap,Short Float,Shs Float,Market Category
0,0,LC,-10.95,475.45,122,6,3,30,2021-07-29,5,06:48:38,06:49:24,1.61B,4.86%,94.89M,small
1,1,NAOV,-0.05,25.35,441,2,3,30,2021-07-29,10,06:39:34,06:45:47,13.24M,0.97%,21.18M,nano
2,2,NAOV,-1.0,26.8,121,2,3,30,2021-07-29,10,06:33:24,06:34:37,13.24M,0.97%,21.18M,nano
3,3,NURO,-2.5,106.0,157,2,2,30,2021-07-28,5,08:00:53,08:01:44,94.16M,1.74%,1.95M,micro
4,4,TYHT,-0.6,29.7,123,2,2,30,2021-07-28,5,07:51:23,07:52:40,16.89M,6.46%,2.62M,nano


In [6]:
# Binary target derived from profit_loss: 1 when the trade made money
# (profit_loss > 0), 0 for everything else (losses and break-even trades).
is_profit = data["profit_loss"].gt(0)
data["sign"] = np.where(is_profit, 1, 0)
data.head()


Unnamed: 0,trade_num,ticker,profit_loss,cap,time_duration,positions,day,week,date,volume,entry,exit,Market Cap,Short Float,Shs Float,Market Category,sign
0,0,LC,-10.95,475.45,122,6,3,30,2021-07-29,5,06:48:38,06:49:24,1.61B,4.86%,94.89M,small,0
1,1,NAOV,-0.05,25.35,441,2,3,30,2021-07-29,10,06:39:34,06:45:47,13.24M,0.97%,21.18M,nano,0
2,2,NAOV,-1.0,26.8,121,2,3,30,2021-07-29,10,06:33:24,06:34:37,13.24M,0.97%,21.18M,nano,0
3,3,NURO,-2.5,106.0,157,2,2,30,2021-07-28,5,08:00:53,08:01:44,94.16M,1.74%,1.95M,micro,0
4,4,TYHT,-0.6,29.7,123,2,2,30,2021-07-28,5,07:51:23,07:52:40,16.89M,6.46%,2.62M,nano,0


In [27]:
# Convert the 'Shs Float' strings (e.g. "94.89M") to numeric floats.
# A bare '-' is treated as "no value": it is rewritten to the text 'nan' before
# conversion, and any real NaN coming back from the converter is imputed with 0.
# NOTE(review): assumes preprocessor.process.value_to_float accepts the text
# 'nan' (the previous version of this cell fed it the same text) — confirm
# against d02_processing.
shs_float_text = data['Shs Float'].astype(str)
# np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
shs_float_text = shs_float_text.apply(lambda raw: re.sub(r'^-$', str(np.nan), raw))
shs_float_num = shs_float_text.apply(preprocessor.process.value_to_float)
# Fill AFTER the numeric conversion. On the stringified column a
# replace(np.nan, 0) can never match, because astype(str) has already turned
# every missing value into the text 'nan'.
data['Shs Float'] = shs_float_num.fillna(0)

In [28]:
# training | testing data (90% train / 10% test)
# The single feature is 'Shs Float'; the target is the binary 'sign' column.
# random_state pins the shuffle so the split, and every metric computed from it,
# is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    data[['Shs Float']],
    data.sign,
    train_size=0.9,
    test_size=0.10,
    random_state=42,
)

In [29]:
# Fit a logistic regression with scikit-learn's default settings on the
# training split. fit() is the last expression, so the notebook echoes the
# fitted estimator as the cell output.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

LogisticRegression()

In [13]:
# save the model to disk
filename = 'logistic_regression_model.sav'
# The context manager closes (and therefore flushes) the file handle even if
# dump() raises; a bare open() passed inline was never closed explicitly.
with open(filename, 'wb') as model_file:
    p.dump(model, model_file)

In [14]:
# Predicted class label (0 or 1) for every row of the held-out test split.
model.predict(X=X_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0])

In [15]:
# Mean accuracy of the fitted model on the held-out test split.
model.score(X=X_test, y=y_test)

0.45689655172413796

In [16]:
# Per-row class probabilities behind each prediction:
# column 0 is P(sign = 0), column 1 is P(sign = 1).
model.predict_proba(X=X_test)

array([[0.50066605, 0.49933395],
       [0.50232375, 0.49767625],
       [0.50082886, 0.49917114],
       [0.5       , 0.5       ],
       [0.51378985, 0.48621015],
       [0.50221521, 0.49778479],
       [0.50762442, 0.49237558],
       [0.521111  , 0.478889  ],
       [0.52372552, 0.47627448],
       [0.52372552, 0.47627448],
       [0.50018625, 0.49981375],
       [0.50021215, 0.49978785],
       [0.50044033, 0.49955967],
       [0.50037373, 0.49962627],
       [0.50133702, 0.49866298],
       [0.50223001, 0.49776999],
       [0.51017925, 0.48982075],
       [0.50308351, 0.49691649],
       [0.50044897, 0.49955103],
       [0.55510161, 0.44489839],
       [0.61510285, 0.38489715],
       [0.50074869, 0.49925131],
       [0.50091026, 0.49908974],
       [0.51378985, 0.48621015],
       [0.50010237, 0.49989763],
       [0.53396535, 0.46603465],
       [0.501506  , 0.498494  ],
       [0.51406592, 0.48593408],
       [0.55215108, 0.44784892],
       [0.5020339 , 0.4979661 ],
       [0.