In [37]:
import pandas as pd
import numpy as np 
from sklearn import preprocessing, svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, RocCurveDisplay
from statistics import mean, median
import matplotlib

In [38]:
def get_st_momentum(num_days, closing_prices):
    # Average of a stock's momentum over the past num_days. Each day is labeled 1 if
    # the closing price that day is higher than the closing price of the day before, and -1 if its lower
    momentum = []
    stock_momentum = []

    for i in range(num_days, len(closing_prices)):
        momentum.append(1 if closing_prices[i] > closing_prices[i - 1] else -1)

    for i in range(num_days, len(closing_prices)):
        stock_momentum.append(mean(momentum[i - num_days:i]))

    return stock_momentum

In [39]:
def get_volatility(num_days, closing_prices):
    # Stock price volatility. This is an average over the past num_days of
    # percent change in a stock's price per day
    volatility = []
    avg_volatility = []

    for i in range(num_days, len(closing_prices)):
        volatility.append((closing_prices[i] - closing_prices[i-1])/closing_prices[i-1])

    for i in range(num_days, len(closing_prices)):
        avg_volatility.append(mean(volatility[i - num_days:i]))

    return avg_volatility

In [40]:
df_unedited  = pd.read_csv("C:\\Users\\Sepehr\\Desktop\\SN_Project_final\\3_Data_for_ML_Plot_withImportance\\topico_financial.csv")
Social_data_unedited = pd.read_csv("C:\\Users\\Sepehr\\Desktop\\SN_Project_final\\3_Data_for_ML_Plot_withImportance\\topico_social.csv")
num_days = 1 # prediction threshold
n = 3 # number of days before 

social_data =  Social_data_unedited[Social_data_unedited.date.isin(df_unedited.date)]
df = df_unedited[df_unedited.date.isin(social_data.date)]
closing_prices = list(df['close'])

social_data = social_data.iloc[n:,:]
df = df.iloc[n:,:]
df = df.merge(social_data, on='date')

df['Stock_Momentum'] = get_st_momentum(n, closing_prices)
df['Volatility'] = get_volatility(n, closing_prices)

df = df[['date','Volatility', 'Stock_Momentum','imp. views']]
df = df[:len(df)-num_days]

In [41]:
X = np.array(df)
    
# Create Y vector; defined as whether a stock will increase or decrease in price in num_days
Y = []
for i in range(len(closing_prices)-num_days):
    if (closing_prices[i+num_days]- closing_prices[i])/closing_prices[i] > 0.005 :
        Y.append(1)
    if (closing_prices[i+num_days]- closing_prices[i])/closing_prices[i] < -0.005 :
        Y.append(-1)
    if (closing_prices[i+num_days]- closing_prices[i])/closing_prices[i] >= -0.005 and (closing_prices[i+num_days]- closing_prices[i])/closing_prices[i] <= 0.005 :
        Y.append(0)


# Adjust length of Y to match X if needed
if len(Y) > len(X):
    adjustment = len(Y) - len(X)
    Y = Y[adjustment:]   

#n = 5
#X_test_new = X[-n:, :]
#Y_test_new = Y[-n:]
#print(Y)
#X = X[:-n, :]
#Y = Y[:-n]
print(X)

[[2.02009270e+07 1.04319470e-02 3.33333333e-01 2.51670562e+06]
 [2.02009280e+07 1.23368997e-02 3.33333333e-01 1.18549411e+06]
 [2.02009290e+07 3.80490609e-02 1.00000000e+00 2.39069336e+06]
 ...
 [2.02201250e+07 1.41048466e-02 3.33333333e-01 1.59809478e+05]
 [2.02201260e+07 3.94761199e-02 1.00000000e+00 3.07859668e+05]
 [2.02201290e+07 4.10008234e-02 1.00000000e+00 2.08660167e+05]]


In [42]:
# Split training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3,shuffle=False,random_state = 0)

In [43]:
X_test = pd.DataFrame(X_test)
#X_test_new = pd.DataFrame(X_test_new)
X_train = pd.DataFrame(X_train)
dates = X_test[0]
#dates = X_test_new[0]
X_test

Unnamed: 0,0,1,2,3
0,20210911.0,-0.006562,-0.333333,238067.13230
1,20210912.0,-0.006689,-0.333333,85370.45770
2,20210913.0,0.001269,-0.333333,28288.59838
3,20210914.0,-0.002815,-0.333333,42554.10737
4,20210915.0,-0.014081,-1.000000,26257.37735
...,...,...,...,...
93,20220123.0,-0.041409,-1.000000,131231.54500
94,20220124.0,-0.014752,-0.333333,172048.40910
95,20220125.0,0.014105,0.333333,159809.47820
96,20220126.0,0.039476,1.000000,307859.66760


In [44]:
X_test = X_test.iloc[:, 1:4]
#X_test_new = X_test_new.iloc[:, 1:4]
X_train = X_train.iloc[:,1:4]
X_test

Unnamed: 0,1,2,3
0,-0.006562,-0.333333,238067.13230
1,-0.006689,-0.333333,85370.45770
2,0.001269,-0.333333,28288.59838
3,-0.002815,-0.333333,42554.10737
4,-0.014081,-1.000000,26257.37735
...,...,...,...
93,-0.041409,-1.000000,131231.54500
94,-0.014752,-0.333333,172048.40910
95,0.014105,0.333333,159809.47820
96,0.039476,1.000000,307859.66760


In [45]:
X_test = np.array(X_test)
#X_test_new = np.array(X_test_new)
X_test

array([[-6.56217409e-03, -3.33333333e-01,  2.38067132e+05],
       [-6.68855291e-03, -3.33333333e-01,  8.53704577e+04],
       [ 1.26888663e-03, -3.33333333e-01,  2.82885984e+04],
       [-2.81504646e-03, -3.33333333e-01,  4.25541074e+04],
       [-1.40814912e-02, -1.00000000e+00,  2.62573773e+04],
       [-9.88988957e-03, -3.33333333e-01,  9.38808622e+04],
       [-1.43225289e-02, -3.33333333e-01,  3.24748774e+04],
       [-6.35144230e-03,  3.33333333e-01,  2.24557559e+05],
       [ 6.46497289e-03,  3.33333333e-01,  5.35674661e+04],
       [ 1.54437684e-02,  3.33333333e-01,  4.46668994e+04],
       [ 9.44613991e-03, -3.33333333e-01,  3.15621657e+05],
       [ 6.29624668e-03, -3.33333333e-01,  1.16595177e+05],
       [ 2.50182012e-02,  3.33333333e-01,  5.60927667e+04],
       [ 4.39996033e-02,  1.00000000e+00,  1.74758574e+05],
       [ 3.17986215e-02,  1.00000000e+00,  3.17025549e+05],
       [ 1.55150966e-02,  3.33333333e-01,  7.75799386e+05],
       [-1.50522132e-02, -3.33333333e-01

In [46]:
X_train = np.array(X_train)

In [47]:
X_test = preprocessing.scale(X_test)
#X_test_new = preprocessing.scale(X_test_new)
X_train = preprocessing.scale(X_train)

In [48]:
# Construct and build classifier
clf = svm.SVC(kernel='rbf', gamma='scale')
clf.fit(X_train, Y_train)

SVC()

In [49]:
# Calculate accuracy
score = clf.score(X_test, Y_test)
#score2 = clf.score(X_test_new, Y_test_new)
print("score = %f" %score)
#print("score = %f" %score2)

score = 0.622449


In [50]:
predicted = pd.DataFrame([clf.predict(X_test),Y_test])
predicted = predicted.transpose()
predicted = predicted.rename(columns={0:'Predicted',1:'True'})

In [51]:
predicted['Dates'] = dates
predicted = predicted [['Dates','Predicted','True']]

In [52]:
predicted.to_csv('Combo_indicator.csv', index = False)
print("score = %f" %score)

score = 0.622449
