In [69]:
#import modules
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [70]:
#import dataset
apple_data = pd.read_csv("./Datasets/Historical/HistoricalData_APPLE.csv", parse_dates=["Date"], index_col="Date")
apple_data.index = pd.to_datetime(apple_data.index,format='%Y-%m-%d')
apple_data = apple_data.sort_index(ascending=True, axis=0)
apple_data.head()

Unnamed: 0_level_0,Close/Last,Volume,Open,High,Low
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2011-06-10,$11.6393,433801306,$11.8054,$11.845,$11.6254
2011-06-13,$11.6643,329376468,$11.6857,$11.7254,$11.6096
2011-06-14,$11.8729,333995906,$11.7857,$11.9018,$11.7611
2011-06-15,$11.6696,395841722,$11.7768,$11.7964,$11.6029
2011-06-16,$11.6129,507299317,$11.675,$11.7386,$11.3689


In [71]:
apple_data.shape

(2516, 5)

In [72]:
#Edit dataset to remove $ and convert to float
apple_data['Open'] = apple_data['Open'].apply(lambda x: x.replace('$',''))
apple_data['Open'] = apple_data['Open'].astype(float)

apple_data['Close/Last'] = apple_data['Close/Last'].apply(lambda x: x.replace('$',''))
apple_data['Close/Last'] = apple_data['Close/Last'].astype(float)

apple_data['High'] = apple_data['High'].apply(lambda x: x.replace('$',''))
apple_data['High'] = apple_data['High'].astype(float)

apple_data['Low'] = apple_data['Low'].apply(lambda x: x.replace('$',''))
apple_data['Low'] = apple_data['Low'].astype(float)

In [73]:
apple_data.describe()

Unnamed: 0,Close/Last,Volume,Open,High,Low
count,2516.0,2516.0,2516.0,2516.0,2516.0
mean,41.277389,238882600.0,41.268871,41.69847,40.829849
std,30.304841,191459400.0,30.319067,30.698585,29.893474
min,11.2614,45448200.0,11.31,11.3464,11.0893
25%,21.21555,109265800.0,21.2441,21.4236,20.9928
50%,29.5925,167933400.0,29.505,29.81,29.28125
75%,47.73375,302247700.0,47.770625,47.99625,47.37125
max,143.16,1498071000.0,143.6,145.09,141.37


In [74]:
#find the difference in the Close/Last prices
apple_data['Close_difference'] = ''
for i in range(0, len(apple_data) - 1):
  apple_data['Close_difference'][i+1] = apple_data['Close/Last'][i+1] - apple_data['Close/Last'][i]
apple_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  apple_data['Close_difference'][i+1] = apple_data['Close/Last'][i+1] - apple_data['Close/Last'][i]


Unnamed: 0_level_0,Close/Last,Volume,Open,High,Low,Close_difference
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2011-06-10,11.6393,433801306,11.8054,11.8450,11.6254,
2011-06-13,11.6643,329376468,11.6857,11.7254,11.6096,0.025
2011-06-14,11.8729,333995906,11.7857,11.9018,11.7611,0.2086
2011-06-15,11.6696,395841722,11.7768,11.7964,11.6029,-0.2033
2011-06-16,11.6129,507299317,11.6750,11.7386,11.3689,-0.0567
...,...,...,...,...,...,...
2021-06-03,123.5400,76229170,124.6800,124.8500,123.1300,-1.52
2021-06-04,125.8900,75169340,124.0700,126.1600,123.8500,2.35
2021-06-07,125.9000,71057550,126.1700,126.3200,124.8321,0.01
2021-06-08,126.7400,74403770,126.6000,128.4600,126.2101,0.84


In [75]:
apple_data.dtypes

Close/Last          float64
Volume                int64
Open                float64
High                float64
Low                 float64
Close_difference     object
dtype: object

In [76]:
apple_data['Close_difference'] = pd.to_numeric(apple_data['Close_difference'])

In [77]:
#Get impact of the close price differences i.e., if close difference price>0, Impact=0 else if close difference price<0, Impact=0
def get_Impact(close_price): 
    if close_price > 0: 
        return 1
    else: 
        return 0
apple_data['Impact']=apple_data.apply(lambda x: get_Impact(x['Close_difference']), axis=1)
apple_data

Unnamed: 0_level_0,Close/Last,Volume,Open,High,Low,Close_difference,Impact
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011-06-10,11.6393,433801306,11.8054,11.8450,11.6254,,0
2011-06-13,11.6643,329376468,11.6857,11.7254,11.6096,0.0250,1
2011-06-14,11.8729,333995906,11.7857,11.9018,11.7611,0.2086,1
2011-06-15,11.6696,395841722,11.7768,11.7964,11.6029,-0.2033,0
2011-06-16,11.6129,507299317,11.6750,11.7386,11.3689,-0.0567,0
...,...,...,...,...,...,...,...
2021-06-03,123.5400,76229170,124.6800,124.8500,123.1300,-1.5200,0
2021-06-04,125.8900,75169340,124.0700,126.1600,123.8500,2.3500,1
2021-06-07,125.9000,71057550,126.1700,126.3200,124.8321,0.0100,1
2021-06-08,126.7400,74403770,126.6000,128.4600,126.2101,0.8400,1


In [78]:
apple_data.drop(apple_data.index[[0]], inplace=True)
apple_data

Unnamed: 0_level_0,Close/Last,Volume,Open,High,Low,Close_difference,Impact
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011-06-13,11.6643,329376468,11.6857,11.7254,11.6096,0.0250,1
2011-06-14,11.8729,333995906,11.7857,11.9018,11.7611,0.2086,1
2011-06-15,11.6696,395841722,11.7768,11.7964,11.6029,-0.2033,0
2011-06-16,11.6129,507299317,11.6750,11.7386,11.3689,-0.0567,0
2011-06-17,11.4379,614859874,11.7496,11.7589,11.4057,-0.1750,0
...,...,...,...,...,...,...,...
2021-06-03,123.5400,76229170,124.6800,124.8500,123.1300,-1.5200,0
2021-06-04,125.8900,75169340,124.0700,126.1600,123.8500,2.3500,1
2021-06-07,125.9000,71057550,126.1700,126.3200,124.8321,0.0100,1
2021-06-08,126.7400,74403770,126.6000,128.4600,126.2101,0.8400,1


In [79]:
# apple_data.to_csv("Impacted_APPLE.csv")

In [80]:
X=apple_data.iloc[:,0:5]
Y=apple_data['Impact']

In [81]:
#split the dataset into train and test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 0)

In [82]:
#create model
log_model = LogisticRegression()
log_model = log_model.fit(X_train,Y_train)

In [83]:
#find the probability
probability = log_model.predict_proba(X_test)
probability

array([[0.4968413 , 0.5031587 ],
       [0.49415745, 0.50584255],
       [0.4959823 , 0.5040177 ],
       ...,
       [0.49412509, 0.50587491],
       [0.49703394, 0.50296606],
       [0.49015058, 0.50984942]])

In [84]:
#predict Y
Y_predict = log_model.predict(X_test)

[[  0 252]
 [  0 251]]


In [86]:
#confusion matrix, accuracy, classification report
print(metrics.confusion_matrix(Y_test,Y_predict))
print(metrics.accuracy_score(Y_test,Y_predict))
print(metrics.classification_report(Y_test, Y_predict))

[[  0 252]
 [  0 251]]
0.4990059642147117
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       252
           1       0.50      1.00      0.67       251

    accuracy                           0.50       503
   macro avg       0.25      0.50      0.33       503
weighted avg       0.25      0.50      0.33       503

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [87]:
#Hyperparameter tuning
clf = LogisticRegression(solver='saga', max_iter=800, random_state=0)
param_grid = { 'penalty': ['none', 'l1', 'l2'], 'C': [0.05, 0.1, 0.5, 1, 5] }
grid_search = GridSearchCV(clf, param_grid=param_grid)
grid_search.fit(X, Y)
result = grid_search.cv_results_




In [88]:
print("Tuned Logistic Regression Parameters: {}".format(grid_search.best_params_)) 
print("Best score is {}".format(grid_search.best_score_))

Tuned Logistic Regression Parameters: {'C': 0.05, 'penalty': 'none'}
Best score is 0.5149105367793241
