# Importing the required libraries

In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

## Reading the dataset

In [57]:
data=pd.read_csv("Stocks_Dataset.csv", index_col='Date')
data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1988-01-04,1952.589966,2030.01001,1950.76001,2015.25,2015.25,20880000
1988-01-05,2056.370117,2075.27002,2021.390015,2031.5,2031.5,27200000
1988-01-06,2036.469971,2058.189941,2012.77002,2037.800049,2037.800049,18800000
1988-01-07,2019.890015,2061.51001,2004.640015,2051.889893,2051.889893,21370000
1988-01-08,2046.579956,2058.689941,1898.040039,1911.310059,1911.310059,27440000


In [58]:
data.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-12-23,19908.609375,19934.150391,19899.060547,19933.810547,19933.810547,158260000
2016-12-27,19943.460938,19980.240234,19939.800781,19945.039063,19945.039063,158540000
2016-12-28,19964.310547,19981.109375,19827.310547,19833.679688,19833.679688,188350000
2016-12-29,19835.460938,19878.439453,19788.939453,19819.779297,19819.779297,172040000
2016-12-30,19833.169922,19852.550781,19718.669922,19762.599609,19762.599609,271910000


In [59]:
data.shape  # shape of the dataset

(7310, 6)

In [60]:
data.info()  # observing the type of each column and no. of values available for each column

<class 'pandas.core.frame.DataFrame'>
Index: 7310 entries, 1988-01-04 to 2016-12-30
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       7310 non-null   float64
 1   High       7310 non-null   float64
 2   Low        7310 non-null   float64
 3   Close      7310 non-null   float64
 4   Adj Close  7310 non-null   float64
 5   Volume     7310 non-null   int64  
dtypes: float64(5), int64(1)
memory usage: 399.8+ KB


In [61]:
data.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,7310.0,7310.0,7310.0,7310.0,7310.0,7310.0
mean,9133.807065,9191.949688,9074.398514,9136.705523,9136.705523,133186000.0
std,4609.854291,4630.332716,4588.541131,4610.672687,4610.672687,108246500.0
min,1892.410034,1903.349976,1845.98999,1879.140015,1879.140015,5170000.0
25%,4100.525086,4143.682373,4093.277405,4141.954956,4141.954956,30842500.0
50%,9969.194824,10046.060059,9894.884765,9973.145019,9973.145019,105545000.0
75%,11947.520019,12031.347412,11846.589599,11952.704834,11952.704834,214190000.0
max,19968.970703,19987.630859,19941.960938,19974.619141,19974.619141,738440000.0


In [62]:
data.isnull()  # checking the null values

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1988-01-04,False,False,False,False,False,False
1988-01-05,False,False,False,False,False,False
1988-01-06,False,False,False,False,False,False
1988-01-07,False,False,False,False,False,False
1988-01-08,False,False,False,False,False,False
...,...,...,...,...,...,...
2016-12-23,False,False,False,False,False,False
2016-12-27,False,False,False,False,False,False
2016-12-28,False,False,False,False,False,False
2016-12-29,False,False,False,False,False,False


In [63]:
data.isnull().sum()   # calculating the total no. of null values

Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

In [64]:
data.duplicated()  # checking the duplicate values

Date
1988-01-04    False
1988-01-05    False
1988-01-06    False
1988-01-07    False
1988-01-08    False
              ...  
2016-12-23    False
2016-12-27    False
2016-12-28    False
2016-12-29    False
2016-12-30    False
Length: 7310, dtype: bool

In [65]:
data.duplicated().sum()  # calculating the total no. of duplicate values

0

## Applying feature engineering and creating the new dataset with new features

In [66]:
def generate_features(df):
    df_new=pd.DataFrame()
    df_new['open']=df['Open']
    df_new['open_1']=df['Open'].shift(1)
    df_new['close_1']=df['Close'].shift(1)
    df_new['high_1']=df['High'].shift(1)
    df_new['low_1']=df['Low'].shift(1)
    df_new['volume_1']=df['Volume'].shift(1)
    # average price
    df_new['avg_price_5']= df['Close'].rolling(5).mean().shift(1)
    df_new['avg_price_30']= df['Close'].rolling(21).mean().shift(1)
    df_new['avg_price_365']= df['Close'].rolling(252).mean().shift(1)
    df_new['ratio_avg_price_5_30']= df_new['avg_price_5']/df_new['avg_price_30']
    df_new['ratio_avg_price_5_365']= df_new['avg_price_5']/df_new['avg_price_365']
    df_new['ratio_avg_price_30_365']= df_new['avg_price_30']/df_new['avg_price_365']
    # average volume
    df_new['avg_volume_5']= df['Volume'].rolling(5).mean().shift(1)
    df_new['avg_volume_30']= df['Volume'].rolling(21).mean().shift(1)
    df_new['avg_volume_365']= df['Volume'].rolling(252).mean().shift(1)
    df_new['ratio_avg_volume_5_30']= df_new['avg_volume_5']/df_new['avg_volume_30']
    df_new['ratio_avg_volume_5_365']= df_new['avg_volume_5']/df_new['avg_volume_365']
    df_new['ratio_avg_volume30_365']= df_new['avg_volume_30']/df_new['avg_volume_365']
    #standard deviation of prices
    df_new['std_price_5']= df['Close'].rolling(5).std().shift(1)
    df_new['std_price_30']= df['Close'].rolling(21).std().shift(1)
    df_new['std_price_365']= df['Close'].rolling(252).std().shift(1)
    df_new['ratio_std_price_5_30']= df_new['std_price_5']/df_new['std_price_30']
    df_new['ratio_std_price_5_365']= df_new['std_price_5']/df_new['std_price_365']
    df_new['ratio_std_price_30_365']= df_new['std_price_30']/df_new['std_price_365']
    # standard deviation of volume
    df_new['std_volume_5']= df['Volume'].rolling(5).std().shift(1)
    df_new['std_volume_30']= df['Volume'].rolling(21).std().shift(1)
    df_new['std_volume_365']= df['Volume'].rolling(252).std().shift(1)
    df_new['ratio_std_volume_5_30']= df_new['std_volume_5']/df_new['std_volume_30']
    df_new['ratio_std_volume_5_365']= df_new['std_volume_5']/df_new['std_volume_365']
    df_new['ratio_std_volume_30_365']= df_new['std_volume_30']/df_new['std_volume_365']
    # the target
    df_new['close']=df['Close']
    df_new=df_new.dropna(axis=0)
    return df_new

In [67]:
data_new = generate_features(data)  
data_new.head()

Unnamed: 0_level_0,open,open_1,close_1,high_1,low_1,volume_1,avg_price_5,avg_price_30,avg_price_365,ratio_avg_price_5_30,...,ratio_std_price_5_30,ratio_std_price_5_365,ratio_std_price_30_365,std_volume_5,std_volume_30,std_volume_365,ratio_std_volume_5_30,ratio_std_volume_5_365,ratio_std_volume_30_365,close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1988-12-30,2183.389893,2169.110107,2182.679932,2193.040039,2165.179932,12220000.0,2168.215967,2146.005697,2061.050753,1.01035,...,0.372085,0.123335,0.331471,4260343.0,4572098.0,6461774.0,0.931814,0.659315,0.707561,2168.570068
1989-01-03,2163.209961,2183.389893,2168.570068,2193.75,2162.5,11140000.0,2169.857959,2148.579985,2061.659166,1.009903,...,0.332631,0.106706,0.320794,2605936.0,4627472.0,6476618.0,0.563145,0.402361,0.714489,2144.639893
1989-01-04,2153.75,2163.209961,2144.639893,2168.389893,2127.139893,17310000.0,2164.999951,2150.616176,2062.108134,1.006688,...,0.677084,0.191809,0.283287,3633685.0,4665697.0,6453401.0,0.778809,0.563065,0.722983,2177.679932
1989-01-05,2184.290039,2153.75,2177.679932,2183.389893,2146.610107,15710000.0,2167.999951,2154.682838,2062.663213,1.006181,...,0.914097,0.204474,0.22369,3057422.0,4671999.0,6455623.0,0.654414,0.473606,0.72371,2190.540039
1989-01-06,2195.889893,2184.290039,2190.540039,2205.179932,2173.040039,20310000.0,2172.821973,2157.86284,2063.213412,1.006932,...,1.089841,0.24502,0.224822,3744766.0,4825499.0,6454034.0,0.776037,0.580221,0.747672,2194.290039


In [68]:
start_train="1988-01-01"
end_train="2015-12-31"

start_test="2016-01-01"
end_test="2016-12-31"

### Defining the training data

In [70]:
data_train= data_new[start_train:end_train]

X_train= data_train.drop('close',axis=1).values
y_train= data_train['close'].values

In [71]:
print(X_train.shape)
print(y_train.shape)

(6806, 30)
(6806,)


### Defining the test data

In [73]:
data_test= data_new[start_test:end_test]

X_test= data_test.drop('close',axis=1).values
y_test= data_test['close'].values

In [132]:
X_test.shape

(252, 30)

In [133]:
y_test.shape

(252,)

# Linear Regression Algorithms 

## Using SGDRegressor 

In [74]:
scaler=StandardScaler()

X_scaled_train= scaler.fit_transform(X_train)
X_scaled_test= scaler.transform(X_test)

#defining the hyperparameters for the model
param_grid={
    "alpha":[1e-5, 3e-5,1e-4],    #alpha parameter is for regularization
    "eta0":[0.01,0.03,0.1]        #eta parameter is learning rate
}

from sklearn.linear_model import SGDRegressor
lr=SGDRegressor(penalty="l2", max_iter=100)
grid_search= GridSearchCV(lr, param_grid, cv=5, scoring="r2") #gridsearchcv is used for identifying the best parameter
grid_search.fit(X_scaled_train,y_train)

print(grid_search.best_params_)



{'alpha': 1e-05, 'eta0': 0.1}


In [75]:
lr_best= grid_search.best_estimator_   #taking the best parameter with the help of gridsearchcv

predictions_lr=lr_best.predict(X_scaled_test)    #predicting the results for test set

### MSE and MAE errors in the model

In [81]:
print('Mean Squared Error: {0:.3f}'.format(mean_squared_error(y_test, predictions_lr)))
print('Mean Absolute Error: {0:.3f}'.format(mean_absolute_error(y_test, predictions_lr)))

Mean Squared Error: 20969.691
Mean Absolute Error: 104.995


### Accuracy of SGDRegressor

In [80]:
print('Accuracy of the model: {0:.3f}'.format(r2_score(y_test, predictions_lr)))  

Accuracy of the model: 0.976


# Random Forest Algorithm

In [86]:
param_grid={
    'max_depth':[5,10,15,20,30],
    'min_samples_split':[5,15,20],
    'max_features':['auto','sqrt'],
    'min_samples_leaf':[2,3]
}

In [90]:
from sklearn.ensemble import RandomForestRegressor

rf=RandomForestRegressor(n_estimators=100, n_jobs=-1)
grid_search= GridSearchCV(rf, param_grid, cv=5, scoring="r2", n_jobs=-1)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

rf_best= grid_search.best_estimator_

predictions_rf= rf_best.predict(X_test)

{'max_depth': 30, 'max_features': 'auto', 'min_samples_leaf': 3, 'min_samples_split': 5}


### MSE and MAE errors in the model

In [89]:
print('Mean Squared Error: {0:.3f}'.format(mean_squared_error(y_test, predictions_lr)))
print('Mean Absolute Error: {0:.3f}'.format(mean_absolute_error(y_test, predictions_lr)))

Mean Squared Error: 20969.691
Mean Absolute Error: 104.995


### Accuracy of the random forest model

In [88]:
print('Accuracy of the model: {0:.3f}'.format(r2_score(y_test, predictions_lr)))

Accuracy of the model: 0.976


# SVM Regressor

In [121]:
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
svm_regr = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.2))
#svm_regr = SVR(C=1.0, epsilon=0.2)
svm_regr.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svr', SVR(epsilon=0.2))])

In [128]:
pred = svm_regr.predict(X_test)

In [116]:
#svm_regr.score(X_scaled_test, y_test)

-111.39930063379612

In [129]:
print('Mean Squared Error: {0:.3f}'.format(mean_squared_error(y_test, pred)))
print('Mean Absolute Error: {0:.3f}'.format(mean_absolute_error(y_test, pred)))

Mean Squared Error: 62622139.371
Mean Absolute Error: 7846.688


### The accuracy score is negative which means that SVR model is not at all good for the prediction of our dataset

### Therefore we will go with the SGDRegressor and Random Forest model for our stock price prediction

In [130]:
print('Accuracy of the model: {0:.3f}'.format(r2_score(y_test, pred)))

Accuracy of the model: -69.739
