In [17]:
import pandas as pd
import numpy as np
import datetime
import yfinance as yf
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('../data/historical-data/TATAMOTORS.csv')
data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,1995-12-25,53.472778,53.472778,53.472778,53.472778,0,0.0,0.0
1,1995-12-26,53.472778,53.472778,53.472778,53.472778,0,0.0,0.0
2,1995-12-27,53.472778,53.472778,53.472778,53.472778,0,0.0,0.0
3,1995-12-28,53.472778,53.472778,53.472778,53.472778,0,0.0,0.0
4,1995-12-29,53.472778,53.472778,53.472778,53.472778,0,0.0,0.0


In [3]:
# Extract the closing price as the target variable and create a new DataFrame with only the closing price
target_variable = 'Close'
close_prices = pd.DataFrame(data[target_variable], columns=[target_variable])

In [4]:
close_prices

Unnamed: 0,Close
0,53.472778
1,53.472778
2,53.472778
3,53.472778
4,53.472778
...,...
6831,439.899994
6832,443.000000
6833,436.500000
6834,429.450012


In [5]:
# Create a new DataFrame with the previous 5 days' closing prices as features and the target variable as the next day's closing price
window_size = 5
for i in range(1, window_size + 1):
    close_prices['lag{}'.format(i)] = close_prices[target_variable].shift(i)
close_prices.dropna(inplace=True)

In [6]:
close_prices

Unnamed: 0,Close,lag1,lag2,lag3,lag4,lag5
5,53.472778,53.472778,53.472778,53.472778,53.472778,53.472778
6,53.163197,53.472778,53.472778,53.472778,53.472778,53.472778
7,52.987293,53.163197,53.472778,53.472778,53.472778,53.472778
8,52.586262,52.987293,53.163197,53.472778,53.472778,53.472778
9,52.396286,52.586262,52.987293,53.163197,53.472778,53.472778
...,...,...,...,...,...,...
6831,439.899994,441.600006,444.149994,440.549988,441.049988,445.850006
6832,443.000000,439.899994,441.600006,444.149994,440.549988,441.049988
6833,436.500000,443.000000,439.899994,441.600006,444.149994,440.549988
6834,429.450012,436.500000,443.000000,439.899994,441.600006,444.149994


In [7]:
# Split the data into a training set and a testing set. We'll use the first 80% of the data for training and the remaining 20% for testing
train_size = int(len(close_prices) * 0.8)
train_data = close_prices[:train_size]
test_data = close_prices[train_size:]

In [8]:
# Extract the features and target variable from the training and testing data
X_train = train_data.iloc[:, 1:]
y_train = train_data[target_variable]
X_test = test_data.iloc[:, 1:]
y_test = test_data[target_variable]

In [9]:
# Create the Linear Regression model and fit it to the training data
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

In [10]:
# Make predictions on the testing data and evaluate the accuracy of the model
predictions = model.predict(X_test)
accuracy = model.score(X_test, y_test)
print('Accuracy:', accuracy)

Accuracy: 0.9961802229027109


In [11]:
# Load new data for the next day and extract the previous 5 days' closing prices
new_data = data.iloc[-100:-10]
previous_close_prices = new_data[target_variable][-window_size:]

# Make a prediction for the next day's closing price using the Linear Regression model and the previous 5 days' closing prices
next_day_price_prediction = model.predict([previous_close_prices])
print('Predicted price for next day:', next_day_price_prediction[0])

Predicted price for next day: 445.15984902119




In [12]:
def predict(data):
    target_variable = 'Close'
    window_size = 5
    prevClose = data[target_variable][-window_size:]
    
    pred = model.predict([prevClose])
    return pred[0]

In [22]:
stock = yf.Ticker("TATAMOTORS.NS")
df = stock.history(start=datetime.date.today() - datetime.timedelta(days=6),
                    end=datetime.date.today()
)
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-02-27,427.75,428.049988,413.049988,417.950012,10438928,0,0
2023-02-28,419.0,424.799988,419.0,420.700012,9652088,0,0
2023-03-01,421.5,428.0,421.5,426.0,7595114,0,0
2023-03-02,426.0,426.25,419.549988,420.450012,5661103,0,0
2023-03-03,422.600006,430.5,421.950012,428.0,8027522,0,0


In [30]:
df = stock.history(start=datetime.date.today() - datetime.timedelta(days=8),
                    end=datetime.date.today() - datetime.timedelta(days=1)
)
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-02-24,435.399994,436.700012,427.0,427.75,7895024,0,0
2023-02-27,427.75,428.049988,413.049988,417.950012,10438928,0,0
2023-02-28,419.0,424.799988,419.0,420.700012,9652088,0,0
2023-03-01,421.5,428.0,421.5,426.0,7595114,0,0
2023-03-02,426.0,426.25,419.549988,420.450012,5661103,0,0


In [31]:
print(predict(df))

428.1013686147329




In [35]:
import joblib

joblib.dump(model, '../models/historical-data-TATAMOTORS-NS.h5')

['../models/historical-data-TATAMOTORS-NS.h5']