In [41]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [42]:
# Load the daily NSE price history for Tata Global Beverages into a DataFrame.
df = pd.read_csv(filepath_or_buffer="NSE-Tata-Global-Beverages-Limited.csv")

In [43]:
df.shape  # dimensions of the raw dataset as (rows, columns)

(2035, 8)

In [44]:
df.describe()  # summary statistics (count, mean, std, min, quartiles, max) for each numeric column

Unnamed: 0,Open,High,Low,Last,Close,Total Trade Quantity,Turnover (Lacs)
count,2035.0,2035.0,2035.0,2035.0,2035.0,2035.0,2035.0
mean,149.713735,151.992826,147.293931,149.474251,149.45027,2335681.0,3899.980565
std,48.664509,49.413109,47.931958,48.73257,48.71204,2091778.0,4570.767877
min,81.1,82.8,80.0,81.0,80.95,39610.0,37.04
25%,120.025,122.1,118.3,120.075,120.05,1146444.0,1427.46
50%,141.5,143.4,139.6,141.1,141.25,1783456.0,2512.03
75%,157.175,159.4,155.15,156.925,156.9,2813594.0,4539.015
max,327.7,328.75,321.65,325.95,325.75,29191020.0,55755.08


In [45]:
df.head()  # preview the first five rows and the column names; rows are ordered newest date first

Unnamed: 0,Date,Open,High,Low,Last,Close,Total Trade Quantity,Turnover (Lacs)
0,2018-09-28,234.05,235.95,230.2,233.5,233.75,3069914,7162.35
1,2018-09-27,234.55,236.8,231.1,233.8,233.25,5082859,11859.95
2,2018-09-26,240.0,240.0,232.5,235.0,234.25,2240909,5248.6
3,2018-09-25,233.3,236.75,232.0,236.25,236.1,2349368,5503.9
4,2018-09-24,233.55,239.2,230.75,234.0,233.3,3423509,7999.55


In [46]:
# Drop the 'Date' column: it is not used as a feature for prediction.
# The frame is reassigned instead of mutated with inplace=True, and
# errors='ignore' is passed, so this cell is idempotent -- re-running it after
# 'Date' has already been removed no longer raises KeyError.
df = df.drop(columns=['Date'], errors='ignore')

In [47]:
# Feature matrix: every remaining column except the target 'Close'.
# NOTE(review): Open/High/Low/Last and the volume columns come from the same
# row (same trading day) as 'Close', so the model estimates the close from
# same-day data rather than forecasting it -- confirm that is the intent.
df_x = df.drop(columns=['Close'])
print(df_x)

        Open    High     Low    Last  Total Trade Quantity  Turnover (Lacs)
0     234.05  235.95  230.20  233.50               3069914          7162.35
1     234.55  236.80  231.10  233.80               5082859         11859.95
2     240.00  240.00  232.50  235.00               2240909          5248.60
3     233.30  236.75  232.00  236.25               2349368          5503.90
4     233.55  239.20  230.75  234.00               3423509          7999.55
...      ...     ...     ...     ...                   ...              ...
2030  117.60  119.50  112.00  118.80                586100           694.98
2031  120.10  121.00  117.10  117.10                658440           780.01
2032  121.80  121.95  120.25  120.35                281312           340.31
2033  120.30  122.00  120.25  120.75                293312           355.17
2034  122.10  123.00  121.05  121.10                658666           803.56

[2035 rows x 6 columns]


In [48]:
# Target vector: the closing price the model is trained to predict.
df_y = df.loc[:, 'Close']
print(df_y)

0       233.75
1       233.25
2       234.25
3       236.10
4       233.30
         ...  
2030    118.65
2031    117.60
2032    120.65
2033    120.90
2034    121.55
Name: Close, Length: 2035, dtype: float64


In [49]:
# Create an ordinary least-squares linear regression model (with an intercept term).
reg = linear_model.LinearRegression(fit_intercept=True)

In [50]:
# Hold out 33% of the rows for testing and train on the remaining 67%.
# df_x holds the independent variables (features); df_y is the dependent
# variable (target) that the model predicts.
# random_state fixes the shuffle seed so the same split is produced on every
# run; 42 is just a conventional choice of seed.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.33,
    random_state=42,
)

In [51]:
# Fit the regression model on the training features and training targets.
reg.fit(X=x_train, y=y_train)

In [52]:
# Show the learned weight for each feature column of the model.
# Sign: a positive coefficient means the prediction rises as that feature
# rises; a negative coefficient means it falls (an inverse relationship).
# Magnitude: a larger absolute value only indicates a stronger influence when
# the features share a common scale. These features are unscaled (prices are
# in the hundreds, Total Trade Quantity is in the millions), so the raw
# magnitudes are not directly comparable with each other.
feature_weights = reg.coef_
print("Feature Coefficients:", feature_weights)

Feature Coefficients: [-6.17979411e-02  9.57709124e-02  9.71104304e-02  8.68800728e-01
  1.63257605e-08 -9.12252209e-06]


In [53]:
# Predict the closing price for every row of the test set and print the predictions.
y_pred = reg.predict(X=x_test)
print(y_pred)

[118.6059575  285.55357646 147.10068038 132.0851074  131.2898135
 160.15767388 119.53484972 130.2657277  154.20188717 196.83785894
 164.55291281 138.88472358 142.5732889  264.55640102 146.70222501
 274.37635221  89.9741754  122.38005738  88.73472375 257.84749341
 140.97171252 133.35443772  85.69034761 143.17981122  98.58076237
 150.67991625 127.54976575  97.54487782 158.02205365 128.86352318
 155.73931391 136.83959078 119.55483925 153.41007759 156.13427726
 237.36394361 115.20737995  83.35680408 154.75452305 115.87665653
 144.85596935 163.62042597 157.68701734 152.06297366 142.05845343
 196.69913608 146.3575732  168.98540573 154.80443688 109.38230312
  93.18123237 147.0815683  266.53743037 102.91895127 116.07713776
 173.86358549 143.97782609 101.54002215 168.80087305 109.12510486
 156.03713511 140.06881174 275.86972027 156.27309617 144.00615432
 127.24855158 152.83290125 274.6710212  121.90614438 154.12277905
 135.87573984 100.61154325 154.81811318 162.45669761 158.11475977
 135.878696

In [54]:
# Compare the predicted and the actual closing price for the first row of the test set.
first_predicted = y_pred[0]
first_actual = y_test.iloc[0]
print("Predicted Close Price:", first_predicted)
print("Actual Close Price:", first_actual)

Predicted Close Price: 118.6059574979434
Actual Close Price: 118.35


In [55]:
# A single matching row is encouraging, but to judge the model over the whole
# test set we compute the mean squared error (MSE).
# MSE is the average of the squared residuals, where a residual is the
# difference between a predicted value (y_pred) and the actual value (y_test).
# Squaring makes every error positive and gives large errors a
# disproportionately higher weight in the final metric.
# Lower is better, but the value should be judged against the range of the
# target variable (here 'Close').
residuals = y_pred - y_test
mse = np.mean(residuals ** 2)
print("Mean Squared Error:", mse)


Mean Squared Error: 0.15578184169333248


In [56]:
# Put actual and predicted closing prices side by side, keeping the original
# row index of each test sample.
predictions = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
print(predictions)

      Actual   Predicted
611   118.35  118.605957
111   286.20  285.553576
1185  147.00  147.100680
1510  131.80  132.085107
771   131.00  131.289814
...      ...         ...
1391  142.05  142.466086
834   145.95  146.045082
118   275.50  275.761520
1269  141.90  141.836845
1265  147.65  147.878799

[672 rows x 2 columns]
