### Tesla Stock Price Prediction by regression

In [42]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error , r2_score ,roc_curve, auc ,classification_report
from sklearn.pipeline import Pipeline  
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Lasso ,Ridge ,LassoCV,RidgeCV , LinearRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import PolynomialFeatures

from matplotlib import pyplot as plt 
import seaborn as sns

In [44]:
df = pd.read_csv("Tesla stock  price prediction.csv")
print(df.shape)
df.head()

(1692, 7)


Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,6/29/2010,19.0,25.0,17.540001,23.889999,18766300,23.889999
1,6/30/2010,25.790001,30.42,23.299999,23.83,17187100,23.83
2,7/1/2010,25.0,25.92,20.27,21.959999,8218800,21.959999
3,7/2/2010,23.0,23.1,18.709999,19.200001,5139800,19.200001
4,7/6/2010,20.0,20.0,15.83,16.110001,6866900,16.110001


In [45]:
df['Date'] = pd.to_datetime(df['Date'],format='%m/%d/%Y')
df.dtypes

Date         datetime64[ns]
Open                float64
High                float64
Low                 float64
Close               float64
Volume                int64
Adj Close           float64
dtype: object

If we observe carefully we can see that the data in the 'Close' column and that available in the 'Adj Close' column is the same let's check whether this is the case with each row or not.

In [46]:
df[df['Close'] == df['Adj Close']].shape

(1692, 7)

In [47]:
df.drop('Adj Close',axis=1,inplace=True)

In [48]:
df.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume'], dtype='object')

### Feature Engineering

In [49]:
df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month
df['day'] = df['Date'].dt.day
df['Isweekend'] = (df['Date'].dt.dayofweek >=5).astype(int)

In [50]:
df['past_3_days_avg_price'] = df['Close'].rolling(window=3).mean()

# the average of the past 3 days including the current day.

# df['target'] = df['Close'].shift(1).rolling(window=3).mean()
# the average of the 3 days before today (i.e., excluding today)

In [51]:
df['is_quarter_end'] = np.where(df['month']%3 ==0,  1 , 0)

# The quarter end months are March, June, Sept, and Dec ie; 3, 6, 9, 12 
# ie; last months in 4 quarters as financial year is divided into 4 quarters


In [52]:
df.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'year', 'month',
       'day', 'Isweekend', 'past_3_days_avg_price', 'is_quarter_end'],
      dtype='object')

In [53]:
df.groupby('is_quarter_end')[['Open', 'High', 'Low', 'Close', 'Volume']].mean()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
is_quarter_end,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,130.813739,133.18262,128.257229,130.797709,4461581.0
1,135.679982,137.927032,133.455777,135.673269,3891084.0


Here are some of the important observations of the above-grouped data:
- Prices are higher in the months which are quarter end as compared to that of the non-quarter end months.
- The volume of trades is lower [Less trading(ie;buying or selling)activity overall ] in the months which are quarter end.

In [54]:
df['target'] = df['Close'].shift(-1)    # Target (predict next day's close)

#This sets the next day's closing price as the target 
# (i.e., we're trying to predict the closing price for the next day).


In [55]:
df = df.dropna().reset_index(drop='index')
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,year,month,day,Isweekend,past_3_days_avg_price,is_quarter_end,target
0,2010-07-01,25.0,25.92,20.27,21.959999,8218800,2010,7,1,0,23.226666,0,19.200001
1,2010-07-02,23.0,23.1,18.709999,19.200001,5139800,2010,7,2,0,21.663333,0,16.110001
2,2010-07-06,20.0,20.0,15.83,16.110001,6866900,2010,7,6,0,19.09,0,15.8
3,2010-07-07,16.4,16.629999,14.98,15.8,6921700,2010,7,7,0,17.036667,0,17.459999
4,2010-07-08,16.139999,17.52,15.57,17.459999,7711400,2010,7,8,0,16.456667,0,17.4


Data Splitting and Normalization

In [56]:
X = df[['Open', 'High','Low','Volume','is_quarter_end','past_3_days_avg_price']]
y = df['Close'].shift(-1)

X_train ,X_test, y_train, y_test =train_test_split(X , y, test_size=0.20,random_state=42)

In [57]:
np.isnan(y_train).sum() , np.isinf(y_train).sum() , np.isnan(y_test).sum() , np.isinf(y_test).sum() 

(1, 0, 0, 0)

In [58]:
# If NaNs are found in y_train, you can remove or impute:

y_train = np.nan_to_num(y_train)

In [59]:
np.isnan(y_train).sum()

0

In [60]:
# normalization or scaling

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [61]:
np.isnan(X_train_scaled).sum() , np.isinf(X_train_scaled).sum() , np.isnan(X_test_scaled).sum() , np.isinf(X_test_scaled).sum() 

(0, 0, 0, 0)

In [62]:
X_train.shape   , X_test.shape

((1351, 6), (338, 6))

### Model Development and Evaluation

In [63]:
poly = PolynomialFeatures()
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

In [64]:
models = [
    ( "Linear Regression" , 
       {"n_jobs":6 },
       LinearRegression(),
       (X_train_poly, y_train),
       (X_test_poly, y_test)
    ),

    (
        "Random Forest Regressor",
        {'n_estimators':100, "random_state" : 42},
        RandomForestRegressor(),
        (X_train_scaled,y_train),
        (X_test_scaled,y_test)
    ),

    (
        "XGB Regressor",
        {'n_estimators':100, "random_state" : 42},
        XGBRegressor(),
        (X_train_scaled,y_train),
        (X_test_scaled,y_test)
    ),

    (
        "Lasso (L1 Regulization)",
        {"alpha": 0.01, "random_state" : 42},
        Lasso(),
        (X_train_scaled,y_train),
        (X_test_scaled,y_test)
    ),

    (
        "Ridge (L2 Regulization)",
        {"alpha": 0.01, "random_state" : 42},
        Ridge(),
        (X_train_scaled,y_train),
        (X_test_scaled,y_test)
    )
]

In [65]:
reports = []

for model_name , model_params ,model, train_set ,test_set in models:
    X_train1  = train_set[0]
    y_train1 = train_set[1]
    X_test1 = test_set[0]
    y_test1 = test_set[1]

    model.set_params(**model_params)

    model.fit(X_train1 , y_train1)

    y_pred_ml = model.predict(X_test1)
    
    mse = mean_squared_error(y_test1, y_pred_ml)
    accuracy = r2_score(y_test1, y_pred_ml)
    reports.append({
        'model': model_name,
        'mse': mse,
        'r2_score': accuracy
    })
reports

[{'model': 'Linear Regression',
  'mse': 179.4203896358438,
  'r2_score': 0.9802233361899988},
 {'model': 'Random Forest Regressor',
  'mse': 21.791421997093522,
  'r2_score': 0.9975980342721745},
 {'model': 'XGB Regressor',
  'mse': 50.808700458185314,
  'r2_score': 0.9943995964470704},
 {'model': 'Lasso (L1 Regulization)',
  'mse': 23.999010166366826,
  'r2_score': 0.9973547022342536},
 {'model': 'Ridge (L2 Regulization)',
  'mse': 18.591685916985277,
  'r2_score': 0.9979507260975878}]

In [70]:
reports[2]['mse']

50.808700458185314

Among the three models, we have trained Ridge(L2) regularization  has the highest performance & with least MSE

In [66]:
import mlflow
import mlflow.sklearn
import mlflow.xgboost

In [93]:
for i, element in enumerate(models):
    print(i)
    print(element[0])
    print(element[1])
    print(element[2])
    print(reports[i]['mse'])
    print()

0
Linear Regression
{'n_jobs': 6}
LinearRegression(n_jobs=6)
179.4203896358438

1
Random Forest Regressor
{'n_estimators': 100, 'random_state': 42}
RandomForestRegressor(random_state=42)
21.791421997093522

2
XGB Regressor
{'n_estimators': 100, 'random_state': 42}
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=100,
             n_jobs=None, num_parallel_tr

### Initialize MLflow

In [None]:
mlflow.set_experiment("Stock_price_prediction_regression")
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000/")


for i, element in enumerate(models):
    model_name = element[0]
    params = element[1]
    model = element[2]
    report = reports[i]

    with mlflow.start_run(run_name=model_name):
        mlflow.log_params(params)
        mlflow.log_metrics({'MSE':report['mse'],
                            'accuracy':report['r2_score']
                          })
    
        if "XGB Regressor" in model_name:
            mlflow.xgboost.log_model(model , "Model")
        else:
            mlflow.sklearn.log_model(model , "Model")



🏃 View run Linear Regression at: http://127.0.0.1:5000/#/experiments/107442711887276123/runs/3a014f48543d41e8b138d1dda9869bbc
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/107442711887276123




🏃 View run Random Forest Regressor at: http://127.0.0.1:5000/#/experiments/107442711887276123/runs/8eac70c23e544229b558bc2dc1eafa6f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/107442711887276123




🏃 View run XGB Regressor at: http://127.0.0.1:5000/#/experiments/107442711887276123/runs/4413ee77f1614eb38516370b2623c4c5
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/107442711887276123




🏃 View run Lasso (L1 Regulization) at: http://127.0.0.1:5000/#/experiments/107442711887276123/runs/ca3ee7f697f24b3fab1e55dc5dc965f2
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/107442711887276123




🏃 View run Ridge (L2 Regulization) at: http://127.0.0.1:5000/#/experiments/107442711887276123/runs/63a2aef84e1140b4a949fd36ec8212fe
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/107442711887276123


### Register the model

In [97]:
selected_model_name = "Ridge (L2 Regularization)"
run_id = input("Enter runid")
model_uri = f'runs:/{run_id}/model'

with mlflow.start_run(run_id=run_id):
    mlflow.register_model(model_uri=model_uri , name=selected_model_name)

Registered model 'Ridge (L2 Regularization)' already exists. Creating a new version of this model...
2025/05/20 19:07:08 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Ridge (L2 Regularization), version 2


🏃 View run Ridge (L2 Regulization) at: http://127.0.0.1:5000/#/experiments/107442711887276123/runs/63a2aef84e1140b4a949fd36ec8212fe
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/107442711887276123


Created version '2' of model 'Ridge (L2 Regularization)'.


### Load the Model & run it for prediction values

In [98]:
import mlflow.sklearn


model_name = "Ridge (L2 Regularization)"
model_version = 2
model_uri = f'models:/{model_name}/{model_version}'

loaded_model = mlflow.sklearn.load_model(model_uri)
y_pred = loaded_model.predict(X_test1)
y_pred[:5]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

array([207.05447014, 218.99462214, 108.50629188, 183.57006918,
       113.29870888])