1. Data Loading 

In [106]:
import pandas as pd

# Read the raw price table from disk; the bare `df` below renders it as the cell output.
df = pd.read_csv(filepath_or_buffer='stock_data.csv')
df

Unnamed: 0.1,Unnamed: 0,Stock_1,Stock_2,Stock_3,Stock_4,Stock_5
0,2020-01-01,101.764052,100.160928,99.494642,99.909756,101.761266
1,2020-01-02,102.171269,99.969968,98.682973,100.640755,102.528643
2,2020-01-03,103.171258,99.575237,98.182139,100.574847,101.887811
3,2020-01-04,105.483215,99.308641,97.149381,100.925017,101.490049
4,2020-01-05,107.453175,98.188428,99.575396,101.594411,101.604283
...,...,...,...,...,...,...
360,2020-12-26,92.684784,63.408103,98.288992,117.788079,102.995720
361,2020-12-27,92.688279,62.816639,98.061845,116.605106,102.718260
362,2020-12-28,93.551993,63.597651,96.454800,115.441164,103.566068
363,2020-12-29,93.870037,64.114492,95.747485,113.856107,103.257107


2. Data Preprocessing

In [107]:
# 1--> Rename & Convert Date Column
import pandas as pd

# The date column is loaded under the placeholder name "Unnamed: 0":
# relabel it as "Date" and parse its values to datetime in one chain.
df = (
    df.rename(columns={"Unnamed: 0": "Date"})
      .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
)

# Display only: the frame returned by set_index() is not assigned back,
# so `df` itself keeps "Date" as an ordinary column.
df.set_index("Date")




Unnamed: 0_level_0,Stock_1,Stock_2,Stock_3,Stock_4,Stock_5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-01,101.764052,100.160928,99.494642,99.909756,101.761266
2020-01-02,102.171269,99.969968,98.682973,100.640755,102.528643
2020-01-03,103.171258,99.575237,98.182139,100.574847,101.887811
2020-01-04,105.483215,99.308641,97.149381,100.925017,101.490049
2020-01-05,107.453175,98.188428,99.575396,101.594411,101.604283
...,...,...,...,...,...
2020-12-26,92.684784,63.408103,98.288992,117.788079,102.995720
2020-12-27,92.688279,62.816639,98.061845,116.605106,102.718260
2020-12-28,93.551993,63.597651,96.454800,115.441164,103.566068
2020-12-29,93.870037,64.114492,95.747485,113.856107,103.257107


In [108]:
# 2--> Handle Missing Values
# Show the per-column missing-value counts. A bare expression in the middle
# of a cell is not rendered, so it is passed to display() explicitly.
display(df.isnull().sum())

# Forward-fill gaps with the last observed value. ffill() returns a new frame,
# so the result has to be assigned back for the fill to take effect; it also
# replaces the deprecated fillna(method="ffill") spelling.
df = df.ffill()
df

  df.fillna(method="ffill")


Unnamed: 0,Date,Stock_1,Stock_2,Stock_3,Stock_4,Stock_5
0,2020-01-01,101.764052,100.160928,99.494642,99.909756,101.761266
1,2020-01-02,102.171269,99.969968,98.682973,100.640755,102.528643
2,2020-01-03,103.171258,99.575237,98.182139,100.574847,101.887811
3,2020-01-04,105.483215,99.308641,97.149381,100.925017,101.490049
4,2020-01-05,107.453175,98.188428,99.575396,101.594411,101.604283
...,...,...,...,...,...,...
360,2020-12-26,92.684784,63.408103,98.288992,117.788079,102.995720
361,2020-12-27,92.688279,62.816639,98.061845,116.605106,102.718260
362,2020-12-28,93.551993,63.597651,96.454800,115.441164,103.566068
363,2020-12-29,93.870037,64.114492,95.747485,113.856107,103.257107


In [109]:
#3--> Outlier Detection & Treatment

import numpy as np

def cap_outliers(series):
    """Clip a Series to the Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR].

    Parameters
    ----------
    series : pandas.Series
        Values to cap.

    Returns
    -------
    pandas.Series
        Same index as the input; values outside the fences are replaced by
        the nearest fence, all other values are unchanged.
    """
    # Both quartiles in one call; unpacking iterates the two resulting values.
    first_q, third_q = series.quantile([0.25, 0.75])
    whisker = 1.5 * (third_q - first_q)
    return series.clip(lower=first_q - whisker, upper=third_q + whisker)

# Cap only the numeric columns. Applying cap_outliers to the whole frame would
# also run the IQR clip on non-numeric columns such as "Date".
numeric_cols = df.select_dtypes(include="number").columns
df = df.copy()  # leave the frame produced by the previous cell untouched
df[numeric_cols] = df[numeric_cols].apply(cap_outliers)

df


Unnamed: 0,Date,Stock_1,Stock_2,Stock_3,Stock_4,Stock_5
0,2020-01-01,101.764052,100.160928,99.494642,99.909756,101.761266
1,2020-01-02,102.171269,99.969968,98.682973,100.640755,102.528643
2,2020-01-03,103.171258,99.575237,98.182139,100.574847,101.887811
3,2020-01-04,105.483215,99.308641,97.149381,100.925017,101.490049
4,2020-01-05,107.453175,98.188428,99.575396,101.594411,101.604283
...,...,...,...,...,...,...
360,2020-12-26,92.684784,63.408103,98.288992,117.788079,102.995720
361,2020-12-27,92.688279,62.816639,98.061845,116.605106,102.718260
362,2020-12-28,93.551993,63.597651,96.454800,115.441164,103.566068
363,2020-12-29,93.870037,64.114492,95.747485,113.856107,103.257107


In [110]:
# 4--> Feature Engineering
# Row-over-row percentage change of every numeric column, each labelled
# "<column>_return". The first row has no predecessor, so its returns are NaN.
returns = (
    df.select_dtypes(include=['number'])
      .pct_change()
      .rename(columns=lambda name: f"{name}_return")
)

# Append the return columns to the right of the existing ones (aligned on the index).
df = pd.concat(objs=[df, returns], axis="columns")
df.head()


Unnamed: 0,Date,Stock_1,Stock_2,Stock_3,Stock_4,Stock_5,Stock_1_return,Stock_2_return,Stock_3_return,Stock_4_return,Stock_5_return
0,2020-01-01,101.764052,100.160928,99.494642,99.909756,101.761266,,,,,
1,2020-01-02,102.171269,99.969968,98.682973,100.640755,102.528643,0.004002,-0.001907,-0.008158,0.007317,0.007541
2,2020-01-03,103.171258,99.575237,98.182139,100.574847,101.887811,0.009787,-0.003948,-0.005075,-0.000655,-0.00625
3,2020-01-04,105.483215,99.308641,97.149381,100.925017,101.490049,0.022409,-0.002677,-0.010519,0.003482,-0.003904
4,2020-01-05,107.453175,98.188428,99.575396,101.594411,101.604283,0.018676,-0.01128,0.024972,0.006633,0.001126


In [111]:
# 5--> Create next-day target for Stock_1
# shift(-1) moves each Stock_1 value up one row, so "Stock_1_next" holds the
# following row's price (a forward shift, i.e. the prediction target).
# dropna() then discards every row containing a NaN: the first row (no
# returns) and the last row (no following price).
df = df.assign(Stock_1_next=df["Stock_1"].shift(-1)).dropna()
df


Unnamed: 0,Date,Stock_1,Stock_2,Stock_3,Stock_4,Stock_5,Stock_1_return,Stock_2_return,Stock_3_return,Stock_4_return,Stock_5_return,Stock_1_next
1,2020-01-02,102.171269,99.969968,98.682973,100.640755,102.528643,0.004002,-0.001907,-0.008158,0.007317,0.007541,103.171258
2,2020-01-03,103.171258,99.575237,98.182139,100.574847,101.887811,0.009787,-0.003948,-0.005075,-0.000655,-0.006250,105.483215
3,2020-01-04,105.483215,99.308641,97.149381,100.925017,101.490049,0.022409,-0.002677,-0.010519,0.003482,-0.003904,107.453175
4,2020-01-05,107.453175,98.188428,99.575396,101.594411,101.604283,0.018676,-0.011280,0.024972,0.006633,0.001126,106.403059
5,2020-01-06,106.403059,98.463789,97.339608,100.472182,100.938222,-0.009773,0.002804,-0.022453,-0.011046,-0.006555,107.413982
...,...,...,...,...,...,...,...,...,...,...,...,...
359,2020-12-25,92.041911,63.975683,99.160775,118.698143,103.799434,0.006204,-0.016018,0.000518,0.000015,-0.008039,92.684784
360,2020-12-26,92.684784,63.408103,98.288992,117.788079,102.995720,0.006985,-0.008872,-0.008792,-0.007667,-0.007743,92.688279
361,2020-12-27,92.688279,62.816639,98.061845,116.605106,102.718260,0.000038,-0.009328,-0.002311,-0.010043,-0.002694,93.551993
362,2020-12-28,93.551993,63.597651,96.454800,115.441164,103.566068,0.009318,0.012433,-0.016388,-0.009982,0.008254,93.870037


In [112]:
# 6--> Feature Scaling
from sklearn.preprocessing import StandardScaler

# Target is the next-day Stock_1 price; features are all other numeric columns.
y = df["Stock_1_next"]
X = df.select_dtypes(include=['number']).drop(columns=[y.name])

# Standardise the features, written as explicit fit and transform steps,
# and wrap the result back into a labelled DataFrame.
# NOTE(review): the scaler here is fit on all rows; the later cells shown in
# this notebook pass the unscaled X into their own pipelines, so X_scaled
# appears to be used for this preview only.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = pd.DataFrame(scaler.transform(X), index=X.index, columns=X.columns)

print("Scaled features preview:")
display(X_scaled.head())


Scaled features preview:


Unnamed: 0,Stock_1,Stock_2,Stock_3,Stock_4,Stock_5,Stock_1_return,Stock_2_return,Stock_3_return,Stock_4_return,Stock_5_return
1,-0.767939,1.659205,0.641997,-2.503999,-1.164314,0.422895,-0.072136,-0.863065,0.700634,0.75762
2,-0.632171,1.6245,0.565189,-2.513806,-1.335255,1.008914,-0.274021,-0.534436,-0.10787,-0.644411
3,-0.318278,1.60106,0.406806,-2.461702,-1.441357,2.287295,-0.148344,-1.114742,0.311681,-0.405879
4,-0.050818,1.50257,0.778858,-2.362097,-1.410885,1.90916,-0.998884,2.668677,0.631259,0.105426
5,-0.193391,1.52678,0.435979,-2.529083,-1.588555,-0.972249,0.393626,-2.386984,-1.161802,-0.675435


3. Pipeline Creation

In [113]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# Chronological hold-out: with shuffle=False the last 20% of rows form the test set.
# NOTE(review): random_state is kept as written, but should have no effect
# when shuffle=False - confirm against the scikit-learn docs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=False,
    random_state=42,
    test_size=0.2,
)

# Two-step pipeline: standardise the features, then ordinary least squares.
pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", LinearRegression()),
])

# fit() returns the fitted pipeline, so training and scoring are chained.
score = pipeline.fit(X_train, y_train).score(X_test, y_test)
# NOTE(review): "RÂ²" looks like a mis-encoded "R²" - confirm the file encoding.
print("RÂ² score on test set:", score)


RÂ² score on test set: 0.9588865415714747


4. Primary Model Selection

I chose Linear Regression as the primary model.

It works well with numerical stock data.

It is easy to understand and interpret.

It is fast to train and efficient.

It is a good baseline model for stock price prediction.

In [114]:
from sklearn.linear_model import LinearRegression

# Instantiate an unfitted LinearRegression with default settings and show it as the cell output.
# NOTE(review): the pipelines in the other cells shown here build their own
# LinearRegression(); primary_model itself is not referenced by them.
primary_model = LinearRegression()
primary_model

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


5. Model Training

In [115]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# Split data (keep time order -> no shuffling for stock data):
# the first 80% of rows train the model, the last 20% are held out.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=False, test_size=0.2
)

# Build and train the pipeline: standardise features, then linear regression.
pipeline = Pipeline(steps=[("scaler", StandardScaler()), ("model", LinearRegression())])
pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the training rows first, then on the held-out rows.
train_score, test_score = (
    pipeline.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

print("Training RÂ²:", train_score)
print("Test RÂ²:", test_score)


Training RÂ²: 0.9741407419031873
Test RÂ²: 0.9588865415714747


6. Cross-Validation

In [116]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

# Five time-ordered folds; the same splitter object is reused by the grid search cell.
tscv = TimeSeriesSplit(n_splits=5)

# One R² value per fold, computed on the training portion only.
cv_scores = cross_val_score(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    scoring="r2",
    cv=tscv,
)

print("Cross-validation scores:", cv_scores)
print("Mean CV Score:", cv_scores.mean())
print("Std Dev:", cv_scores.std())


Cross-validation scores: [0.06305562 0.92263153 0.9288379  0.69798804 0.85082968]
Mean CV Score: 0.6926685551491818
Std Dev: 0.3256116135104127


7. Hyperparameter Tuning

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Same preprocessing as the linear pipeline, with Ridge as the final estimator.
ridge_pipeline = Pipeline(steps=[("scaler", StandardScaler()), ("model", Ridge())])

# Candidate regularisation strengths; the "model__" prefix routes alpha to the Ridge step.
param_grid = dict(model__alpha=[0.01, 0.1, 1, 10, 100])

# Evaluate every alpha on the time-series folds (tscv) using R².
grid_search = GridSearchCV(
    estimator=ridge_pipeline,
    param_grid=param_grid,
    scoring="r2",
    cv=tscv,
)
grid_search.fit(X_train, y_train)

print("Tested Parameters:", grid_search.cv_results_["params"])
print("Best Parameter:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)


Tested Parameters: [{'model__alpha': 0.01}, {'model__alpha': 0.1}, {'model__alpha': 1}, {'model__alpha': 10}, {'model__alpha': 100}]
Best Parameter: {'model__alpha': 1}
Best CV Score: 0.6990702695447439


In [118]:
# grid_search was built without a `refit` argument, so GridSearchCV's default
# (refit=True) has already refit the best pipeline on all of X_train.
# best_estimator_ is therefore ready to use; fitting it again on the same
# data would only repeat that work.
best_model = grid_search.best_estimator_

# R² of the tuned pipeline on the held-out rows.
final_test_score = best_model.score(X_test, y_test)
print("Final Tuned Test RÂ²:", final_test_score)


Final Tuned Test RÂ²: 0.9588551826076634


8. Best Model Selection

In [119]:
# The pipeline chosen by the grid search becomes the final model.
final_model = grid_search.best_estimator_

# Each print emits a heading line followed by the value on its own line.
print("Final Selected Model:", final_model, sep="\n")
print("\nBest Hyperparameters Found:", grid_search.best_params_, sep="\n")
print("\nBest Cross-Validation Score:", grid_search.best_score_, sep="\n")


Final Selected Model:
Pipeline(steps=[('scaler', StandardScaler()), ('model', Ridge(alpha=1))])

Best Hyperparameters Found:
{'model__alpha': 1}

Best Cross-Validation Score:
0.6990702695447439


9. Model Performance Evaluation

In [120]:
import numpy as np
from sklearn.metrics import (
    r2_score, 
    mean_squared_error, 
    mean_absolute_error
)

# Predict the held-out rows with the final model.
y_pred = final_model.predict(X_test)

# Error metrics on the test set; RMSE is the square root of MSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# One print call, one report line per argument.
print(
    "----- Test Set Performance -----",
    f"RÂ² Score: {r2:.4f}",
    f"MSE: {mse:.4f}",
    f"RMSE: {rmse:.4f}",
    f"MAE: {mae:.4f}",
    sep="\n",
)


----- Test Set Performance -----
RÂ² Score: 0.9589
MSE: 0.9082
RMSE: 0.9530
MAE: 0.7852


In [121]:
# Mean absolute percentage error: average of |actual - predicted| / actual, as a percentage.
mape = ((y_test - y_pred) / y_test).abs().mean() * 100
print(f"MAPE: {mape:.2f}%")


MAPE: 0.79%


10. Web Interface with Gradio

In [122]:
# Save your trained model
import joblib

# Serialise the final pipeline to disk so the app cells can load it back.
joblib.dump(value=final_model, filename="stock_model.pkl")
print("Model saved as stock_model.pkl")


Model saved as stock_model.pkl


In [123]:
#Create the Gradio App
import gradio as gr
import joblib
import numpy as np
import pandas as pd

# Load trained model
model = joblib.load("stock_model.pkl")

# Take the feature names from the loaded pipeline itself instead of the
# notebook variable X, so this cell also works in a fresh kernel where only
# the saved model is available (same source as the deployment script below).
feature_names = list(model.feature_names_in_)

def predict_stock(*inputs):
    """
    Gradio prediction function.

    Parameters
    ----------
    *inputs
        One value per entry of ``feature_names``, in the same order.

    Returns
    -------
    float
        The model's prediction for the single input row.

    Raises
    ------
    gr.Error
        If any input field was left empty (received as ``None``).
    """
    # An empty Number field is passed in as None; report it by name instead of
    # letting the model fail on a missing value.
    missing = [str(name) for name, value in zip(feature_names, inputs) if value is None]
    if missing:
        raise gr.Error("Please enter a value for: " + ", ".join(missing))

    # Convert inputs to a one-row DataFrame with the column names the model expects
    input_df = pd.DataFrame([inputs], columns=feature_names)

    # Make prediction
    prediction = model.predict(input_df)[0]

    return float(prediction)

# Build the form: one numeric input per feature, one numeric output for the prediction.
iface = gr.Interface(
    title="ðŸ“ˆ Stock Price Prediction App",
    description="Enter today's stock features to predict tomorrow's Stock_1 price.",
    fn=predict_stock,
    inputs=[gr.Number(label=column) for column in feature_names],
    outputs=gr.Number(label="Predicted Next-Day Stock_1 Price"),
)

# Start the app.
iface.launch()


ModuleNotFoundError: No module named 'gradio'

In [None]:
# Same predictor as a Blocks layout; components are created in this order:
# heading, one Number per feature, the button, then the output field.
with gr.Blocks() as app:
    gr.Markdown("##  Stock Price Prediction")

    inputs = [gr.Number(label=column) for column in feature_names]
    predict_btn = gr.Button(value="Predict Price")
    output = gr.Number(label="Predicted Next-Day Stock_1 Price")

    # Clicking the button sends the input values to predict_stock and writes
    # the returned number into the output field.
    predict_btn.click(predict_stock, inputs, output)

app.launch()


11. Deployment to Hugging Face Spaces

In [None]:
import gradio as gr
import joblib
import pandas as pd

# Restore the saved pipeline from disk.
model = joblib.load(filename="stock_model.pkl")

# Input column names as exposed by the loaded model (must match the training features).
feature_names = model.feature_names_in_

def predict_stock(*inputs):
    """Predict from one row of feature values.

    Parameters
    ----------
    *inputs
        One value per entry of ``feature_names``, in the same order.

    Returns
    -------
    float
        The model's prediction for the single input row.

    Raises
    ------
    gr.Error
        If any input field was left empty (received as ``None``).
    """
    # An empty Number field is passed in as None; report it by name instead of
    # letting the model fail on a missing value.
    missing = [str(name) for name, value in zip(feature_names, inputs) if value is None]
    if missing:
        raise gr.Error("Please enter a value for: " + ", ".join(missing))

    # One-row DataFrame with the column names the model expects.
    input_df = pd.DataFrame([inputs], columns=feature_names)
    prediction = model.predict(input_df)[0]
    return float(prediction)

# App layout; components are created in this order: heading, one Number per
# feature, the output field, then the button.
with gr.Blocks() as app:
    gr.Markdown("## ðŸ“ˆ Stock Price Prediction App")

    inputs = [gr.Number(label=column) for column in feature_names]
    output = gr.Number(label="Predicted Next-Day Stock_1 Price")

    # The button's click handler feeds the inputs to predict_stock and shows
    # the returned number in `output`.
    gr.Button(value="Predict").click(predict_stock, inputs, output)

app.launch()


In [None]:
# The bare package names are not runnable Python (evaluating `gradio` raises
# NameError). NOTE(review): they look like the requirements.txt contents for
# the Space, so the cell now writes that file - confirm this was the intent.
from pathlib import Path

REQUIREMENTS = ["gradio", "pandas", "scikit-learn", "joblib"]
Path("requirements.txt").write_text("\n".join(REQUIREMENTS) + "\n", encoding="utf-8");
