In [1]:
pip install -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
import yfinance as yf
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

In [3]:
df = pd.read_csv('SXR8.DE_data.csv')

In [4]:
print(df.head())

                        Date       Open       High        Low      Close  \
0  2010-05-19 00:00:00+02:00  92.669998  92.669998  92.669998  92.669998   
1  2010-05-20 00:00:00+02:00  92.669998  92.669998  92.669998  92.669998   
2  2010-05-21 00:00:00+02:00  92.669998  92.669998  92.669998  92.669998   
3  2010-05-24 00:00:00+02:00  92.669998  92.669998  92.669998  92.669998   
4  2010-05-25 00:00:00+02:00  92.669998  92.669998  92.669998  92.669998   

   Volume  Dividends  Stock Splits  Capital Gains  
0       0        0.0           0.0            0.0  
1       0        0.0           0.0            0.0  
2       0        0.0           0.0            0.0  
3       0        0.0           0.0            0.0  
4       0        0.0           0.0            0.0  


Features Pipeline

Implement data cleaning, transformation, and feature engineering steps that are necessary for modeling

In [7]:
# Assuming 'Date' is a column and 'Price' is the target; adjust column names as necessary
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_features.remove('Close')  # Remove the target column from the feature list

# Create pipelines for numeric preprocessing
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Fill missing values with median
    ('scaler', StandardScaler())  # Scale features
])

# Combine into a single ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
    ])

# Full pipeline: preprocessing and modeling
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

Training Pipeline

Train and evaluate models using cross-validation, including hyperparameter tuning. Choose a model based on insights from your EDA.

In [8]:
# Prepare features and target variable
X = df[numeric_features]
y = df['Close']

# Perform cross-validation
scores = cross_val_score(model_pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
print(f"Cross-validated MSE scores: {scores}")
print(f"Average MSE: {np.mean(scores)}")

# Fit the model (optional here, could fit on a specific train split instead)
model_pipeline.fit(X, y)

# Save the trained model
joblib.dump(model_pipeline, 'linear_regression_model.pkl')

Cross-validated MSE scores: [-0.00845732 -0.39770383 -0.28062101 -1.16529837 -1.54439989]
Average MSE: -0.6792960838775578


['linear_regression_model.pkl']

Inference Pipeline

Ensure that the model can receive new data, preprocess it it the same way as the trainning data, and output predictions


Chat GPT code

In [10]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
import joblib


In [12]:
# Load dataset
df = pd.read_csv('SXR8.DE_data.csv')

# Assuming 'Price' is the target variable
numeric_features = [col for col in df.columns if df[col].dtype in ['int64', 'float64'] and col != 'Price']

# Create pipelines for numeric preprocessing
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Fill missing values with median
    ('scaler', StandardScaler())  # Scale features
])

# Combine into a single ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
    ])

# Full pipeline: preprocessing and modeling
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Prepare features and target variable
X = df[numeric_features]
y = df['Close']

# Perform cross-validation
scores = cross_val_score(model_pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
print(f"Cross-validated MSE scores: {scores}")
print(f"Average MSE: {np.mean(scores)}")

# Fit the model
model_pipeline.fit(X, y)

# Save the trained model
joblib.dump(model_pipeline, 'linear_regression_model.pkl')

                        Date       Open       High        Low      Close  \
0  2010-05-19 00:00:00+02:00  92.669998  92.669998  92.669998  92.669998   
1  2010-05-20 00:00:00+02:00  92.669998  92.669998  92.669998  92.669998   
2  2010-05-21 00:00:00+02:00  92.669998  92.669998  92.669998  92.669998   
3  2010-05-24 00:00:00+02:00  92.669998  92.669998  92.669998  92.669998   
4  2010-05-25 00:00:00+02:00  92.669998  92.669998  92.669998  92.669998   

   Volume  Dividends  Stock Splits  Capital Gains  
0       0        0.0           0.0            0.0  
1       0        0.0           0.0            0.0  
2       0        0.0           0.0            0.0  
3       0        0.0           0.0            0.0  
4       0        0.0           0.0            0.0  
Cross-validated MSE scores: [-4.66273237e-27 -1.76768931e-27 -1.78648845e-27 -3.65387020e-27
 -4.84480173e-25]
Average MSE: -9.927019073063398e-26


['linear_regression_model.pkl']