# Notebook for "simpler" Machine Learning approaches

This notebook takes the preprocessed data and applies and evaluates different machine learning techniques for predicting future take rates.

In [1]:
import pandas as pd

## Imort dataframe prepared for machine learning (MLbase_DataFrame)

In [34]:
df = pd.read_csv('../CLAAS_data/MLbase_DataFrame.csv')

In [35]:
df

Unnamed: 0,Datum,Merkmal,Merkmalwert,Bestätigte Menge,Relativer Anteil,USTR10Y,WeizenSpot,CornSpot,GER10Y,WtiOilSpot,SoySpot,AgriSpot,Jahr,Monat,Monat_sin,Monat_cos
0,2014-01-01,B10,30,0,0.000000,3.006,197.738,187.924,1.941,98.04,246.732,200.399,2014,1,5.000000e-01,8.660254e-01
1,2014-02-01,B10,30,0,0.000000,2.582,182.795,190.254,1.647,96.43,246.293,201.583,2014,2,8.660254e-01,5.000000e-01
2,2014-03-01,B10,30,0,0.000000,2.608,204.039,200.847,1.563,104.92,269.561,221.079,2014,3,1.000000e+00,6.123234e-17
3,2014-04-01,B10,30,48,0.125654,2.755,222.170,215.466,1.581,99.74,281.258,233.772,2014,4,8.660254e-01,-5.000000e-01
4,2014-05-01,B10,30,42,0.156134,2.613,241.438,212.924,1.470,99.42,242.195,236.510,2014,5,5.000000e-01,-8.660254e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17552,2023-10-01,P02,2078,0,0.000000,4.687,198.708,217.797,2.911,88.82,260.098,234.182,2023,10,-8.660254e-01,5.000000e-01
17553,2023-11-01,P02,2078,10,0.033333,4.761,196.365,214.936,2.758,81.04,263.268,234.431,2023,11,-5.000000e-01,8.660254e-01
17554,2023-12-01,P02,2078,1,0.003846,4.220,199.354,215.360,2.363,73.72,266.732,232.888,2023,12,-2.449294e-16,1.000000e+00
17555,2024-01-01,P02,2078,0,0.000000,3.866,206.624,210.593,2.029,71.89,256.195,228.351,2024,1,5.000000e-01,8.660254e-01


In [33]:
# diese Zelle später rausnehmen (wird in pipeline gemacht)
df.fillna(0, inplace=True)


## Preprocess Data for ML models
### Encode categorical data
### Scale numerical data

In [27]:
from sklearn.preprocessing import LabelEncoder

# Create a copy of the DataFrame
df_encoded = df.copy()

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Iterate over each column in the DataFrame
for column in df_encoded.columns:
    # Check if the column is categorical and not the "Datum" column
    if df_encoded[column].dtype == 'object' and column != 'Datum':
        # Encode the categorical values
        df_encoded[column] = label_encoder.fit_transform(df_encoded[column])

# Print the encoded DataFrame
print(df_encoded)

           Datum  Merkmal  Merkmalwert  Bestätigte Menge  Relativer Anteil  \
0     2014-01-01        0           30                 0          0.000000   
1     2014-02-01        0           30                 0          0.000000   
2     2014-03-01        0           30                 0          0.000000   
3     2014-04-01        0           30                48          0.125654   
4     2014-05-01        0           30                42          0.156134   
...          ...      ...          ...               ...               ...   
17552 2023-10-01        5         2078                 0          0.000000   
17553 2023-11-01        5         2078                10          0.033333   
17554 2023-12-01        5         2078                 1          0.003846   
17555 2024-01-01        5         2078                 0          0.000000   
17556 2024-02-01        5         2078                 5          0.019157   

       USTR10Y  WeizenSpot  CornSpot  GER10Y  WtiOilSpot  SoySp

## Train Test Split

Split data and only use training data for training the models! Test Data 10/23 - 05/24

In [28]:
# Convert the "Datum" column to datetime
df_encoded['Datum'] = pd.to_datetime(df_encoded['Datum'])

# Separate the dataframes
df_train = df_encoded[df_encoded['Datum'] <= pd.to_datetime('2023-09-30')]  # Entries before (including) September 2023 
df_test = df_encoded[df_encoded['Datum']  >= pd.to_datetime('2023-10-01')]  # Entries from October 2023 onwards

# drop "Datum" column, only use Jahr, Monat, Monat_sin and Monat_cos
df_train = df_train.drop(columns=['Datum'])
df_test = df_test.drop(columns=['Datum'])

In [29]:
X_test = df_test.drop(['Bestätigte Menge', 'Relativer Anteil'], axis=1)
y_test = df_test['Relativer Anteil']

In [30]:
X_training = df_train.drop(['Bestätigte Menge', 'Relativer Anteil'], axis=1)
y_training = df_train['Relativer Anteil']

In [31]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_training, y_training, test_size=0.2, random_state=42)

## Apply different models

### Linear Regression

In [32]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score



# Create an instance of the Linear Regression model
model = LinearRegression()

# Fit the model on the training data
model.fit(X_train, y_train)

# Predict the target variable for the training data
y_train_pred = model.predict(X_train)

# Calculate the accuracy (R-squared) on the training data
train_accuracy = r2_score(y_train, y_train_pred)

# Predict the target variable for the validation data
y_val_pred = model.predict(X_val)

# Calculate the accuracy (R-squared) on the validation data
val_accuracy = r2_score(y_val, y_val_pred)

# Print the accuracies
print("Training Accuracy:", train_accuracy)
print("Validation Accuracy:", val_accuracy)

ValueError: Input y contains NaN.