# This notebook implements a baseline linear regression model

## Import and prepare dataset
Load the dataset from the .pkl file we prepared in the notebook "DataPreparation.ipynb".

In [16]:
# import all needed libs
import pandas as pd
import statsmodels.formula.api as smf
from sklearn.metrics import mean_squared_error, r2_score

# Load the training split into a DataFrame.
train_set_path = '../train_val_test_data/train_set.pkl'
train_set = pd.read_pickle(train_set_path)

## Try the first version of the model

In [18]:
# Fit the baseline model: ordinary least squares on 'Umsatz', explained by
# 'Temperatur', 'monthly_mean_temp_diff' and 'Warengruppe', where C(...)
# makes the formula treat 'Warengruppe' as a categorical variable.
# `smf` (statsmodels.formula.api) is already imported in the first code cell,
# so it is not imported a second time here.
baseline_formula = 'Umsatz ~ Temperatur + monthly_mean_temp_diff + C(Warengruppe)'
mod = smf.ols(baseline_formula, data=train_set).fit()

# Print the full regression report (coefficients, R-squared, ...).
print(mod.summary())

                            OLS Regression Results                            
Dep. Variable:                 Umsatz   R-squared:                       0.696
Model:                            OLS   Adj. R-squared:                  0.695
Method:                 Least Squares   F-statistic:                     2443.
Date:                Sun, 02 Jun 2024   Prob (F-statistic):               0.00
Time:                        16:01:06   Log-Likelihood:                -43608.
No. Observations:                7493   AIC:                         8.723e+04
Df Residuals:                    7485   BIC:                         8.729e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                 59

## Validate the model 

In [25]:
# Load the validation split and keep only rows that have a target value.
validation_set = (
    pd.read_pickle('../train_val_test_data/validation_set.pkl')
    # Rows with NaN in 'Umsatz' cannot be scored, so they are dropped.
    # Potential TO-DO: find out why we have these 8 rows of NaN data.
    .dropna(subset=['Umsatz'])
)

# Store the model's predictions next to the observed values.
validation_set['Umsatz_predictions'] = mod.predict(validation_set)

# Compare observed and predicted 'Umsatz' with both evaluation metrics.
umsatz_true = validation_set['Umsatz']
umsatz_pred = validation_set['Umsatz_predictions']
mse = mean_squared_error(umsatz_true, umsatz_pred)
r2 = r2_score(umsatz_true, umsatz_pred)

# One print call, one line per item - same text as three separate prints.
print(
    "Validation results",
    f'Mean Squared Error: {mse}',
    f'R^2 Score: {r2}',
    sep='\n',
)

Validation results
Mean Squared Error: 5538.721369179292
R^2 Score: 0.6726461606826961


## Make predictions based on model above 

In [27]:
# Load the test split and store the model's predictions in 'Umsatz'.
test_set = pd.read_pickle('../train_val_test_data/test_set.pkl')
test_set['Umsatz'] = mod.predict(test_set)

# The submission only carries the row id and the predicted 'Umsatz'.
submission_set = test_set[['id', 'Umsatz']]

# Check that the row count is the one expected for the Kaggle upload.
expected_rows = 1830
actual_rows = submission_set.shape[0]
if actual_rows == expected_rows:
    print(f"OK : DataFrame has exact {expected_rows} Entries!")
else:
    # Report the size of the frame that is actually written below. The
    # message previously read `df.shape[0]`, but no `df` exists in this
    # notebook, so a wrong count raised NameError instead of this message.
    print(f"ERROR Dataframe has wrong number of {actual_rows} Entries!")

# Store the submission data (written regardless of the check result).
submission_set.to_csv('../prediction_data/submission.csv', index=False)


OK : DataFrame has exact 1830 Entries!
