In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Per the original note this cell is used for month, day_of_week, hour_of_day,
# weekday and holiday — presumably by swapping the grouping column below.
# This version is configured for the "weekday" column, matching the title,
# the axis label and the output filename.

data = pd.read_csv("../Training/training_data_VT2026.csv")

# Binary target for high bike demand:
#   "high_bike_demand" -> 1
#   anything else      -> 0
data["high_demand"] = (data["increase_stock"] == "high_bike_demand").astype(int)

# Share of high-demand observations for each value of the weekday column.
# (The earlier version grouped by "precip", which contradicted every label
# in the figure and put the bars at arbitrary precipitation values.)
weekday_high_rate = data.groupby("weekday")["high_demand"].mean()

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(weekday_high_rate.index, weekday_high_rate.values)

ax.set_xlabel("Weekday indicator (0 or 1)")
ax.set_ylabel("Proportion of high bike demand")
ax.set_title("High bike demand depending on weekday or weekend")

ax.set_ylim(0, 1)
# One tick per group that actually occurs in the data.
ax.set_xticks(list(weekday_high_rate.index))
ax.grid(axis="y")
fig.tight_layout()
# Save before show(): some backends clear the figure once it has been shown.
fig.savefig("demand_over_weekday.png", bbox_inches="tight", pad_inches=0)
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Binary target: 1 for "high_bike_demand", 0 otherwise.
# NOTE: relies on `data` having been loaded by an earlier cell.
data["high_demand"] = data["increase_stock"].eq("high_bike_demand").astype(int)

# Binary rain indicator: 1 when any precipitation was recorded, else 0.
data["rain"] = data["precip"].gt(0).astype(int)

# Share of high demand and number of observations per rain group.
summary = data.groupby("rain")["high_demand"].agg(high_rate="mean", count="size")

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(["No rain", "Rain"], summary["high_rate"])

ax.set_ylabel("Proportion of high bike demand")
ax.set_title("High bike demand on rainy vs non-rainy days")
ax.set_ylim(0, 1)
ax.grid(axis="y", alpha=0.3)

# Write the group size just above the x-axis inside each bar.
for pos, n in enumerate(summary["count"]):
    ax.text(pos, 0.02, f"n={n}", ha="center", fontsize=10)

fig.tight_layout()
fig.savefig("precip.png", bbox_inches="tight", pad_inches=0)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Reload the training set and derive the binary target (1 = high demand).
data = pd.read_csv("../Training/training_data_VT2026.csv")
data["high_demand"] = data["increase_stock"].eq("high_bike_demand").astype(int)

# Binary snow indicator: 1 when snowdepth is strictly positive, else 0.
data["snow_on_ground"] = data["snowdepth"].gt(0).astype(int)

# Share of high demand and number of observations per snow group.
summary = data.groupby("snow_on_ground")["high_demand"].agg(
    high_rate="mean",
    count="size",
)

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(["No snow", "Snow on ground"], summary["high_rate"])

ax.set_ylabel("Proportion of high bike demand")
ax.set_title("High bike demand with vs without snow on ground")
ax.set_ylim(0, 1)
ax.grid(axis="y", alpha=0.3)

# Write the group size just above the x-axis inside each bar.
for pos, n in enumerate(summary["count"]):
    ax.text(pos, 0.02, f"n={n}", ha="center", fontsize=10)

fig.tight_layout()
fig.savefig("snow_on_ground.png", bbox_inches="tight", pad_inches=0)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Reload the training set and derive the binary target (1 = high demand).
data = pd.read_csv("../Training/training_data_VT2026.csv")
data["high_demand"] = (data["increase_stock"] == "high_bike_demand").astype(int)

# Evenly spaced humidity bins: edges 15, 25, ..., 95 (step 10).
# pd.cut uses right-closed intervals, so readings <= 15 or > 95 get a NaN bin
# and are left out of the groupby below.
bins = np.arange(15, 100, 10)
data["humidity_bin"] = pd.cut(data["humidity"], bins=bins)

# observed=False is pinned explicitly: the grouping key is categorical and the
# implicit default is deprecated / changes to True in newer pandas, which would
# drop empty bins and silently compress the x-axis.
summary = (
    data.groupby("humidity_bin", observed=False)
    .agg(
        high_rate=("high_demand", "mean"),
        count=("high_demand", "size")
    )
)

fig, ax = plt.subplots(figsize=(8, 4))
positions = range(len(summary))
ax.bar(positions, summary["high_rate"])

# Label each bar with its bin edges, e.g. "15–25".
ax.set_xticks(list(positions))
ax.set_xticklabels([f"{int(b.left)}–{int(b.right)}" for b in summary.index])

ax.set_xlabel("Humidity (%)")
ax.set_ylabel("Proportion of high bike demand")
ax.set_title("High bike demand vs humidity")
ax.set_ylim(0, 1)
ax.grid(axis="y", alpha=0.3)

# Write the number of observations per bin just above the x-axis (y = 0.02).
for i, n in enumerate(summary["count"]):
    ax.text(i, 0.02, f"n={n}", ha="center", fontsize=9)

fig.tight_layout()
fig.savefig("humidity.png", bbox_inches="tight", pad_inches=0)
plt.show()


Peak runt 17-18 på kvällen!

Peak på lördagar! (och lite söndagar)

Peak i juni, april och september (samt oktober)

Inte jättestor skillnad på holiday eller ej

Större efterfrågan på helger!

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import sys

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_predict
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, recall_score, accuracy_score


sys.path.append("../src")
from data_preprocessing import get_pipeline, X, y

def _new_weighted_logreg():
    """Build a fresh LogisticRegression with the 10:1 class weighting used by both experiments."""
    return LogisticRegression(
        class_weight={"high_bike_demand": 10, "low_bike_demand": 1},
        random_state=42,
        max_iter=5000,
    )


def _report_predictions(y_true, y_pred, banner, report_heading, plot_title):
    """Print accuracy, high-demand recall and the classification report, then show the confusion matrix."""
    print(banner)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Recall (high_bike_demand):", recall_score(y_true, y_pred, pos_label="high_bike_demand"))
    print(report_heading)
    print(classification_report(y_true, y_pred, zero_division=0))

    ConfusionMatrixDisplay.from_predictions(y_true, y_pred)
    plt.title(plot_title)
    plt.show()


# Sanity checks on the imported feature matrix and target: shapes and index alignment.
print("X shape:", X.shape)
print("y shape:", y.shape)
print("X index head:", X.index[:5])
print("y index head:", y.index[:5])
print("Index match:", X.index.equals(y.index))


# ------------------------------------------------------------
# Cross-validation splitter shared by every evaluation below
# ------------------------------------------------------------
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# ------------------------------------------------------------
# 1) Baseline: untuned logistic regression with class weights
# ------------------------------------------------------------
baseline_model = _new_weighted_logreg()
baseline_pipe = get_pipeline(baseline_model)

# Out-of-fold predictions for every row.
y_pred_base = cross_val_predict(baseline_pipe, X, y, cv=cv, n_jobs=-1)

_report_predictions(
    y,
    y_pred_base,
    banner="=== BASELINE (otunad + class_weight) ===",
    report_heading="\nClassification Report (Baseline):",
    plot_title="Confusion Matrix - Baseline",
)

# ------------------------------------------------------------
# 2) Grid search over regularisation strength C and solver
# ------------------------------------------------------------
tune_model = _new_weighted_logreg()
tune_pipe = get_pipeline(tune_model)

param_grid = {
    "model__C": np.logspace(-4, 2, 7),
    "model__solver": ["lbfgs", "liblinear", "newton-cg", "sag", "saga"],
}

gs = GridSearchCV(
    estimator=tune_pipe,
    param_grid=param_grid,
    # Swap in a recall-based scorer for the high_bike_demand class to optimise recall instead.
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
    # Candidates whose fit raises are scored 0 instead of aborting the search.
    error_score=0,
    refit=True,
)
gs.fit(X, y)

print("\n=== GRID SEARCH RESULT ===")
print("Bästa parametrar:", gs.best_params_)
print("Bästa CV-score (accuracy):", gs.best_score_)

# Out-of-fold predictions for the selected configuration, using the same folds.
# NOTE: the hyperparameters were chosen on these folds, so the figures may be optimistic.
best_pipe = gs.best_estimator_
y_pred_tuned = cross_val_predict(best_pipe, X, y, cv=cv, n_jobs=-1)

_report_predictions(
    y,
    y_pred_tuned,
    banner="\n=== TUNED (best C + solver) ===",
    report_heading="\nClassification Report (Tuned):",
    plot_title="Confusion Matrix - Tuned",
)


KeyError: "None of [Index([  75, 1284,  408, 1282, 1447, 1144, 1381,  181, 1183, 1103,\n       ...\n       1053, 1005, 1418,  639,  537,  890,  146, 1552, 1210, 1102],\n      dtype='int64', length=320)] are in the [index]"

Traceback (most recent call last):
  File "/Users/ivareriksson/.pyenv/versions/3.13.0/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 916, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/ivareriksson/.pyenv/versions/3.13.0/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 317, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ivareriksson/.pyenv/versions/3.13.0/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 409, in _score
    y_pred = method_caller(
        estimator,
    ...<2 lines>...
        pos_label=pos_label,
    )
  File "/Users/ivareriksson/.pyenv/versions/3.13.0/lib/python3.13/site-packages/sklearn/metrics/_scorer.py", line 96, in _cached_call
    result, _ = _get_response_values(
                ~~~~~~~~~~~~~~~~~~~~^
        estimator, 

Bästa parametrar: {'model__C': 0.001, 'model__penalty': 'l1', 'model__solver': 'liblinear'}
Bästa CV-score: nan
                  precision    recall  f1-score   support

high_bike_demand       0.18      1.00      0.31       288
 low_bike_demand       0.00      0.00      0.00      1312

        accuracy                           0.18      1600
       macro avg       0.09      0.50      0.15      1600
    weighted avg       0.03      0.18      0.05      1600


Valda features (|coef|>0):


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


increase_stock
low_bike_demand     1312
high_bike_demand     288
Name: count, dtype: int64
