In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Template cell: also used for month, day_of_week, hour_of_day and holiday by
# swapping the grouping column (labels, title and file name must follow along).

data = pd.read_csv("../Training/training_data_VT2026.csv")

# Make binary variable for high bike demand:
# high_bike_demand -> 1
# low_bike_demand  -> 0
data["high_demand"] = (data["increase_stock"] == "high_bike_demand").astype(int)


# Share of high demand per value of the "weekday" column.
# The grouping column has to match what the labels, title and output file
# describe; grouping by "precip" here produced a mislabeled figure.
weekday_high_rate = data.groupby("weekday")["high_demand"].mean()


plt.figure(figsize=(10, 5))
plt.bar(weekday_high_rate.index, weekday_high_rate.values)

plt.xlabel("Weekday")
plt.ylabel("Proportion of high bike demand")
plt.title("High bike demand depending on weekday or weekend")

# The mean of a 0/1 column is a proportion, so fix the y-axis to [0, 1].
plt.ylim(0, 1)
# Ticks at 0 and 1 only; assumes "weekday" is a 0/1 flag - TODO confirm.
plt.xticks(range(0, 2))
plt.grid(axis="y")
plt.tight_layout()
plt.savefig("demand_over_weekday.png", bbox_inches="tight", pad_inches=0)
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Binary target: 1 for high_bike_demand, 0 otherwise.
# NOTE: relies on `data` already being loaded by an earlier cell.
data["high_demand"] = data["increase_stock"].eq("high_bike_demand").astype(int)

# Binary rain indicator: 1 when any precipitation was recorded.
data["rain"] = data["precip"].gt(0).astype(int)

# Per rain group: share of high-demand rows and number of rows.
summary = data.groupby("rain").agg(
    high_rate=pd.NamedAgg(column="high_demand", aggfunc="mean"),
    count=pd.NamedAgg(column="high_demand", aggfunc="size"),
)

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(["No rain", "Rain"], summary["high_rate"])

ax.set_ylabel("Proportion of high bike demand")
ax.set_title("High bike demand on rainy vs non-rainy days")
ax.set_ylim(0, 1)
ax.grid(axis="y", alpha=0.3)

# Write the group size near the base of each bar.
for bar_pos, n_obs in enumerate(summary["count"]):
    ax.text(bar_pos, 0.02, f"n={n_obs}", ha="center", fontsize=10)

fig.tight_layout()
fig.savefig("precip.png", bbox_inches="tight", pad_inches=0)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

data = pd.read_csv("../Training/training_data_VT2026.csv")

# Binary target: 1 for high_bike_demand, 0 otherwise.
data["high_demand"] = data["increase_stock"].eq("high_bike_demand").astype(int)

# Binary snow indicator: 1 when the recorded snow depth is positive.
data["snow_on_ground"] = data["snowdepth"].gt(0).astype(int)

# Per snow group: share of high-demand rows and number of rows.
summary = data.groupby("snow_on_ground").agg(
    high_rate=pd.NamedAgg(column="high_demand", aggfunc="mean"),
    count=pd.NamedAgg(column="high_demand", aggfunc="size"),
)

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(["No snow", "Snow on ground"], summary["high_rate"])

ax.set_ylabel("Proportion of high bike demand")
ax.set_title("High bike demand with vs without snow on ground")
ax.set_ylim(0, 1)
ax.grid(axis="y", alpha=0.3)

# Write the group size near the base of each bar.
for bar_pos, n_obs in enumerate(summary["count"]):
    ax.text(bar_pos, 0.02, f"n={n_obs}", ha="center", fontsize=10)

fig.tight_layout()
fig.savefig("snow_on_ground.png", bbox_inches="tight", pad_inches=0)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

data = pd.read_csv("../Training/training_data_VT2026.csv")
# Binary target: 1 for high_bike_demand, 0 otherwise.
data["high_demand"] = (data["increase_stock"] == "high_bike_demand").astype(int)

# Define even bins for humidity: edges 15, 25, ..., 95 (step 10).
# pd.cut assigns NaN to values outside (15, 95], and groupby drops NaN keys,
# so such rows are left out of the summary below.
bins = np.arange(15, 100, 10)
data["humidity_bin"] = pd.cut(data["humidity"], bins=bins)

# humidity_bin is categorical. Pass observed=False explicitly so empty bins
# are kept as rows (keeping the x-axis evenly spaced) regardless of the pandas
# default, which is deprecated and changes between versions.
summary = (
    data.groupby("humidity_bin", observed=False)
    .agg(
        high_rate=("high_demand", "mean"),
        count=("high_demand", "size")
    )
)

plt.figure(figsize=(8, 4))
plt.bar(range(len(summary)), summary["high_rate"])

# Label each bar with its bin edges instead of the Interval repr.
plt.xticks(
    range(len(summary)),
    [f"{int(b.left)}–{int(b.right)}" for b in summary.index],
)

plt.xlabel("Humidity (%)")
plt.ylabel("Proportion of high bike demand")
plt.title("High bike demand vs humidity")
plt.ylim(0, 1)
plt.grid(axis="y", alpha=0.3)

# Add the observation count near the base of each bar (y = 0.02).
for i, n in enumerate(summary["count"]):
    plt.text(i, 0.02, f"n={n}", ha="center", fontsize=9)

plt.tight_layout()
plt.savefig("humidity.png", bbox_inches="tight", pad_inches=0)
plt.show()


Peak runt 17-18 på kvällen!

Peak på lördagar! (och lite söndagar)

Peak i juni, april och september (oktober)

Inte jättestor skillnad på holiday eller ej

Större efterfrågan på helger!

In [5]:
import sys
import sklearn.linear_model as skl_lm
import numpy as np
import pandas as pd

sys.path.append('../src')

from data_preprocessing import X_scaled, df, y
# Preview the scaled features. A bare expression in the middle of a cell is
# not rendered, so display it explicitly.
display(X_scaled.head())

# ---- Reproducible 70/30 train/test split by row position ----
SEED = 42
TRAIN_FRACTION = 0.7
rng = np.random.default_rng(SEED)

# Derive the training size from the data instead of assuming 1600 rows.
n_samples = X_scaled.shape[0]
train1 = rng.choice(n_samples, size=round(TRAIN_FRACTION * n_samples), replace=False)

# Boolean mask over row POSITIONS. The sampled values are positions, so they
# must not be compared against index labels; building the mask directly also
# works when X_scaled does not have a default RangeIndex.
trainIndex = np.zeros(n_samples, dtype=bool)
trainIndex[train1] = True

# NOTE(review): assumes y is row-aligned with X_scaled - confirm in data_preprocessing.
train = X_scaled.iloc[trainIndex]
y_train = y.iloc[trainIndex]

test = X_scaled.iloc[~trainIndex]
y_test = y.iloc[~trainIndex]

# Baseline: logistic regression with scikit-learn's default settings.
model = skl_lm.LogisticRegression().fit(train, y_train)


y_hat = model.predict(test)

# Accuracy on the held-out rows.
correct_predictions = (y_hat == y_test).sum()
print(f"Total predictions: {len(y_hat)}")
print(f"Fraction correct predictions: {correct_predictions /  len(y_hat)}")

# Rows are the actual classes, columns the predicted classes.
confusion_matrix = pd.crosstab(y_test, y_hat, rownames=['Actual'], colnames=['Predicted'])
print('Confusion matrix: \n')
display(confusion_matrix)








Total predictions: 480
Fraction correct predictions: 0.85625
Confusion matrix: 



Predicted,high_bike_demand,low_bike_demand
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
high_bike_demand,48,44
low_bike_demand,25,363


In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report,
    f1_score, make_scorer
)

# ---- Split ----
# Stratified 70/30 split so both sets keep the class proportions of y.
# NOTE: X_scaled and y come from an earlier cell (data_preprocessing import).
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y,
    test_size=0.30,
    random_state=42,
    stratify=y
)

# ---- Scorer that does NOT yield NaN ----
# If the model happens to predict only one class in a fold, F1 becomes 0
# instead of NaN. pos_label must be given because the labels are strings.
f1_scorer = make_scorer(
    f1_score,
    pos_label="high_bike_demand",
    zero_division=0
)

# ---- CV ----
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# ---- Model ----
# NOTE(review): an earlier comment here said class_weight="balanced" is set to
# handle class imbalance, but it is not passed below. Add it if that is intended.
base_model = LogisticRegression(
    max_iter=5000,
    random_state=42
)

# ---- Parameter grid (penalties compatible with each solver) ----
param_grid = [
    {   # liblinear: L1/L2
        "solver": ["liblinear"],
        "penalty": ["l1", "l2"],
        "C": [0.01, 0.1, 1, 10, 100],
    },
    {   # saga: L1/L2
        "solver": ["saga"],
        "penalty": ["l1", "l2"],
        "C": [0.01, 0.1, 1, 10, 100],
    }
]

grid = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=cv,
    n_jobs=-1,
    verbose=1,
    error_score="raise"   # if something in the grid is invalid: fail loudly
)

grid.fit(X_train, y_train)

print("\nBest parameters:", grid.best_params_)
print("Best CV F1:", grid.best_score_)

best_model = grid.best_estimator_

# ---- Test evaluation ----
y_pred = best_model.predict(X_test)

print("\nTest accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred, zero_division=0))

# Label rows and columns with the real class names, in the same order passed
# to confusion_matrix. Generic "0"/"1" labels were misleading: the string
# classes sort with high_bike_demand first, so "0" would have meant high demand.
class_labels = best_model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Pred {label}" for label in class_labels],
)
print("\nConfusion matrix:\n", cm_df)

# ---- Top results from CV ----
results = pd.DataFrame(grid.cv_results_).sort_values("mean_test_score", ascending=False)
cols = ["mean_test_score", "std_test_score", "param_solver", "param_penalty", "param_C"]
print("\nTop 10 CV results:\n", results[cols].head(10).to_string(index=False))


Fitting 5 folds for each of 20 candidates, totalling 100 fits




ValueError: pos_label=1 is not a valid label: It should be one of ['high_bike_demand' 'low_bike_demand']