In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [21]:
df_raw = pd.read_csv('./data/train.csv', sep=';')

In [22]:
df = df_raw.drop(columns=['id', 'year'])


In [23]:
categorical_cols = ['season', 'weather', 'holiday', 'workingday']

# Use ColumnTransformer to encode categorical variables
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(), categorical_cols)
], remainder='passthrough')

In [24]:
X = df.drop('count', axis=1)  # Features (input variables)
y = df['count'] 

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Create a pipeline to handle preprocessing and scaling
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler())  # Scaling the features to normalize the values
])

# Fit the pipeline on the training data
X_train_preprocessed = pipeline.fit_transform(X_train)
X_test_preprocessed = pipeline.transform(X_test)

In [26]:
X_train_preprocessed

array([[ 1.73261417, -0.57290736, -0.57966556, ..., -2.16042697,
        -1.48471753,  1.13470397],
       [-0.57716254, -0.57290736, -0.57966556, ..., -1.09351808,
        -1.48471753,  0.76744896],
       [-0.57716254, -0.57290736,  1.72513267, ...,  1.04029971,
        -0.19251707,  0.52233904],
       ...,
       [-0.57716254,  1.74548289, -0.57966556, ...,  0.32902711,
        -0.65770924, -0.82289636],
       [-0.57716254, -0.57290736,  1.72513267, ...,  1.48513851,
        -1.32965347, -0.2113512 ],
       [ 1.73261417, -0.57290736, -0.57966556, ..., -1.44915437,
        -0.14082905, -0.82289636]])

In [27]:
from sklearn.linear_model import LinearRegression

In [28]:
reg = LinearRegression().fit(X_train_preprocessed, y_train)
reg.score(X_test_preprocessed, y_test)

0.348674895212585

In [29]:
from sklearn.linear_model import LogisticRegression

In [30]:
clf = LogisticRegression(random_state=0).fit(X_train_preprocessed, y_train)
clf.score(X_test_preprocessed, y_test)


0.01950585175552666