## Megatutorial 4: Regression

In [1]:
from pandas import read_csv

# Transformer/Funktionen zur Vorverarbeitung
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Estimators für die Regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

# Metriken für die Regression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import root_mean_squared_error

## Daten laden

In [2]:
data = read_csv("../data/bikesharing.csv", index_col=0)

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 731 entries, 0 to 730
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   season      731 non-null    object 
 1   holiday     731 non-null    object 
 2   weekday     731 non-null    object 
 3   workingday  731 non-null    object 
 4   weathersit  731 non-null    object 
 5   temp        731 non-null    float64
 6   atemp       731 non-null    float64
 7   hum         698 non-null    float64
 8   windspeed   731 non-null    float64
 9   casual      731 non-null    int64  
 10  registered  731 non-null    int64  
 11  cnt         731 non-null    int64  
 12  day         731 non-null    int64  
 13  month       731 non-null    int64  
 14  year        731 non-null    int64  
dtypes: float64(4), int64(6), object(5)
memory usage: 91.4+ KB


In [4]:
data.isna().any()

season        False
holiday       False
weekday       False
workingday    False
weathersit    False
temp          False
atemp         False
hum            True
windspeed     False
casual        False
registered    False
cnt           False
day           False
month         False
year          False
dtype: bool

# Preprocessing
## Fehlende Werte

In [5]:
imputer_engine = SimpleImputer(strategy="median")

In [6]:
data[["hum"]]

Unnamed: 0,hum
0,0.805833
1,0.696087
2,0.437273
3,0.590435
4,0.436957
...,...
726,0.652917
727,0.590000
728,0.752917
729,


In [7]:
imputer_engine.fit(data[["hum"]])

In [8]:
data ["hum"] = imputer_engine.transform(data[["hum"]])

# Target / Feature auswählen

In [9]:
data.columns

Index(['season', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp',
       'atemp', 'hum', 'windspeed', 'casual', 'registered', 'cnt', 'day',
       'month', 'year'],
      dtype='object')

In [10]:
features = [
    'season', 'holiday', 'weekday', 'weathersit', 'temp',
    'atemp', 'hum', 'windspeed', 'month'
]

target = [
    'cnt'
]

X = data[features]
y = data[target]

### Label Encoding

In [11]:
X.select_dtypes(exclude="number").info()

<class 'pandas.core.frame.DataFrame'>
Index: 731 entries, 0 to 730
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   season      731 non-null    object
 1   holiday     731 non-null    object
 2   weekday     731 non-null    object
 3   weathersit  731 non-null    object
dtypes: object(4)
memory usage: 28.6+ KB


In [12]:
season_encoder_engine = LabelEncoder()
X.loc[:, "season"] = season_encoder_engine.fit_transform(X["season"])

holiday_encoder_engine = LabelEncoder()
X.loc[:, "holiday"] = holiday_encoder_engine.fit_transform(X["holiday"])

weekday_encoder_engine = LabelEncoder()
X.loc[:, "weekday"] = weekday_encoder_engine.fit_transform(X["weekday"])

weathersit_encoder_engine = LabelEncoder()
X.loc[:, "weathersit"] = weathersit_encoder_engine.fit_transform(X["weathersit"])

### Hold-Out-Resampling

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2
    )

## Modelling

In [14]:
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

predictions = linear_model.predict(X_test)

print(
    "R2", r2_score(y_test, predictions),
    "RMSE", root_mean_squared_error(y_test, predictions),
    "MAE", mean_absolute_error(y_test, predictions)
)

R2 0.39963066565230176 RMSE 1493.5723145741429 MAE 1243.4322260852728


In [17]:
tree_model = DecisionTreeRegressor()
tree_model.fit(X_train, y_train)

predictions = tree_model.predict(X_test)

print(
    "R2", r2_score(y_test, predictions),
    "RMSE", root_mean_squared_error(y_test, predictions),
    "MAE", mean_absolute_error(y_test, predictions)
)

R2 0.2555444795035452 RMSE 1663.168999966323 MAE 1268.455782312925
