In [2]:
# Handling categorical variables with one-hot encoding.

import pandas as pd

# Read the bike data from the CSV file into a DataFrame.
data_df = pd.read_csv("c3_bike-small.csv")

# Preview the first five rows (the cell's displayed output).
data_df.head(5)

Unnamed: 0,temp,weekday,casual
0,0.344,6,331
1,0.363,0,131
2,0.196,1,120
3,0.2,2,108
4,0.227,3,82


In [3]:
# How many rows fall on each weekday code? Shows which distinct values the
# categorical `weekday` column takes and how evenly they are represented.
data_df["weekday"].value_counts()

6    105
0    105
1    105
2    104
3    104
4    104
5    104
Name: weekday, dtype: int64

In [4]:
from sklearn.linear_model import LinearRegression

# Baseline model: `temp` is the only feature, `casual` is the target.
# Double brackets keep X two-dimensional (n_samples, 1), as scikit-learn expects.
X = data_df[["temp"]].values
y = data_df["casual"].values

# Fit ordinary least squares; `fit` returns the estimator itself.
lr = LinearRegression().fit(X, y)

# R^2 on the same data the model was fitted on.
lr.score(X, y)

0.29525001840042775

In [11]:
# Add `weekday` as a second feature, still in its raw integer coding.
# The model therefore treats the day code as an ordinary numeric quantity
# and learns a single coefficient for it.
X = data_df[["temp", "weekday"]].values
y = data_df["casual"].values

# Refit the linear regression on the two-column feature matrix.
lr = LinearRegression().fit(X, y)

# R^2 on the training data, for comparison with the temp-only model.
lr.score(X, y)

0.2988588081409286

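R^2 barely changes when `weekday` is added (0.295 → 0.299). This is not because the variable is uninformative, but because of its format: the days are stored as the integers 0–6, so the linear model fits a single slope to them as if they were an ordered numeric quantity. One-hot encoding the variable (below) lets the model learn a separate effect for each day.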

In [12]:
# Preview the one-hot encoding: the `weekday` column is replaced by one
# indicator column per distinct value (weekday_0 ... weekday_6).
pd.get_dummies(
    data_df,
    columns=["weekday"],
).head()

Unnamed: 0,temp,casual,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
0,0.344,331,0,0,0,0,0,0,1
1,0.363,131,1,0,0,0,0,0,0
2,0.196,120,0,1,0,0,0,0,0
3,0.2,108,0,0,1,0,0,0,0
4,0.227,82,0,0,0,1,0,0,0


In [13]:
# Replace `weekday` with one indicator column per day.
encoded_df = pd.get_dummies(data_df, columns=["weekday"])

# Features are every column except the target `casual`.
X = encoded_df.drop(columns="casual").values
y = encoded_df["casual"].values

# Refit the linear regression on the one-hot encoded features.
lr = LinearRegression().fit(X, y)

# R^2 on the training data, for comparison with the integer-coded model.
lr.score(X, y)

0.5969174988134782

In [14]:
## Note: the full set of indicator columns contains one redundant column.
## Exactly one indicator is 1 in every row, so any one of them can be
## inferred from the others. The `drop_first` parameter of `get_dummies`
## removes the first level (here `weekday_0`).

# One-hot encoding without the redundant first level.
encoded_df = pd.get_dummies(
    data_df,
    columns=["weekday"],
    drop_first=True,
)
encoded_df.head()

Unnamed: 0,temp,casual,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
0,0.344,331,0,0,0,0,0,1
1,0.363,131,0,0,0,0,0,0
2,0.196,120,1,0,0,0,0,0
3,0.2,108,0,1,0,0,0,0
4,0.227,82,0,0,1,0,0,0


In [15]:
# Verify that dropping the redundant indicator column did not cost any
# performance: R^2 should match the score of the full one-hot model.

# Features are every column except the target `casual`.
X = encoded_df.drop(columns="casual").values
y = encoded_df["casual"].values

# Refit the linear regression on the reduced encoding.
lr = LinearRegression().fit(X, y)

# R^2 on the training data.
lr.score(X, y)

0.5969174988134782