In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os

# Seed passed as random_state to the train/test splits below so they are reproducible.
RANDOM_STATE = 31415

In [None]:
# metric to optimize: root mean squared error (RMSE) -- lower is better
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer


def rmse(y_true, y_pred):
    """Return the root mean squared error between the targets and the predictions."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# RMSE is an error, so it has to be registered with greater_is_better=False.
# Otherwise anything that maximises the score (model selection, grid search)
# would prefer the model with the largest error.
# Consequence: values reported through this scorer are negated RMSE, so
# "closer to zero" means "better".
scorer = make_scorer(rmse, greater_is_better=False)

In [None]:
# Load the training data; the path is relative to the notebook's working directory.
training_set = pd.read_csv('../input/train.csv')

In [None]:
# Peek at the first five rows to see which columns are available.
training_set.head(n=5)

In [None]:
# 'datetime' is loaded as plain text, so plotting it directly spaces the points
# by row position. Parse it first so that the x-axis is a real time axis.
fig, ax = plt.subplots(figsize=(12, 4))
(training_set
 .assign(datetime=pd.to_datetime(training_set['datetime']))
 .plot(x='datetime', y='casual', ax=ax))
ax.set(title='Casual users over time', xlabel='datetime', ylabel='casual');

In [None]:
# 'datetime' is loaded as plain text, so plotting it directly spaces the points
# by row position. Parse it first so that the x-axis is a real time axis.
fig, ax = plt.subplots(figsize=(12, 4))
(training_set
 .assign(datetime=pd.to_datetime(training_set['datetime']))
 .plot(x='datetime', y='registered', ax=ax))
ax.set(title='Registered users over time', xlabel='datetime', ylabel='registered');

What do we see? The number of registered users grows over time. My first idea is that I may need to do some time series analysis for this variable. The number of casual users behaves completely differently. There must be something that affects the number of casual users, but it does not grow over time. There are spikes, but it always goes back to the normal value.

Let's look at a correlation plot.

In [None]:
# Correlate the numeric columns only. 'datetime' is text: older pandas silently
# skipped such columns, but pandas >= 2.0 raises when asked to correlate them.
corr = training_set.select_dtypes(include=[np.number]).corr()
fig, ax = plt.subplots(figsize=(30, 30))
ax.matshow(corr)

# Write every coefficient into its cell; matshow puts rows on y and columns on x,
# hence the (j, i) order of the text coordinates.
for (i, j), z in np.ndenumerate(corr.values):
    ax.text(j, i, '{:0.1f}'.format(z), ha='center', va='center',
            bbox=dict(boxstyle='round', facecolor='white', edgecolor='0.3'))

# Label both axes with the column names (trailing ';' hides the tick-object repr).
plt.xticks(range(len(corr.columns)), corr.columns);
plt.yticks(range(len(corr.columns)), corr.columns);

Looks like I may try to use 'workingday', 'temp', 'atemp', and 'humidity' to predict the number of casual users. The problem is that 'temp' and 'atemp' are correlated with each other, so I must decide which one is redundant and use only one of them.

In the first attempt I am going to normalize all values and remove the datetime, atemp, registered and count columns. Then I am going to use linear regression to predict the value of the 'casual' variable. I will use regularization, so hopefully it will sort out the problem of the uncorrelated variables which I have not removed yet. Anyway, that is the starting point, just to set the baseline.

There is one more thing I have to do. Despite being numeric columns, some of the columns are in fact categorical variables. I must encode them using one-hot encoding instead of pretending that for example weather is a number ;)

It may be tempting to leave workingday and holiday unencoded because they already contain only 0 and 1. It will not end up well in case of linear regression. If x = 0 means "not a holiday", there is no value of the weight that makes that feature's term (weight * x) contribute anything other than 0 ;)

In [None]:
from sklearn.model_selection import train_test_split

# Shared preprocessing for every regression technique; the target here is 'casual'.
data = training_set.drop(columns=['datetime', 'atemp', 'registered', 'count'])

# Separate the predictors from the target up front, then split both with the
# same seed so that rows stay aligned between X and y.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['casual']),
    data['casual'],
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [None]:
# Preprocessing for linear regression

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.transform(X_test)

one_hot = OneHotEncoder(categorical_features = [0, 1, 2, 3]) #season, holiday, workingday and weather
X_train_norm = one_hot.fit_transform(X_train_norm)
X_test_norm = one_hot.transform(X_test_norm)

In [None]:
from sklearn.linear_model import Lasso

In [None]:
from sklearn.model_selection import cross_val_score
casual_model = Lasso()
scores = cross_val_score(casual_model, X_train_norm, y_train, cv=5, scoring = scorer)
scores

In [None]:
casual_model.fit(X_train_norm, y_train)

In [None]:
# Same thing for the second variable
# Basic preprocessing which applies to all regression techniques (dependent variable: casual)
data = training_set.drop(columns = ['datetime', 'atemp', 'casual', 'count'])

X_train, X_test, y_train, y_test = train_test_split(data, data.registered, test_size=0.2, random_state = RANDOM_STATE)
X_train = X_train.drop(columns = ['registered'])
X_test = X_test.drop(columns = ['registered'])

In [None]:
# Preprocessing for linear regression

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.transform(X_test)

one_hot = OneHotEncoder(categorical_features = [0, 1, 2, 3]) #season, holiday, workingday and weather
X_train_norm = one_hot.fit_transform(X_train_norm)
X_test_norm = one_hot.transform(X_test_norm)

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of an untuned Lasso on the 'registered' target.
registered_model = Lasso()
scores = cross_val_score(
    estimator=registered_model,
    X=X_train_norm,
    y=y_train,
    scoring=scorer,
    cv=5,
)
scores

In [None]:
# Fit the 'registered' baseline on the whole training split.
registered_model.fit(X=X_train_norm, y=y_train)

In [None]:
# Final prediction of the baseline models: they are not going to be tweaked,
# so move directly to the test data.

test_dataset = pd.read_csv("../input/test.csv")

In [None]:
# Drop the columns the models never saw, then push the remaining predictors
# through the scaler and the encoder fitted earlier in the notebook.
test_features = test_dataset.drop(columns=['datetime', 'atemp'])
test_data = one_hot.transform(scaler.transform(test_features))

In [None]:
# Predict the two user groups separately; the total demand is their sum.
casual, registered = (
    model.predict(test_data) for model in (casual_model, registered_model)
)
total = casual + registered

In [None]:
# Attach the summed prediction to the test frame as the 'count' column.
test_dataset = test_dataset.assign(**{'count': pd.Series(total)})

In [None]:
# A linear model can predict below zero; show the rows where that happened.
test_dataset.loc[test_dataset['count'].lt(0)]

In [None]:
# Negative predictions are replaced by zero.
is_negative = test_dataset['count'].lt(0)
test_dataset.loc[is_negative, 'count'] = 0

In [None]:
# Check the result: rows whose prediction is now zero (none should be below it).
test_dataset.loc[test_dataset['count'].le(0)]

In [None]:
# Write only the two columns of the result file, without the index.
test_dataset.to_csv('result.csv', columns=['datetime', 'count'], index=False)