### This notebook tries to predict the bike share count from the weather conditions.

In [None]:
from datetime import datetime
from random import randint
from subprocess import check_output

import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # older scikit-learn releases only ship the legacy module
    from sklearn.cross_validation import train_test_split

In [None]:
# Load the two raw tables (weather observations and trip records) from the input directory.
weather_df, trip_df = (
    pd.read_csv('../input/weather.csv'),
    pd.read_csv('../input/trip.csv'),
)

#### Data Preprocessing

In [None]:
# Parse the raw text columns into datetime values: weather rows carry a plain
# month/day/year date, trip rows also carry the start hour and minute.
weather_df['date'] = weather_df['date'].map(
    lambda text: datetime.strptime(text, "%m/%d/%Y")
)
trip_df['start_date'] = trip_df['start_date'].map(
    lambda text: datetime.strptime(text, "%m/%d/%Y %H:%M")
)

In [None]:
def sliceWeekdayAndWeekend(df, on='date'):
    """Split ``df`` into its weekday rows and its weekend rows.

    Args:
        df: DataFrame holding a column of date-like objects.
        on: Name of the column whose values expose a ``weekday()`` method
            (``datetime.date``, ``datetime.datetime`` or ``pandas.Timestamp``).

    Returns:
        A ``(weekdays, weekends)`` tuple of DataFrames. Rows whose
        ``weekday()`` is 0-4 (Monday-Friday) go to the first frame, rows
        whose ``weekday()`` is 5-6 go to the second.
    """
    # A Series has no ``weekday()`` method of its own, so the day index has to
    # be computed element by element before it can be compared.
    day_of_week = df[on].apply(lambda d: d.weekday())
    weekday_mask = day_of_week < 5
    weekend_mask = day_of_week >= 5
    return df.loc[weekday_mask], df.loc[weekend_mask]

In [None]:
def sliceGoodAndBadWeatherDay(df):
    """Partition ``df`` by the sign of its ``events`` column.

    Returns:
        A ``(bad, good)`` tuple of DataFrames: rows with ``events < 0``
        first, rows with ``events >= 0`` second.
    """
    event_codes = df['events']
    bad_rows = df.loc[event_codes < 0]
    good_rows = df.loc[event_codes >= 0]
    return bad_rows, good_rows

In [None]:
def convertEventToInt(val):
    """Encode a weather ``events`` value as an integer code.

    Args:
        val: The raw event value: a string such as ``'Rain'`` or ``'Fog'``,
            or a missing value (NaN / ``None``).

    Returns:
        ``0`` when the value is missing, ``-1`` when the text contains
        ``'Rain'`` or ``'Thunder'``, and ``2`` for any other text.
    """
    # ``pd.isna`` recognises every missing-value marker. An identity test
    # against ``np.nan`` only matches that one object, so any other NaN would
    # reach the substring checks below and raise TypeError.
    if pd.isna(val):
        return 0
    if 'Rain' in val or 'Thunder' in val:
        return -1
    return 2

In [None]:
# Work on copies so the frames loaded from disk are left unmodified.
trip, weather = trip_df.copy(), weather_df.copy()

In [None]:
# Replace each raw event value with its integer code from convertEventToInt.
weather['events'] = weather['events'].map(convertEventToInt)

In [None]:
# Precipitation: the marker 'T' is replaced by 0.01 and the column is then
# converted to float64.
weather['precipitation_inches'] = (
    weather['precipitation_inches']
    .apply(lambda amount: 0.01 if amount == 'T' else amount)
    .astype('float64')
)

# Reduce both timestamp columns to calendar dates so they share a 'date' column.
trip['date'] = trip['start_date'].map(lambda stamp: stamp.date())
weather['date'] = weather['date'].map(lambda stamp: stamp.date())

# Store the weather zip codes as strings.
weather['zip_code'] = weather['zip_code'].astype('str')

In [None]:
# Impute missing numeric readings with their column mean. ``numeric_only=True``
# restricts the mean to numeric columns; without it, current pandas raises
# TypeError on the non-numeric 'date' and 'zip_code' columns.
weather.fillna(weather.mean(numeric_only=True), inplace=True)

#### Count the trips that share the same day and the same zip_code.

In [None]:
# Number of trips for every (date, zip_code) pair, flattened into a frame
# with the columns 'date', 'zip_code' and 'count'.
count_per_day = (
    trip.groupby(['date', 'zip_code'])
    .size()
    .reset_index(name='count')
)

In [None]:
# Attach the trip counts to the weather rows that share both date and zip code.
whole_dataset = pd.merge(weather, count_per_day, on=['date', 'zip_code'])

In [None]:
# weekday() is 0-4 for Monday-Friday, so 5 and above marks a weekend day.
whole_dataset['isWeekend'] = whole_dataset['date'].map(lambda day: day.weekday() >= 5)

#### The prediction uses weekday data only

In [None]:
# Keep only the rows that are not flagged as weekend days.
weekday_df = whole_dataset[whole_dataset['isWeekend'].eq(False)]

In [None]:
# Split the weekday rows by weather: rows with events < 0 are "bad",
# rows with events >= 0 are "good".
bad_weather, good_weather = sliceGoodAndBadWeatherDay(weekday_df)

In [None]:
# Remove the columns that are not used as model inputs. The frames are rebound
# to the result rather than modified in place: both are selections taken from
# another frame, and in-place edits on such selections can trigger pandas'
# SettingWithCopyWarning.
_non_feature_columns = ['date', 'zip_code', 'isWeekend', 'events']
bad_weather = bad_weather.drop(_non_feature_columns, axis=1)
good_weather = good_weather.drop(_non_feature_columns, axis=1)

In [None]:
def sliceXandY(df):
    """Separate the feature columns from the target column.

    Args:
        df: DataFrame containing a ``wind_dir_degrees`` column and a
            ``count`` column.

    Returns:
        A ``(x, y)`` tuple: ``x`` is a DataFrame with every column from the
        first one up to and including ``wind_dir_degrees``; ``y`` is the
        ``count`` column as a Series.
    """
    # ``.loc`` is the label-based indexer; the old ``.ix`` indexer was
    # removed from pandas in version 1.0.
    x = df.loc[:, :'wind_dir_degrees']
    y = df.loc[:, 'count']
    return x, y

In [None]:
def sampleDataset(df, in_frac=0.12, in_random_state=22):
    """Return a random sample of the rows of ``df``.

    Args:
        df: DataFrame to sample from.
        in_frac: Fraction of rows to keep.
        in_random_state: Seed handed to ``DataFrame.sample``.

    Returns:
        The sampled DataFrame.
    """
    sampled_rows = df.sample(random_state=in_random_state, frac=in_frac)
    return sampled_rows

#### Sample the good-weather data because the dataset is imbalanced

In [None]:
# Keep a 12% random sample of the good-weather rows. The seed is drawn at
# random on every run, so the sample is not reproducible between runs.
good_weather_sample = sampleDataset(good_weather, in_frac=0.12, in_random_state=randint(0,32767))

In [None]:
# Stack the sampled good-weather rows and all bad-weather rows into one frame.
learning_dataset = pd.concat([good_weather_sample, bad_weather])

In [None]:
# Separate the feature columns (x) from the 'count' target column (y).
x, y = sliceXandY(learning_dataset)

### Standardize learning_dataset, then use the dataset to generate training & testing data.

In [None]:
# Hold out 20% of the rows for testing, using a randomly drawn seed.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=randint(0,32767))

# Standardize with the mean and standard deviation of the training rows only,
# then apply that same transform to the test rows. Scaling the full dataset
# before splitting would let test-set statistics leak into the training data.
_feature_scaler = StandardScaler().fit(x_train)
Xs_train = _feature_scaler.transform(x_train)
Xs_test = _feature_scaler.transform(x_test)

### Select features and predict the share count with Lasso regression.

In [None]:
# Lasso regression model created with scikit-learn's default arguments.
lasso_model = linear_model.Lasso()

In [None]:
# Fit the model on the training split.
lasso_model.fit(Xs_train, y_train)

In [None]:
# Show the fitted coefficients (one per feature column of the training data).
lasso_model.coef_

In [None]:
# Show the model's score on the training split.
# NOTE(review): the held-out Xs_test / y_test are not scored in the visible
# cells, so this number says nothing about generalization; confirm whether a
# test-set score was intended.
lasso_model.score(Xs_train, y_train)