In [1]:
import numpy as np
import pandas as pd

In [2]:
# Directory prefix for the input CSV (note the trailing slash: it is concatenated as-is).
data_path = '../data/'
# Load hour.csv from that directory into the working frame.
hour = pd.read_csv('{}hour.csv'.format(data_path))

In [3]:
# Columns treated as categorical features in the processing steps below.
important_categorical = [
    'yr',
    'mnth',
    'weathersit',
]

In [4]:
# Preview the first rows of the freshly loaded frame.
hour.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


#### Data Processing:
* categorical conversion
* numeric to categorical conversion
* numeric scaling
* conversion of correlated features
* train validation test split

In [5]:
# Column groups used by the processing steps in this notebook.
categorical_features = important_categorical
print('categorical features: {}'.format(categorical_features))

target_features = ['cnt','casual','registered']

# Heuristic: a column with more than 31 distinct values is counted as numeric.
# NOTE(review): according to the printed result this also selects 'instant' and
# 'dteday' and leaves out 'windspeed' -- confirm that this is the intended set.
numeric_features = []
for column_name in hour.columns:
    distinct_values = hour[column_name].unique()
    if len(distinct_values) > 31:
        numeric_features.append(column_name)

# The target columns are taken out of the numeric list explicitly.
for target_name in target_features:
    numeric_features.remove(target_name)

print('numeric features :{}'.format(numeric_features))
print('target:{} '.format(target_features))

categorical features: ['yr', 'mnth', 'weathersit']
numeric features :['instant', 'dteday', 'temp', 'atemp', 'hum']
target:['cnt', 'casual', 'registered'] 


##### categorical conversion (one-hot encoding)

In [6]:
# One-hot encode each categorical column (indicator columns are prefixed with the
# source column name) and append them all to the right of the existing frame.
encoded_frames = [
    pd.get_dummies(hour[name], prefix=name, drop_first=False)
    for name in categorical_features
]
hour = pd.concat([hour] + encoded_frames, axis=1)

In [7]:
# Inspect the column index now that the indicator columns have been appended.
hour.columns

Index(['instant', 'dteday', 'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday',
       'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
       'casual', 'registered', 'cnt', 'yr_0', 'yr_1', 'mnth_1', 'mnth_2',
       'mnth_3', 'mnth_4', 'mnth_5', 'mnth_6', 'mnth_7', 'mnth_8', 'mnth_9',
       'mnth_10', 'mnth_11', 'mnth_12', 'weathersit_1', 'weathersit_2',
       'weathersit_3', 'weathersit_4'],
      dtype='object')

##### numeric to categorical conversion

In [8]:
def assign_hum(x):
    """Bucket a value into one of four ordinal levels, 0 through 3.

    The level equals the number of cut points (0.25, 0.5, 0.75) that ``x``
    strictly exceeds: a value exactly on a cut point falls into the lower
    bucket, and anything above 0.75 maps to 3.
    """
    cut_points = (0.25, 0.5, 0.75)
    return sum(1 for cut in cut_points if x > cut)
# Add binned versions of humidity and wind speed; both columns are bucketed
# with the same assign_hum cut points.
hour = hour.assign(
    hum1=hour['hum'].apply(assign_hum),
    windspeed1=hour['windspeed'].apply(assign_hum),
)

In [9]:
# One-hot encode the two binned columns and append the indicator columns
# (hum1_*, windspeed1_*) to the frame in a single concatenation.
binned_columns = ['hum1','windspeed1']
binned_dummies = [
    pd.get_dummies(hour[name], prefix=name, drop_first=False)
    for name in binned_columns
]
hour = pd.concat([hour] + binned_dummies, axis=1)

In [10]:
# Inspect the column index after adding the binned columns and their indicators.
hour.columns

Index(['instant', 'dteday', 'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday',
       'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
       'casual', 'registered', 'cnt', 'yr_0', 'yr_1', 'mnth_1', 'mnth_2',
       'mnth_3', 'mnth_4', 'mnth_5', 'mnth_6', 'mnth_7', 'mnth_8', 'mnth_9',
       'mnth_10', 'mnth_11', 'mnth_12', 'weathersit_1', 'weathersit_2',
       'weathersit_3', 'weathersit_4', 'hum1', 'windspeed1', 'hum1_0',
       'hum1_1', 'hum1_2', 'hum1_3', 'windspeed1_0', 'windspeed1_1',
       'windspeed1_2', 'windspeed1_3'],
      dtype='object')

##### Scaling

In [11]:
quant_features = ['casual', 'registered', 'cnt', 'temp', 'hum', 'windspeed']
# Store scalings in a dictionary so we can convert back later:
# scaled_features[col] holds [mean, std] of the unscaled column, so the original
# values can be recovered as  scaled * std + mean.
# NOTE(review): the statistics are computed over the whole frame, i.e. before the
# train / validation / test split performed further down.
scaled_features = {}
for col in quant_features:
    # np.std is kept (NumPy's default ddof=0) so the scaled values are unchanged.
    mean, std = np.mean(hour[col]), np.std(hour[col])
    scaled_features[col] = [mean, std]
    hour[col] = (hour[col] - mean) / std


In [12]:
# Spot-check the first rows of the columns that were just standardized.
hour[quant_features].head()

Unnamed: 0,casual,registered,cnt,temp,hum,windspeed
0,-0.662755,-0.930189,-0.956339,-1.334648,0.947372,-1.553889
1,-0.561343,-0.804655,-0.824022,-1.438516,0.895539,-1.553889
2,-0.62219,-0.83769,-0.868128,-1.438516,0.895539,-1.553889
3,-0.662755,-0.95001,-0.972879,-1.334648,0.63637,-1.553889
4,-0.723603,-1.009474,-1.039037,-1.334648,0.63637,-1.553889


In [13]:
# List the current columns before deciding which ones to drop.
hour.columns

Index(['instant', 'dteday', 'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday',
       'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
       'casual', 'registered', 'cnt', 'yr_0', 'yr_1', 'mnth_1', 'mnth_2',
       'mnth_3', 'mnth_4', 'mnth_5', 'mnth_6', 'mnth_7', 'mnth_8', 'mnth_9',
       'mnth_10', 'mnth_11', 'mnth_12', 'weathersit_1', 'weathersit_2',
       'weathersit_3', 'weathersit_4', 'hum1', 'windspeed1', 'hum1_0',
       'hum1_1', 'hum1_2', 'hum1_3', 'windspeed1_0', 'windspeed1_1',
       'windspeed1_2', 'windspeed1_3'],
      dtype='object')

In [14]:
# Remove these columns from the working frame.
# NOTE(review): 'hum1' is dropped but its counterpart 'windspeed1' is not, so the
# integer bin column stays in the frame next to its windspeed1_* indicator
# columns -- confirm this is intended.
columns_to_drop = [
    'instant', 'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday',
    'workingday', 'weathersit', 'atemp', 'hum', 'hum1', 'windspeed',
]
hour = hour.drop(columns_to_drop, axis=1)

In [15]:
# Confirm which columns remain after the drop.
hour.columns

Index(['dteday', 'temp', 'casual', 'registered', 'cnt', 'yr_0', 'yr_1',
       'mnth_1', 'mnth_2', 'mnth_3', 'mnth_4', 'mnth_5', 'mnth_6', 'mnth_7',
       'mnth_8', 'mnth_9', 'mnth_10', 'mnth_11', 'mnth_12', 'weathersit_1',
       'weathersit_2', 'weathersit_3', 'weathersit_4', 'windspeed1', 'hum1_0',
       'hum1_1', 'hum1_2', 'hum1_3', 'windspeed1_0', 'windspeed1_1',
       'windspeed1_2', 'windspeed1_3'],
      dtype='object')

##### test train split

In [16]:
# Number of trailing rows held out for testing (approximately the last 21 days).
n_test_rows = 21 * 24

# Everything before the tail stays in `data`; the tail itself is the test set.
data = hour.iloc[:-n_test_rows]
test_data = hour.iloc[-n_test_rows:]

# Columns that are predicted rather than used as inputs.
target_fields = ['cnt', 'casual', 'registered']

# Separate inputs from targets for both partitions.
features = data.drop(target_fields, axis=1)
targets = data[target_fields]
test_features = test_data.drop(target_fields, axis=1)
test_targets = test_data[target_fields]

##### validation split

In [17]:
# Number of trailing rows of the remaining data used for validation
# (the last 60 days or so).
n_val_rows = 60 * 24

# Training set: everything except the validation tail.
train_features = features.iloc[:-n_val_rows]
train_targets = targets.iloc[:-n_val_rows]

# Validation set: the tail itself.
val_features = features.iloc[-n_val_rows:]
val_targets = targets.iloc[-n_val_rows:]

##### save data sets

In [18]:
def save_(df, save_name, data_dir='../data/'):
    """Write ``df`` to ``<data_dir>/<save_name>.csv`` without the index column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    save_name : str
        File name without the ``.csv`` extension.
    data_dir : str, optional
        Target directory. Defaults to ``'../data/'``, the location that used to
        be hard-coded, so existing two-argument calls write to the same place.
    """
    import os  # local import keeps this cell self-contained

    df.to_csv(os.path.join(data_dir, '{}.csv'.format(save_name)), index=False)

In [19]:
# Persist the training inputs. The 'feautures' spelling is kept verbatim because
# it becomes the output file name.
save_(df=train_features, save_name='train_feautures')

In [20]:
# Persist the training targets.
save_(df=train_targets, save_name='train_targets')

In [21]:
# Persist the validation inputs. The 'feautures' spelling is kept verbatim because
# it becomes the output file name.
save_(df=val_features, save_name='val_feautures')

In [22]:
# Persist the validation targets.
save_(df=val_targets, save_name='val_targets')

In [23]:
# Persist the test inputs. The 'feautures' spelling is kept verbatim because
# it becomes the output file name.
save_(df=test_features, save_name='test_feautures')

In [24]:
# Persist the test targets.
save_(df=test_targets, save_name='test_targets')