# TP2: Machine Learning

In [1]:
import pandas as pd

# import ML packages
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn import linear_model
from sklearn import preprocessing

# import plotting packages
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# Read the pre-processed train/test trips and the raw test file in one pass;
# the unpacking order matches the order of the paths below.
trainingSet, testingSet, testingOriginal = map(pd.read_csv, [
    '../CSVs/improved_trip_train.csv',
    '../CSVs/improved_trip_test.csv',
    '../CSVs/trip_test.csv',
])

# Show which training columns are numeric (int64) and which are object-typed.
trainingSet.dtypes

id                     int64
duration               int64
start_date            object
start_dayOfWeek        int64
start_week             int64
start_quarter          int64
start_time            object
start_hour             int64
start_minute           int64
start_station_name    object
start_station_id       int64
end_date              object
end_dayOfWeek          int64
end_week               int64
end_quarter            int64
end_time              object
end_hour               int64
end_minute             int64
end_station_name      object
end_station_id         int64
bike_id                int64
subscription_type     object
zip_code              object
dtype: object

In [3]:
# Preview the first five training rows.
trainingSet.head(n=5)

Unnamed: 0,id,duration,start_date,start_dayOfWeek,start_week,start_quarter,start_time,start_hour,start_minute,start_station_name,...,end_week,end_quarter,end_time,end_hour,end_minute,end_station_name,end_station_id,bike_id,subscription_type,zip_code
0,907649,396,2015-08-27,3,35,3,08:36:00,8,36,Harry Bridges Plaza (Ferry Building),...,35,3,08:43:00,8,43,Embarcadero at Sansome,60,187,Subscriber,94602
1,384043,636,2014-07-28,0,31,3,22:06:00,22,6,Market at 10th,...,31,3,22:17:00,22,17,Washington at Kearny,46,417,Subscriber,94133
2,316176,334,2014-06-09,0,24,2,08:42:00,8,42,Market at Sansome,...,24,2,08:47:00,8,47,2nd at Folsom,62,281,Subscriber,94107
3,618874,666,2015-01-26,0,5,1,16:55:00,16,55,San Francisco Caltrain 2 (330 Townsend),...,5,1,17:07:00,17,7,Temporary Transbay Terminal (Howard at Beale),55,634,Subscriber,94602
4,910977,318,2015-08-29,5,35,3,15:09:00,15,9,Market at 10th,...,35,3,15:14:00,15,14,Powell Street BART,39,607,Subscriber,94709


In [4]:
# (rows, columns) of the training set.
trainingSet.shape

(549961, 23)

In [5]:
# Preview the first five test rows; unlike the training set, there is no
# `duration` column here because it is the value to predict.
testingSet.head(n=5)

Unnamed: 0,id,start_date,start_dayOfWeek,start_week,start_quarter,start_time,start_hour,start_minute,start_station_name,start_station_id,...,end_week,end_quarter,end_time,end_hour,end_minute,end_station_name,end_station_id,bike_id,subscription_type,zip_code
0,504737,2014-10-18,5,42,4,11:25:00,11,25,Embarcadero at Sansome,60,...,42,4,12:12:00,12,12,Powell at Post (Union Square),71,426,Customer,77009
1,530846,2014-11-05,2,45,4,13:00:00,13,0,Embarcadero at Folsom,51,...,45,4,13:09:00,13,9,Broadway St at Battery St,82,454,Subscriber,94132
2,813140,2015-06-18,3,25,2,17:34:00,17,34,San Francisco Caltrain (Townsend at 4th),70,...,25,2,17:37:00,17,37,2nd at Townsend,61,370,Subscriber,94107
3,897674,2015-08-20,3,34,3,07:06:00,7,6,Civic Center BART (7th at Market),72,...,34,3,07:15:00,7,15,Townsend at 7th,65,451,Subscriber,94582
4,322830,2014-06-13,4,24,2,08:46:00,8,46,San Francisco Caltrain 2 (330 Townsend),69,...,24,2,08:57:00,8,57,Embarcadero at Folsom,51,603,Subscriber,95014


In [6]:
# Average trip duration over the whole training set.
trainingSet['duration'].mean()

1113.4153549069842

### Let's study the correlation between duration and the other numeric features

In [7]:
# Pearson correlation of every numeric column with `duration`.
# The frame also holds object columns (dates, times, station names, ...);
# calling .corr() on them raises on recent pandas, so restrict to numeric
# columns explicitly. This yields the same result on older versions too.
trainingSet.select_dtypes(include='number').corr()['duration']

id                 -0.003133
duration            1.000000
start_dayOfWeek     0.018351
start_week          0.004002
start_quarter       0.003648
start_hour          0.002598
start_minute        0.002586
start_station_id   -0.007001
end_dayOfWeek       0.013753
end_week            0.002100
end_quarter         0.001376
end_hour            0.009626
end_minute         -0.000530
end_station_id     -0.006026
bike_id            -0.001569
Name: duration, dtype: float64

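The features most correlated with duration (by absolute value) are listed below. Note that every one of these correlations is very weak (|r| < 0.02), so a linear model built on them cannot be expected to explain much of the variation in duration: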
    - start_dayOfWeek     0.018351
    - end_dayOfWeek       0.013753
    - end_hour            0.009626
    - start_station_id   -0.007001

In [8]:
# Keep the training column names as a plain Python list and display it.
columns = list(trainingSet.columns)
columns

['id',
 'duration',
 'start_date',
 'start_dayOfWeek',
 'start_week',
 'start_quarter',
 'start_time',
 'start_hour',
 'start_minute',
 'start_station_name',
 'start_station_id',
 'end_date',
 'end_dayOfWeek',
 'end_week',
 'end_quarter',
 'end_time',
 'end_hour',
 'end_minute',
 'end_station_name',
 'end_station_id',
 'bike_id',
 'subscription_type',
 'zip_code']

In [9]:
# Re-check the column dtypes before choosing model features
# (same information as displayed right after loading).
trainingSet.dtypes

id                     int64
duration               int64
start_date            object
start_dayOfWeek        int64
start_week             int64
start_quarter          int64
start_time            object
start_hour             int64
start_minute           int64
start_station_name    object
start_station_id       int64
end_date              object
end_dayOfWeek          int64
end_week               int64
end_quarter            int64
end_time              object
end_hour               int64
end_minute             int64
end_station_name      object
end_station_id         int64
bike_id                int64
subscription_type     object
zip_code              object
dtype: object

In [10]:
# Create a linear regression estimator with scikit-learn's default settings;
# it is fitted on the training set in a later cell.
regr = linear_model.LinearRegression()

In [11]:
# Confirm the training set size (rows, columns) right before fitting.
trainingSet.shape

(549961, 23)

In [12]:
# Fit the regression on the four columns singled out in the correlation study,
# with the observed trip duration as the target.
X_train = trainingSet[['start_dayOfWeek', 'end_dayOfWeek', 'end_hour', 'start_station_id']]
y_train = trainingSet['duration']
regr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [13]:
# Predict a duration for every test trip, using the same four columns
# (in the same order) that the model was fitted on.
X_test = testingSet[['start_dayOfWeek', 'end_dayOfWeek', 'end_hour', 'start_station_id']]
prediction = regr.predict(X_test)

In [14]:
# Display the predicted durations (one value per row of testingSet).
prediction

array([ 1603.57291184,  1069.66886328,  1298.9958606 , ...,  1784.36773396,
        1544.50655641,   596.68919234])

In [40]:
# Pair each test-trip id with its predicted duration. `prediction` was computed
# from testingSet's rows, so it is already row-aligned with testingSet.
duration = {
    'id' : testingSet.id,
    'duration' : prediction}
predicted = pd.DataFrame(duration, columns = ['id','duration'])

# Predictions joined onto the raw (un-engineered) test trips.
df_duration = pd.merge(predicted, testingOriginal, on = 'id', how = 'inner')

# Predictions next to the features that produced them. This is built by column
# selection rather than by merging the predictions back onto testingSet by id:
# that self-merge was redundant and would multiply rows if an id ever repeated.
df_duration_bis = (
    testingSet[['id', 'start_dayOfWeek', 'end_dayOfWeek', 'end_hour', 'start_station_id']]
    .assign(duration=prediction)
    .loc[:, ['id','duration','start_dayOfWeek', 'end_dayOfWeek', 'end_hour', 'start_station_id']]
    .reset_index(drop=True)  # same fresh 0..n-1 index a merge would produce
)

In [41]:
# Preview only the first rows instead of rendering the whole frame.
df_duration.head(10)

Unnamed: 0,id,duration,start_date,start_station_name,start_station_id,end_date,end_station_name,end_station_id,bike_id,subscription_type,zip_code
0,504737,1603.572912,10/18/2014 11:25,Embarcadero at Sansome,60,10/18/2014 12:12,Powell at Post (Union Square),71,426,Customer,77009
1,530846,1069.668863,11/5/2014 13:00,Embarcadero at Folsom,51,11/5/2014 13:09,Broadway St at Battery St,82,454,Subscriber,94132
2,813140,1298.995861,6/18/2015 17:34,San Francisco Caltrain (Townsend at 4th),70,6/18/2015 17:37,2nd at Townsend,61,370,Subscriber,94107
3,897674,836.063931,8/20/2015 7:06,Civic Center BART (7th at Market),72,8/20/2015 7:15,Townsend at 7th,65,451,Subscriber,94582
4,322830,1126.487657,6/13/2014 8:46,San Francisco Caltrain 2 (330 Townsend),69,6/13/2014 8:57,Embarcadero at Folsom,51,603,Subscriber,95014
5,487841,1118.016993,10/7/2014 21:41,2nd at Townsend,61,10/7/2014 21:47,Post at Kearny,47,478,Subscriber,94115
6,677808,1150.710172,3/11/2015 18:09,Market at 10th,67,3/11/2015 18:22,San Francisco Caltrain (Townsend at 4th),70,505,Subscriber,94025
7,704449,835.598940,3/30/2015 17:29,Embarcadero at Vallejo,48,3/30/2015 17:35,Steuart at Market,74,356,Subscriber,94536
8,833587,1760.988055,7/5/2015 11:54,Market at 10th,67,7/5/2015 12:00,Market at 4th,76,401,Subscriber,94102
9,420411,1534.923806,8/22/2014 13:30,Embarcadero at Vallejo,48,8/22/2014 13:41,5th at Howard,57,363,Subscriber,94114


In [42]:
# Preview only the first rows instead of rendering the whole frame.
df_duration_bis.head(10)

Unnamed: 0,id,duration,start_dayOfWeek,end_dayOfWeek,end_hour,start_station_id
0,504737,1603.572912,5,5,12,60
1,530846,1069.668863,2,2,13,51
2,813140,1298.995861,3,3,17,70
3,897674,836.063931,3,3,7,72
4,322830,1126.487657,4,4,8,69
5,487841,1118.016993,1,1,21,61
6,677808,1150.710172,2,2,18,67
7,704449,835.598940,0,0,17,48
8,833587,1760.988055,6,6,12,67
9,420411,1534.923806,4,4,13,48


In [None]:
# Write both result frames to disk, without the index column.
for frame, path in (
    (df_duration, '../CSVs/predicted_duration.csv'),
    (df_duration_bis, '../CSVs/predicted_duration_bis.csv'),
):
    frame.to_csv(path, index=False)