In [1]:
# importing base libraries
import numpy as np # for matrix maths
import pandas as pd # for data frames

# sklearn modules
from sklearn.cross_validation import ShuffleSplit # to split in grid search
from sklearn.cross_validation import train_test_split # to split data sets
from sklearn.ensemble import RandomForestRegressor # random forest regressor class
from sklearn.grid_search import GridSearchCV # for parameter tuning
from sklearn.metrics import accuracy_score # classification accuracy metric
from sklearn.metrics import make_scorer # to make scorer function
from sklearn.metrics import r2_score # regression R^2 metric
from sklearn.neighbors import KNeighborsClassifier # Knn classifier class
from sklearn.neighbors import KNeighborsRegressor # Knn regressor class



In [2]:
# loading tracks data sets 

# loading go_tracks_data in DF
go_tracks = pd.read_csv('./GPSTrajectory/go_track_tracks.csv')

# printing shape of dataset 
print("The shape of data set is {}".format(go_tracks.shape))

# head of go_tracks data 
go_tracks.head()

The shape of data set is (163, 10)


Unnamed: 0,id,id_android,speed,time,distance,rating,rating_bus,rating_weather,car_or_bus,linha
0,1,0,19.210586,0.138049,2.652,3,0,0,1,
1,2,0,30.848229,0.171485,5.29,3,0,0,1,
2,3,1,13.560101,0.067699,0.918,3,0,0,2,
3,4,1,19.766679,0.389544,7.7,3,0,0,2,
4,8,0,25.807401,0.154801,3.995,2,0,0,1,


In [3]:
# loading tracks points 

# tracks points in DF 
go_tracks_points = pd.read_csv('./GPSTrajectory/go_track_trackspoints.csv')

# printing shape of track points 
print("The shape of points data set is {}".format(go_tracks_points.shape))

go_tracks_points.head()

The shape of points data set is (18107, 5)


Unnamed: 0,id,latitude,longitude,track_id,time
0,1,-10.939341,-37.062742,1,2014-09-13 07:24:32
1,2,-10.939341,-37.062742,1,2014-09-13 07:24:37
2,3,-10.939324,-37.062765,1,2014-09-13 07:24:42
3,4,-10.939211,-37.062843,1,2014-09-13 07:24:47
4,5,-10.938939,-37.062879,1,2014-09-13 07:24:53


In [5]:
# features of track points 
features_points = go_tracks_points[['latitude','longitude']]

# output of track points
output_points = go_tracks_points.track_id

# breaking data set into train test track_points 
X_points_train,X_points_test,y_points_train, y_points_test = train_test_split(features_points,output_points)

In [35]:
# applying grid search for model selection 
# from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import accuracy_score

model_points = KNeighborsRegressor(leaf_size=50, n_neighbors=10,n_jobs=3)
model_points.fit(X_points_train,y_points_train)

def model_fit(X,y):
    cv_set = ShuffleSplit(X.shape[0],test_size=0.2,random_state=42)
    
    score_fun = make_scorer(accuracy_score)
    
    knn = KNeighborsClassifier()
    
    params = {
        'leaf_size':list(range(30,50)),
        'n_neighbors':list(range(5,10)),
        'n_jobs':list(range(3,5))
    }
    
    grid = GridSearchCV(knn,scoring=score_fun, param_grid=params,cv=cv_set)
    
    grid.fit(X,y)
    
    return grid.best_estimator_

In [None]:
# pred = model_points.predict(X_points_test)
# from sklearn.metrics import r2_score

# r2_score(y_points_test,pred)

mod = model_fit(X_points_train,y_points_train)
print(mod)

pred = mod.prdict(X_points_test)

accuracy_score(pred,y_points_test)

In [17]:
from sklearn.ensemble import RandomForestRegressor
mp = RandomForestRegressor(n_estimators=250)

mp.fit(X_points_train,y_points_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [19]:
pred =  mp.predict(X_points_test)
from sklearn.metrics import r2_score
# accuracy_score(pred,y_points_test)
r2_score(y_points_test,pred)

0.6296076735550935