In [21]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import datetime
from math import radians, cos, sin, asin, sqrt 
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, validation_curve, GridSearchCV

In [22]:
train_df = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [23]:
train_df

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.964630,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.010040,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.782520,N,435
...,...,...,...,...,...,...,...,...,...,...,...
1458639,id2376096,2,2016-04-08 13:31:04,2016-04-08 13:44:02,4,-73.982201,40.745522,-73.994911,40.740170,N,778
1458640,id1049543,1,2016-01-10 07:35:15,2016-01-10 07:46:10,1,-74.000946,40.747379,-73.970184,40.796547,N,655
1458641,id2304944,2,2016-04-22 06:57:41,2016-04-22 07:10:25,1,-73.959129,40.768799,-74.004433,40.707371,N,764
1458642,id2714485,1,2016-01-05 15:56:26,2016-01-05 16:02:39,1,-73.982079,40.749062,-73.974632,40.757107,N,373


In [24]:
def drop_val(df):
    df = df.drop("vendor_id", 1)
    df = df.drop("store_and_fwd_flag", 1)
    return df
train_df = drop_val(train_df)
test = drop_val(test)

In [25]:
def to_tstamp(df):
    df["pickup_datetime"] =  pd.to_datetime(df["pickup_datetime"])
    df["dropoff_datetime"] =  pd.to_datetime(df["dropoff_datetime"])
    df["pickup_datetime"] = df["pickup_datetime"].map(lambda x: x.timestamp()).astype(int)
    df["dropoff_datetime"] = df["dropoff_datetime"].map(lambda x: x.timestamp()).astype(int)
    return df
def to_tstamp_test(df):
    df["pickup_datetime"] =  pd.to_datetime(df["pickup_datetime"])
    df["pickup_datetime"] = df["pickup_datetime"].map(lambda x: x.timestamp()).astype(int)
    return df
train_df = to_tstamp(train_df)
test = to_tstamp_test(test)
train_df

Unnamed: 0,id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
0,id2875421,1457976295,1457976750,1,-73.982155,40.767937,-73.964630,40.765602,455
1,id2377394,1465692215,1465692878,1,-73.980415,40.738564,-73.999481,40.731152,663
2,id3858529,1453203324,1453205448,1,-73.979027,40.763939,-74.005333,40.710087,2124
3,id3504673,1459971151,1459971580,1,-74.010040,40.719971,-74.012268,40.706718,429
4,id2181028,1458999055,1458999490,1,-73.973053,40.793209,-73.972923,40.782520,435
...,...,...,...,...,...,...,...,...,...
1458639,id2376096,1460122264,1460123042,4,-73.982201,40.745522,-73.994911,40.740170,778
1458640,id1049543,1452411315,1452411970,1,-74.000946,40.747379,-73.970184,40.796547,655
1458641,id2304944,1461308261,1461309025,1,-73.959129,40.768799,-74.004433,40.707371,764
1458642,id2714485,1452009386,1452009759,1,-73.982079,40.749062,-73.974632,40.757107,373


In [26]:
train_df

Unnamed: 0,id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
0,id2875421,1457976295,1457976750,1,-73.982155,40.767937,-73.964630,40.765602,455
1,id2377394,1465692215,1465692878,1,-73.980415,40.738564,-73.999481,40.731152,663
2,id3858529,1453203324,1453205448,1,-73.979027,40.763939,-74.005333,40.710087,2124
3,id3504673,1459971151,1459971580,1,-74.010040,40.719971,-74.012268,40.706718,429
4,id2181028,1458999055,1458999490,1,-73.973053,40.793209,-73.972923,40.782520,435
...,...,...,...,...,...,...,...,...,...
1458639,id2376096,1460122264,1460123042,4,-73.982201,40.745522,-73.994911,40.740170,778
1458640,id1049543,1452411315,1452411970,1,-74.000946,40.747379,-73.970184,40.796547,655
1458641,id2304944,1461308261,1461309025,1,-73.959129,40.768799,-74.004433,40.707371,764
1458642,id2714485,1452009386,1452009759,1,-73.982079,40.749062,-73.974632,40.757107,373


In [27]:


def haversine(lon1, lat1, lon2, lat2):
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    km = 6367 * c
    return km

train_df['distance'] = train_df.apply(lambda x: haversine(x['pickup_longitude'], x['pickup_latitude'], x['dropoff_longitude'], x['dropoff_latitude']), axis=1)
test['distance'] = test.apply(lambda x: haversine(x['pickup_longitude'], x['pickup_latitude'], x['dropoff_longitude'], x['dropoff_latitude']), axis=1)

In [28]:
target = train_df["trip_duration"]

In [33]:
arr = train_df["distance"].array

Поделим выборку на обучающую (x_train, y_train) и отложенную (x_test, y_test)

In [30]:
itog_val = {}
train_df = train_df.drop("trip_duration", 1)
train_df = train_df.drop("id", 1)
x_train, x_test, y_train, y_test = train_test_split(train_df["distance"].array, target, test_size=0.01) 

In [31]:
train_df

Unnamed: 0,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,distance
0,1457976295,1457976750,1,-73.982155,40.767937,-73.964630,40.765602,1.497580
1,1465692215,1465692878,1,-73.980415,40.738564,-73.999481,40.731152,1.804374
2,1453203324,1453205448,1,-73.979027,40.763939,-74.005333,40.710087,6.381090
3,1459971151,1459971580,1,-74.010040,40.719971,-74.012268,40.706718,1.484566
4,1458999055,1458999490,1,-73.973053,40.793209,-73.972923,40.782520,1.187842
...,...,...,...,...,...,...,...,...
1458639,1460122264,1460123042,4,-73.982201,40.745522,-73.994911,40.740170,1.224311
1458640,1452411315,1452411970,1,-74.000946,40.747379,-73.970184,40.796547,6.046037
1458641,1461308261,1461309025,1,-73.959129,40.768799,-74.004433,40.707371,7.819693
1458642,1452009386,1452009759,1,-73.982079,40.749062,-73.974632,40.757107,1.091878


In [32]:
model_lr = LinearRegression(n_jobs=-1)
#model_dtr = DecisionTreeRegressor()
model_rfr = RandomForestRegressor(n_estimators = 20, n_jobs=-1)
#model_lr.fit(x_train, y_train)
#model_dtr.fit(x_train, y_train)
model_rfr.fit(x_train, y_train)
#print("LinearRegression:", model_lr.score(x_test, y_test))
#print("DecisionTreeRegressor:", model_dtr.score(x_test, y_test))
print("RandomForestRegressor:", model_rfr.score(x_test, y_test))

ValueError: Expected 2D array, got 1D array instead:
array=[ 7.696092  22.642883   1.928254  ...  2.2240791  8.212519   1.6602074].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.