In [24]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [25]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from collections import Counter
from matplotlib import pyplot as plt
import numpy as np
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from pyearth import Earth
from sklearn.linear_model import LinearRegression

In [26]:
import pandas as pd
# Load the train/test CSVs from the working directory. The listed timestamp
# columns are parsed at read time; only pickup_datetime is parsed for the test set.
train_data = pd.read_csv('train.csv', parse_dates=['pickup_datetime', 'dropoff_datetime'])
test_data = pd.read_csv('test.csv', parse_dates=['pickup_datetime'])

In [27]:
# Print column dtypes, non-null counts and memory usage of the training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 11 columns):
id                    1458644 non-null object
vendor_id             1458644 non-null int64
pickup_datetime       1458644 non-null datetime64[ns]
dropoff_datetime      1458644 non-null datetime64[ns]
passenger_count       1458644 non-null int64
pickup_longitude      1458644 non-null float64
pickup_latitude       1458644 non-null float64
dropoff_longitude     1458644 non-null float64
dropoff_latitude      1458644 non-null float64
store_and_fwd_flag    1458644 non-null object
trip_duration         1458644 non-null int64
dtypes: datetime64[ns](2), float64(4), int64(3), object(2)
memory usage: 122.4+ MB


In [28]:
# Pickup/dropoff coordinates as a 2-D array (one row per trip, four columns),
# prepared as input for the K-means experiments in the following cells.
tocluster = train_data[['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']].values

In [29]:
#scores = []
#for k in range(3, 16):
#    print("Current K is", k)
#    kmeans = KMeans(n_clusters=k, random_state=10)
#    labels = kmeans.fit(tocluster).labels_
#    scores.append(silhouette_score(tocluster, labels, metric='euclidean', sample_size=1000))
#plt.plot(range(3,16), scores)
#plt.show()

In [30]:
#kmeans = KMeans(n_clusters=8, random_state=10)

In [31]:
#cluster_labels = kmeans.fit(tocluster).labels_

In [32]:
#Counter(cluster_labels)

# Feature engineering

Let's create some useful features from the existing ones. At a minimum, the following are worth having:
1) distance between the start and end points of the trip;
2) day of the week;
3) month number;
4) second of the day.

In [33]:
def distance_on_sphere(lat1, long1, lat2, long2):
    """Return the central angle, in radians, between two points on a unit sphere.

    Uses the spherical law of cosines on polar angles (colatitudes) and
    longitudes. Inputs are in degrees and may be scalars or array-likes;
    they are broadcast against each other. Multiply the result by a sphere
    radius to obtain an arc length.

    Implemented with native NumPy array operations instead of
    ``np.vectorize``: ``np.vectorize`` runs a Python-level loop per element
    and raises on size-0 input, whereas this version handles empty arrays.

    Parameters
    ----------
    lat1, long1 : float or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, long2 : float or array-like
        Latitude and longitude of the second point(s), in degrees.

    Returns
    -------
    numpy.ndarray
        Central angle(s) in radians, with the broadcast shape of the inputs
        (a 0-d array for scalar inputs).
    """
    d2r = np.pi / 180.0
    # phi = polar angle (90 degrees minus latitude)
    phi1 = (90.0 - np.asarray(lat1, dtype=float)) * d2r
    phi2 = (90.0 - np.asarray(lat2, dtype=float)) * d2r
    # theta = longitude
    theta1 = np.asarray(long1, dtype=float) * d2r
    theta2 = np.asarray(long2, dtype=float) * d2r
    carc = (np.sin(phi1) * np.sin(phi2) * np.cos(theta1 - theta2) +
            np.cos(phi1) * np.cos(phi2))
    # Rounding can push the cosine marginally outside [-1, 1]; clamp it so
    # arccos stays defined (same effect as the former sign-based branch).
    # NOTE: arccos is ill-conditioned near 1, so nearly coincident points may
    # yield a tiny non-zero angle rather than exactly 0.
    return np.asarray(np.arccos(np.clip(carc, -1.0, 1.0)))

In [34]:
def get_features(X):
    """Return a copy of ``X`` extended with engineered trip features.

    Added columns:

    * ``dist``  - pickup-to-dropoff great-circle distance: the central angle
      from ``distance_on_sphere`` times 6371 (Earth's mean radius in km).
    * ``month`` - pickup month scaled as ``(month - 1) / 11``. A fixed
      calendar scale is used rather than the frame's own min/max so that
      train and test frames get the same encoding and a single-month frame
      does not divide by zero.
    * ``wdsin`` / ``wdcos`` - cyclic encoding of the pickup weekday (period 7).
    * ``scsin`` / ``sccos`` - cyclic encoding of the pickup second-of-day
      (period 86400).

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``pickup_latitude``, ``pickup_longitude``,
        ``dropoff_latitude``, ``dropoff_longitude`` and ``pickup_datetime``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``X`` itself is not modified.
    """
    res = X.copy()

    # 1-D arrays in, 1-D distance out, so the column assignment is unambiguous.
    res['dist'] = 6371 * distance_on_sphere(
        X['pickup_latitude'].values,
        X['pickup_longitude'].values,
        X['dropoff_latitude'].values,
        X['dropoff_longitude'].values,
    )

    # to_datetime leaves datetime64 columns as they are and also accepts
    # object/string columns; the .dt accessor avoids a per-row Python lambda.
    pickup = pd.to_datetime(X['pickup_datetime'])

    res['month'] = (pickup.dt.month - 1) / 11.0

    weekday_angle = 2 * np.pi * pickup.dt.weekday / 7.0
    res['wdsin'] = np.sin(weekday_angle)
    res['wdcos'] = np.cos(weekday_angle)

    second_of_day = pickup.dt.hour * 3600.0 + pickup.dt.minute * 60.0 + pickup.dt.second
    day_angle = 2 * np.pi * second_of_day / 86400.0
    res['scsin'] = np.sin(day_angle)
    res['sccos'] = np.cos(day_angle)

    return res

# Detect and drop outliers

In [35]:
def filter_train_dataset(X):
    """Drop training rows whose duration is implausible for the distance covered.

    A row is removed when either holds:

    * ``dist`` below 0.1 while ``trip_duration`` exceeds 5000;
    * ``dist`` below 100 while ``trip_duration`` exceeds 80000.

    Returns the surviving rows of ``X`` with their original index labels.
    """
    dist = X['dist']
    duration = X['trip_duration']
    barely_moved_but_long = (dist < 0.1) & (duration > 5000)
    short_but_very_long = (dist < 100) & (duration > 80000)
    keep = ~barely_moved_but_long & ~short_but_very_long
    return X.loc[keep]

In [36]:
# Build the engineered, outlier-filtered training frame, then split it into
# the target (kept 2-D via the double-bracket selection) and the feature frame
# with identifier, timestamp and target columns removed.
# NOTE(review): clusterized_regressor recomputes the same frame internally, so
# these globals look unused by the cells visible here - confirm before removing.
to_analyze_filtered = filter_train_dataset(get_features(train_data))
y = to_analyze_filtered[['trip_duration']].values
X = to_analyze_filtered.drop(['store_and_fwd_flag','vendor_id', 'id', 'pickup_datetime', 'dropoff_datetime','trip_duration'], axis=1)


In [37]:
def clusterized_regressor(traindataset, testdataset=None, n_clusters=1):
    """Train one random-forest duration model per spatial K-means cluster.

    The training frame is run through ``get_features`` and
    ``filter_train_dataset``; its pickup/dropoff coordinates are clustered
    with K-means and a ``RandomForestRegressor`` is fitted on the rows of each
    cluster. If a test frame is supplied, each test row is assigned to a
    cluster and predicted by that cluster's model.

    Parameters
    ----------
    traindataset : pandas.DataFrame
        Raw training trips. Must contain the four coordinate columns,
        ``pickup_datetime``, ``trip_duration`` and the other columns dropped
        from the feature matrix below.
    testdataset : pandas.DataFrame, optional
        Raw trips to predict. When omitted only training is performed.
    n_clusters : int, default 1
        Number of K-means clusters. The default keeps the previous behaviour
        (a single model for all trips).

    Returns
    -------
    tuple
        ``(result, test_labels)`` when ``testdataset`` is given: predicted
        durations and the cluster label of every test row.
        ``(clfs, cluster_labels)`` otherwise: a ``{label: predict_callable}``
        dict and the cluster label of every filtered training row.
    """
    coord_cols = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
    non_feature_cols = ['store_and_fwd_flag', 'vendor_id', 'id', 'pickup_datetime',
                        'dropoff_datetime', 'trip_duration']

    to_analyze_filtered = filter_train_dataset(get_features(traindataset))

    # n_jobs is deliberately not passed: KMeans deprecated it in scikit-learn
    # 0.23 and removed it in 1.0, where it raises TypeError.
    kmeans = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = kmeans.fit(to_analyze_filtered[coord_cols].values).labels_

    train_features = to_analyze_filtered.drop(non_feature_cols, axis=1)
    # Remember the exact training columns so the test matrix is built from
    # the same columns in the same order.
    feature_cols = list(train_features.columns)
    X = train_features.values
    y = to_analyze_filtered['trip_duration'].values

    clfs = {}
    cluster_medians = {}
    # np.unique only yields labels that have at least one member, so every
    # label gets a fitted model (the former median-fallback branch was dead).
    for lb in np.unique(cluster_labels):
        in_cluster = cluster_labels == lb
        clf = RandomForestRegressor(max_depth=20, n_estimators=100, n_jobs=3, random_state=10)
        clf.fit(X[in_cluster, :], y[in_cluster])
        clfs[lb] = clf.predict
        cluster_medians[lb] = np.median(y[in_cluster])
        print("Evaluating the lb: ", lb)

    if testdataset is None:
        return clfs, cluster_labels

    to_analyze_test = get_features(testdataset)
    test_labels = kmeans.predict(to_analyze_test[coord_cols].values)
    X_test = to_analyze_test[feature_cols].values

    # Start from the overall training median so a row whose label has no model
    # never ends up with a silent 0 prediction.
    result = np.full(len(test_labels), float(np.median(y)))
    for lb in np.unique(cluster_labels):
        in_cluster = test_labels == lb
        # Predict only this cluster's rows; skip clusters with no test rows
        # because estimators reject empty input.
        if in_cluster.any():
            preds = clfs[lb](X_test[in_cluster])
            # Negative durations are meaningless; fall back to the cluster median.
            preds[preds < 0.0] = cluster_medians[lb]
            result[in_cluster] = preds
        print("Prediction for LB: ", lb)

    # ---- Finalize predictions
    # Very short trips are overridden with the median duration of comparable
    # training trips; the exact-zero rule is applied last so it wins.
    train_dist = to_analyze_filtered['dist'].values
    test_dist = to_analyze_test['dist'].values
    for test_mask, train_mask in ((test_dist < 0.1, train_dist < 0.1),
                                  (test_dist == 0, train_dist == 0)):
        # Without comparable training rows the median would be NaN; keep the
        # model prediction in that case.
        if train_mask.any():
            result[test_mask] = np.median(y[train_mask])
    return result, test_labels


In [38]:
# Train on the full training set and predict durations (plus cluster labels) for the test set.
result, labels = clusterized_regressor(train_data, test_data)

Evaluating the lb:  0
Prediction for LB:  0


In [39]:
# Peek at the first ten predicted durations for the test set.
result[:10]

array([  671.14418034,   651.72908929,   403.20230863,  1191.66157525,
         376.90323943,   911.1274353 ,  1018.56702824,   852.77143716,
        2457.64123499,   522.24756556])

In [40]:
# First ten training durations, for a rough sense of scale only: these are
# training rows, not the ground truth for the test predictions above.
train_data['trip_duration'][:10]



0     455
1     663
2    2124
3     429
4     435
5     443
6     341
7    1551
8     255
9    1225
Name: trip_duration, dtype: int64

In [41]:
# Cluster labels assigned to the first ten test rows.
labels[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [42]:
# Start an empty frame that will hold the submission columns.
final = pd.DataFrame()

In [43]:
# Carry over the trip ids from the test set.
final['id'] = test_data['id']

In [44]:
# Attach the predictions; result holds one value per test_data row, in row order.
final['trip_duration'] = result

In [45]:
# Show only the first rows; dumping the whole submission frame bloats the notebook output.
final.head(10)

Unnamed: 0,id,trip_duration
0,id3004672,671.144180
1,id3505355,651.729089
2,id1217141,403.202309
3,id2150126,1191.661575
4,id1598245,376.903239
5,id0668992,911.127435
6,id1765014,1018.567028
7,id0898117,852.771437
8,id3905224,2457.641235
9,id1543102,522.247566


In [46]:
# Write the submission (id, trip_duration) to result.csv without the index column.
final.to_csv('result.csv', index=False)