In [1]:
%matplotlib inline

In [2]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from collections import Counter
from matplotlib import pyplot as plt
import numpy as np
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor

In [3]:
import pandas as pd
# Load the train and test CSVs (paths are relative to the working directory),
# parsing the timestamp columns into datetimes. Only pickup_datetime is parsed
# for the test file.
train_data = pd.read_csv('train.csv', parse_dates=['pickup_datetime', 'dropoff_datetime'])
test_data = pd.read_csv('test.csv', parse_dates=['pickup_datetime'])

In [4]:
# Summarise column dtypes, non-null counts and memory usage of the training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 11 columns):
id                    1458644 non-null object
vendor_id             1458644 non-null int64
pickup_datetime       1458644 non-null datetime64[ns]
dropoff_datetime      1458644 non-null datetime64[ns]
passenger_count       1458644 non-null int64
pickup_longitude      1458644 non-null float64
pickup_latitude       1458644 non-null float64
dropoff_longitude     1458644 non-null float64
dropoff_latitude      1458644 non-null float64
store_and_fwd_flag    1458644 non-null object
trip_duration         1458644 non-null int64
dtypes: datetime64[ns](2), float64(4), int64(3), object(2)
memory usage: 122.4+ MB


In [5]:
# Pickup/dropoff coordinates of every training trip as a plain NumPy array
# (one row per trip, four columns). The commented-out K-Means cells below
# reference it as their clustering input.
tocluster = train_data[['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']].values

In [6]:
#scores = []
#for k in range(3, 16):
#    print("Current K is", k)
#    kmeans = KMeans(n_clusters=k, random_state=10)
#    labels = kmeans.fit(tocluster).labels_
#    scores.append(silhouette_score(tocluster, labels, metric='euclidean', sample_size=1000))
#plt.plot(range(3,16), scores)
#plt.show()

In [7]:
#kmeans = KMeans(n_clusters=8, random_state=10)

In [8]:
#cluster_labels = kmeans.fit(tocluster).labels_

In [9]:
#Counter(cluster_labels)

# Feature engineering

Let's create some useful features from the existing ones. At a minimum, the following are worth having:
1) the distance between the start and end points of the trip;
2) the day of the week;
3) the month number;
4) the number of seconds elapsed since midnight at pickup time.

In [10]:
def distance_on_sphere(lat1, long1, lat2, long2):
    """Return the central angle, in radians, between two points on a sphere.

    The angle is computed with the spherical law of cosines expressed in
    colatitude/longitude. Multiply the result by the sphere radius to get an
    arc length.

    Implemented with native NumPy broadcasting instead of ``np.vectorize``:
    ``np.vectorize`` runs a Python-level loop per element and raises
    ``ValueError`` on size-0 inputs when ``otypes`` is not set.

    Parameters
    ----------
    lat1, long1 : float or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, long2 : float or array-like
        Latitude and longitude of the second point(s), in degrees.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Central angle(s) in radians, in ``[0, pi]``. The inputs are broadcast
        against each other and the result has the broadcast shape (an empty
        input yields an empty array).
    """
    lat1, long1, lat2, long2 = (
        np.asarray(a, dtype=float) for a in (lat1, long1, lat2, long2)
    )
    d2r = np.pi / 180.0
    # phi = colatitude (angle measured from the north pole)
    phi1 = (90.0 - lat1) * d2r
    phi2 = (90.0 - lat2) * d2r
    # theta = longitude
    theta1 = long1 * d2r
    theta2 = long2 * d2r
    carc = (np.sin(phi1) * np.sin(phi2) * np.cos(theta1 - theta2) +
            np.cos(phi1) * np.cos(phi2))
    # Floating-point rounding can push the cosine slightly outside [-1, 1],
    # where arccos is undefined, so clamp it first.
    return np.arccos(np.clip(carc, -1.0, 1.0))

In [11]:
def get_features(X):
    """Return a copy of ``X`` with engineered distance and time features added.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``pickup_latitude``, ``pickup_longitude``,
        ``dropoff_latitude``, ``dropoff_longitude`` (degrees) and
        ``pickup_datetime`` (datetimes, or values ``pd.to_datetime`` accepts).

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with these extra columns:

        * ``dist``  - great-circle distance between pickup and dropoff,
          i.e. 6371 times the central angle (6371 is approximately the mean
          Earth radius in km, so the unit is presumably kilometres).
        * ``month`` - pickup month min-max scaled to ``[0, 1]`` over the
          months present in ``X``. Because the scaling is relative to this
          frame, two frames only get comparable values when they span the
          same months. A frame covering a single month gets 0.0 rather than
          the NaN a 0/0 division would produce.
        * ``wdsin``, ``wdcos`` - cyclic encoding of the weekday (period 7).
        * ``scsin``, ``sccos`` - cyclic encoding of the second of the day
          (period 86400).
    """
    res = X.copy()

    # 1-D coordinate arrays in, 1-D distances out.
    res['dist'] = 6371 * distance_on_sphere(X['pickup_latitude'].values,
                                            X['pickup_longitude'].values,
                                            X['dropoff_latitude'].values,
                                            X['dropoff_longitude'].values)

    # Vectorised datetime accessors instead of a Python lambda per row.
    pickup = pd.to_datetime(X['pickup_datetime']).dt

    month = pickup.month
    month_span = month.max() - month.min()
    if month_span == 0:
        # Only one month present: avoid 0/0 -> NaN.
        res['month'] = 0.0
    else:
        res['month'] = (month - month.min()) / month_span

    # Sine/cosine pairs keep the end of a cycle close to its start
    # (e.g. Sunday next to Monday, 23:59 next to 00:00).
    wd = pickup.weekday
    res['wdsin'] = np.sin(2 * np.pi * wd / 7.0)
    res['wdcos'] = np.cos(2 * np.pi * wd / 7.0)

    sc = pickup.hour * 3600.0 + pickup.minute * 60.0 + pickup.second
    res['scsin'] = np.sin(2 * np.pi * sc / 86400.0)
    res['sccos'] = np.cos(2 * np.pi * sc / 86400.0)

    return res

# Detect and drop outliers

In [12]:
def filter_train_dataset(X):
    """Drop training rows whose duration is implausible for the distance covered.

    A row is removed when either rule matches:

    * ``dist`` below 0.1 while ``trip_duration`` exceeds 5000, or
    * ``dist`` below 100 while ``trip_duration`` exceeds 80000.

    ``dist`` is the column added by the feature-engineering step; the units
    are presumably kilometres and seconds (TODO confirm against the data
    description).

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with ``dist`` and ``trip_duration`` columns.

    Returns
    -------
    pandas.DataFrame
        The rows of ``X`` that match neither rule, with their original index.
    """
    dist = X['dist']
    duration = X['trip_duration']
    barely_moved_but_long = (dist < 0.1) & (duration > 5000)
    short_but_extremely_long = (dist < 100) & (duration > 80000)
    keep = ~barely_moved_but_long & ~short_but_extremely_long
    return X.loc[keep]

In [13]:
# Engineer features for the training set, then drop the outlier trips.
to_analyze_filtered = filter_train_dataset(get_features(train_data))
# Target column; the double-bracket selection keeps it as a 2-D (n, 1) array.
y = to_analyze_filtered[['trip_duration']].values
# Model inputs: every remaining column except identifiers/flags, the raw
# timestamps and the target itself.
X = to_analyze_filtered.drop(['store_and_fwd_flag','vendor_id', 'id', 'pickup_datetime', 'dropoff_datetime','trip_duration'], axis=1)


In [16]:
def clusterized_regressor(traindataset, testdataset=None):
    """Fit one random-forest regressor per K-Means cluster of trip coordinates.

    Training rows are feature-engineered with ``get_features``, filtered with
    ``filter_train_dataset`` and grouped into 8 clusters by their
    pickup/dropoff coordinates. A separate ``RandomForestRegressor`` is then
    fitted on the rows of each cluster to predict ``trip_duration``.

    Parameters
    ----------
    traindataset : pandas.DataFrame
        Raw training frame; must contain the coordinate columns,
        ``pickup_datetime``, ``dropoff_datetime``, ``store_and_fwd_flag``,
        ``vendor_id``, ``id`` and ``trip_duration``.
    testdataset : pandas.DataFrame, optional
        Raw test frame with the same input columns (no target needed). Test
        rows are feature-engineered but not filtered.

    Returns
    -------
    dict or pandas.DataFrame
        Without ``testdataset``: a dict mapping each cluster label to the
        ``predict`` method of that cluster's fitted regressor.
        With ``testdataset``: the feature-engineered test frame with an extra
        float column ``pred`` holding the predicted trip duration per row.
    """
    coord_cols = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
    # Identifiers/flags, raw timestamps and the target are not model inputs.
    non_feature_cols = ['store_and_fwd_flag', 'vendor_id', 'id', 'pickup_datetime',
                        'dropoff_datetime', 'trip_duration']

    to_analyze_filtered = filter_train_dataset(get_features(traindataset))

    # n_jobs is intentionally not passed: KMeans dropped that parameter in
    # scikit-learn 1.0 and passing it raises TypeError there.
    kmeans = KMeans(n_clusters=8, random_state=10)
    cluster_labels = kmeans.fit(to_analyze_filtered[coord_cols].values).labels_

    train_features = to_analyze_filtered.drop(non_feature_cols, axis=1)
    # Remember the exact training columns (and their order) so the test
    # matrix is guaranteed to line up with what the models were fitted on.
    feature_cols = list(train_features.columns)
    X = train_features.values
    y = to_analyze_filtered['trip_duration'].values

    clfs = {}
    # np.unique only yields labels that actually occur, so every cluster
    # visited here has at least one training row.
    for lb in np.unique(cluster_labels):
        in_cluster = cluster_labels == lb
        clf = RandomForestRegressor(max_depth=5, random_state=0, n_jobs=-1, n_estimators=10)
        clf.fit(X[in_cluster], y[in_cluster])
        clfs[lb] = clf.predict
        print("Evaluating the lb: ", lb)

    if testdataset is None:
        return clfs

    to_analyze_test = get_features(testdataset)
    test_labels = kmeans.predict(to_analyze_test[coord_cols].values)
    X_test = to_analyze_test[feature_cols].values

    # Rows assigned to a cluster that has no fitted model (not expected in
    # practice) fall back to the overall median training duration.
    predictions = np.full(len(X_test), float(np.median(y)), dtype=float)
    # One batched predict call per cluster instead of one call per test row.
    for lb, predict in clfs.items():
        in_cluster = test_labels == lb
        if in_cluster.any():
            predictions[in_cluster] = predict(X_test[in_cluster])

    to_analyze_test['pred'] = predictions
    return to_analyze_test


In [None]:
# Train the per-cluster models on the full training set and predict for the
# test set; with a test set supplied the function returns the test feature
# frame with an added 'pred' column.
result = clusterized_regressor(train_data, test_data)

Evaluating the lb:  0
Evaluating the lb:  1
Evaluating the lb:  2
Evaluating the lb:  3
Evaluating the lb:  4
Evaluating the lb:  5
Evaluating the lb:  6
Evaluating the lb:  7
