In [None]:
# Initial Python environment setup...
import numpy as np # linear algebra
import pandas as pd # CSV file I/O (e.g. pd.read_csv)
import os # reading the input files we have access to
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

# Show what the kernel can see: the input data, the working directory and
# their common parent.
for listed_dir in ('../input', '../working', '..'):
    print(os.listdir(listed_dir))

### Setup training data
First, let's read in our training data.  Kernels do not yet have enough memory to load the whole dataset at once, at least with `pd.read_csv`.  The entire dataset is about 55M rows, so we read only the first 13M rows (the `nrows` argument below) and skip the rest; with more memory it is certainly possible to build a model using all the data.

In [None]:
# Read only the first 13M training rows; `nrows` caps how much pd.read_csv parses.
train_df = pd.read_csv('../input/train.csv', nrows=13_000_000)

# A notebook cell renders only its final bare expression, so the previews are
# printed explicitly. `info()` already lists every column's dtype, which makes
# a separate `.dtypes` lookup redundant.
print(train_df.head(20))
train_df.info()

test_df = pd.read_csv('../input/test.csv')
print(test_df.head(20))
test_df.info()




In [None]:
def prepare_time_features(df):
    """Derive calendar features from ``pickup_datetime`` and add them to ``df``.

    The frame is modified in place and also returned.  ``pickup_datetime`` must
    hold strings shaped like ``'2015-01-27 13:08:24 UTC'``.

    Columns added, in this order: ``pickup_datetime_dt`` (parsed timestamp),
    ``hour_of_day``, ``month``, ``year``, ``day_of_year`` and ``day_of_week``
    (pandas ``dayofweek``: Monday is 0, Sunday is 6).
    """
    parsed = pd.to_datetime(df['pickup_datetime'], format='%Y-%m-%d %H:%M:%S UTC')
    df['pickup_datetime_dt'] = parsed

    # New column name -> attribute of the pandas ``.dt`` accessor that fills it.
    # Dict order fixes the order in which the columns are appended.
    derived_columns = {
        'hour_of_day': 'hour',
        'month': 'month',
        'year': 'year',
        'day_of_year': 'dayofyear',
        'day_of_week': 'dayofweek',
    }
    stamp = df['pickup_datetime_dt'].dt
    for column_name, accessor_attr in derived_columns.items():
        df[column_name] = getattr(stamp, accessor_attr)

    return df

def _add_flag_features(df):
    """Add the 0/1 columns ``is_weekend`` and ``passenger_count_class`` to ``df``.

    The frame is modified in place and returned.  It must already contain
    ``day_of_week`` (as produced by ``prepare_time_features``) and
    ``passenger_count``.
    """
    # pandas ``dayofweek`` counts Monday as 0, so 5 and 6 are Saturday and Sunday.
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype('int64')
    # 1 for trips carrying five or more passengers, 0 otherwise.
    df['passenger_count_class'] = (df['passenger_count'] >= 5).astype('int64')
    return df


# Train and test get exactly the same feature pipeline.
train_df = _add_flag_features(prepare_time_features(train_df))
test_df = _add_flag_features(prepare_time_features(test_df))

# Preview of the large-party trips in the test set (last expression of the
# cell, so the notebook renders it).
test_df[test_df.passenger_count_class == 1].head(20)


In [None]:
def add_travel_vector_features(df, degree_len_long=84.1, degree_len_lat=111.1):
    """Add per-axis and straight-line trip distances to ``df`` in place.

    Distances use a flat-earth approximation: coordinate differences in degrees
    are scaled by a fixed number of kilometres per degree.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``pickup_longitude``, ``pickup_latitude``,
        ``dropoff_longitude`` and ``dropoff_latitude``.
    degree_len_long : float, default 84.1
        Kilometres per degree of longitude.  The default is the value the
        notebook uses for roughly 41 degrees of latitude.
    degree_len_lat : float, default 111.1
        Kilometres per degree of latitude.

    Returns
    -------
    None
        The columns ``abs_diff_longitude``, ``abs_diff_latitude`` (degrees),
        ``d``, ``d_long`` and ``d_lat`` (km) are appended to ``df`` in that order.
    """
    df['abs_diff_longitude'] = (df.dropoff_longitude - df.pickup_longitude).abs()
    df['abs_diff_latitude'] = (df.dropoff_latitude - df.pickup_latitude).abs()

    # Per-axis distances are computed once and reused for the hypotenuse.
    long_km = df.abs_diff_longitude * degree_len_long
    lat_km = df.abs_diff_latitude * degree_len_lat

    df['d'] = np.sqrt(long_km ** 2 + lat_km ** 2)
    df['d_long'] = long_km
    df['d_lat'] = lat_km

# The function mutates its argument, so both frames gain the distance columns.
for trips_frame in (train_df, test_df):
    add_travel_vector_features(trips_frame)

    

In [None]:
import datetime
# Cleaning train data
print('Old size: %d' % len(train_df))

# All row filters are independent, so they are combined into a single mask:
#   - pickup and dropoff differ by less than 5 degrees on each axis,
#   - longitudes are negative and latitudes positive,
#   - the straight-line distance `d` is at least 0.01.
# A cap on fare_amount (< 30) was tried at some point and is left disabled.
keep_rows = (
    (train_df['abs_diff_longitude'] < 5.0)
    & (train_df['abs_diff_latitude'] < 5.0)
    & (train_df['pickup_longitude'] < 0)
    & (train_df['pickup_latitude'] > 0)
    & (train_df['dropoff_longitude'] < 0)
    & (train_df['dropoff_latitude'] > 0)
    & (train_df['d'] >= 0.01)
)
# Rows with a missing value in any column are dropped as well.
train_df = train_df[keep_rows].dropna(how='any', axis='rows')

print('New size: %d' % len(train_df))


# Mean fare of the cleaned training rows, kept as a module-level value.
train_fare_amount_mean = train_df['fare_amount'].mean()
    

# We are just searching for similar routes in the training data.

def search_fare(test_df,row_num,train_df,delta=0.0,delta_step=0.001,delta_hour=2,row_count_min=1,fare_amount_max=30):
    """Estimate one trip's fare as the mean fare of similar training trips.

    A training trip counts as similar when it has the same ``year`` and
    ``month`` as the query trip, a different ``key``, and its pickup and
    dropoff coordinates each lie within ``delta`` km of the query's on both
    axes.  ``delta`` is widened by ``delta_step`` per attempt, for at most ten
    attempts, until at least ``row_count_min`` similar trips are found.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Frame holding the query trip; needs ``key``, ``year``, ``month`` and
        the four pickup/dropoff coordinate columns.
    row_num : int
        Positional index (``iloc``) of the query trip in ``test_df``.
    train_df : pandas.DataFrame
        Reference trips; needs the same columns plus ``fare_amount``.
    delta : float
        Starting half-width of the search box in km (widened before first use).
    delta_step : float
        Amount added to ``delta`` on every attempt, in km.
    delta_hour, fare_amount_max
        Accepted for interface compatibility; not used by the search.
    row_count_min : int
        Number of matches at which the search stops widening.

    Returns
    -------
    tuple
        ``(fare_amount_pred, delta, row_count, fare_amount_mean, df_result)``:
        the prediction, the last ``delta`` tried, the number of matches with a
        non-null key, the mean behind the prediction (same value as the
        prediction) and the matching training rows.  When nothing matches, the
        prediction falls back to the mean fare of ``train_df``.
    """
    degree_len_long = 84.1  # km per degree of longitude (source note: "41 degree")
    degree_len_lat = 111.1  # km per degree of latitude
    max_steps = 10

    query = test_df.iloc[row_num]

    # The key/year/month conditions do not depend on delta, so they are applied
    # once; every widening step then scans this much smaller frame.
    candidates = train_df[(train_df['key'] != query['key'])
                          & (train_df['year'] == query['year'])
                          & (train_df['month'] == query['month'])]

    # A row fits the box when all four axis offsets are <= delta, i.e. when the
    # largest of them is.  np.maximum propagates NaN, so a row with a missing
    # coordinate never matches, exactly as with four separate comparisons.
    pickup_long_km = (candidates['pickup_longitude'] - query['pickup_longitude']).abs() * degree_len_long
    pickup_lat_km = (candidates['pickup_latitude'] - query['pickup_latitude']).abs() * degree_len_lat
    dropoff_long_km = (candidates['dropoff_longitude'] - query['dropoff_longitude']).abs() * degree_len_long
    dropoff_lat_km = (candidates['dropoff_latitude'] - query['dropoff_latitude']).abs() * degree_len_lat
    max_offset_km = np.maximum(np.maximum(pickup_long_km, pickup_lat_km),
                               np.maximum(dropoff_long_km, dropoff_lat_km))

    # Defaults cover the case where the loop body never runs (row_count_min <= 0).
    row_count = 0
    df_result = candidates.iloc[0:0]

    i_step = 0
    while row_count < row_count_min and i_step < max_steps:
        delta = delta + delta_step
        i_step += 1
        df_result = candidates[max_offset_km <= delta]
        row_count = df_result['key'].count()

    if row_count > 0:
        fare_amount_mean = df_result['fare_amount'].mean()
    else:
        # No similar trip found: fall back to the mean fare of the reference
        # frame that was passed in, rather than to a module-level global.
        fare_amount_mean = train_df['fare_amount'].mean()
    fare_amount_pred = fare_amount_mean

    return fare_amount_pred, delta, row_count, fare_amount_mean, df_result
    

def get_my_score(y_pred,y_real):
    """Return the root-mean-square error between two sequences of values.

    Values are paired by position, not by index label, and only the first
    ``len(y_pred)`` entries of ``y_real`` are used.

    Parameters
    ----------
    y_pred, y_real : pandas.Series or array-like of numbers

    Returns
    -------
    float
        ``sqrt(mean((y_pred - y_real) ** 2))``, or NaN when ``y_pred`` is empty.

    Raises
    ------
    IndexError
        If ``y_real`` has fewer entries than ``y_pred``.
    """
    first = np.asarray(y_pred, dtype=float)
    count = first.size
    if count == 0:
        # The error of an empty sample is undefined; avoid a division by zero.
        return float('nan')

    second = np.asarray(y_real, dtype=float)
    if second.size < count:
        raise IndexError('y_real has fewer entries than y_pred')

    # Converting to plain arrays guarantees positional pairing; subtracting two
    # Series directly would align them on their index labels instead.
    squared_error = (first - second[:count]) ** 2
    return float(np.sqrt(squared_error.mean()))





def get_prediction(test_df, train_df, row_num = 0, delta_step = 0.006, delta_hour=2, row_count_min=1,fare_amount_max=30):
    """Predict fares for the first ``row_num`` rows of ``test_df`` via ``search_fare``.

    Progress, timing and the parameters used are printed, and the result is
    also written to ``submission.csv`` in the current directory.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Trips to predict.  When it has a ``fare_amount`` column it is treated
        as labelled data: diagnostics are added to the result and the RMSE
        against the true fares is printed.
    train_df : pandas.DataFrame
        Reference trips handed to ``search_fare``.
    row_num : int, default 0
        Number of leading rows to predict; 0 means every row.
    delta_step, delta_hour, row_count_min, fare_amount_max
        Forwarded unchanged to ``search_fare``.

    Returns
    -------
    pandas.DataFrame
        Columns ``key`` and ``fare_amount`` (prediction rounded to 2 decimals),
        indexed like the predicted rows.  For labelled input it also carries
        ``fare_amount_train`` (true fare), ``row_count`` and ``delta``.
    """
    if row_num == 0:
        row_num = test_df.shape[0]
    print(row_num)
    print(delta_step)

    rows = test_df.iloc[0:row_num]
    has_labels = 'fare_amount' in test_df.columns

    # Per-row results are gathered in lists and assigned as whole columns
    # afterwards.  Writing through `sub_df[col].iloc[i] = value` is chained
    # assignment: it warns on older pandas and writes nothing under
    # Copy-on-Write.
    fares = []
    row_counts = []
    deltas = []

    print(datetime.datetime.now())
    d1 = datetime.datetime.now()
    for i in range(rows.shape[0]):
        if i % 100 == 0:
            print(i)
        fare_amount_pred, delta, row_count, _fare_amount_mean, _df_result = search_fare(
            test_df, i, train_df, delta=0.0,
            delta_step=delta_step, delta_hour=delta_hour, row_count_min=row_count_min,
            fare_amount_max=fare_amount_max)
        fares.append(round(fare_amount_pred, 2))
        row_counts.append(row_count)
        deltas.append(delta)

    sub_df = pd.DataFrame({'key': rows['key']})
    sub_df['fare_amount'] = fares
    if has_labels:
        sub_df['fare_amount_train'] = rows['fare_amount']
        sub_df['row_count'] = row_counts
        sub_df['delta'] = deltas

    print (sub_df.shape)
    sub_df.to_csv('submission.csv', index = False)

    d2 = datetime.datetime.now()
    print(d2-d1)
    print('row_num=' + str(row_num))
    print('delta_step=' + str(delta_step))
    print('delta_hour=' + str(delta_hour))
    print('row_count_min=' + str(row_count_min))
    print('fare_amount_max=' + str(fare_amount_max))

    if has_labels:
        print(get_my_score(rows['fare_amount'], sub_df['fare_amount']))

    return sub_df


# This loop is a local evaluation harness, used for testing only.
# It is not part of the submission run: range(100, 100) is empty, so the body
# below never executes. Widen the range (each value is a random_state seed) to
# enable it.

i_count = 0
mean_score = 0
for i in range(100,100):

    print(i)
    i_count+= 1
    #X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X_train_1, y_train, test_size=0.1, random_state=i)
    
    y_train = train_df['fare_amount']
    # Hold out a very small fraction (0.00002) of the training rows as a labelled test sample.
    X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(train_df, y_train, test_size=0.00002, random_state=i)
    # The training part of the split is not used; rebinding the name drops the reference to it.
    X_train_d = pd.DataFrame()
    print(X_test_d.shape)
    # The lookup runs against the full train_df, which still contains the held-out rows.
    sub_df= get_prediction(test_df=X_test_d, train_df=train_df, row_num = 0, delta_step = 0.6,
                           row_count_min=10, fare_amount_max=1000)
    run_score = get_my_score(X_test_d['fare_amount'],sub_df['fare_amount'])
    print('run_score=' + str(run_score))
    mean_score+= run_score
# Summary over all evaluation runs; skipped while the loop above is disabled.
if (i_count>0):   
    print('mean_total_score=' + str(mean_score/i_count))
    sub_df['fare_error'] = abs(sub_df['fare_amount'] - sub_df['fare_amount_train'])
    # NOTE(review): this expression sits inside an `if` block, so the notebook does not render it.
    sub_df.sort_values(by=['fare_error']).tail(20)



# Final run over the real test set. get_prediction itself also writes
# 'submission.csv', so the same predictions end up in two files.
sub_df= get_prediction(test_df=test_df, train_df=train_df, row_num = 0, delta_step = 0.6, row_count_min=1)

sub_df.to_csv('submission01.csv', index = False)