# Imports

In [1]:
import numpy as np
import pandas as pd
import random
import os
import matplotlib.pyplot as plt
import matplotlib
import copy
import seaborn as sns
import geopy.distance

from numpy.linalg import inv
from sklearn.preprocessing import StandardScaler


import warnings
warnings.filterwarnings("ignore")

sns.set()

%config InlineBackend.figure_format = 'retina'

## Data

In [2]:
df = pd.read_csv("../input/uber-fares-dataset/uber.csv")
df.drop(["Unnamed: 0", "key"], axis=1, inplace=True)

y = "fare_amount"

In [3]:
df.sample(3)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
61778,11.9,2009-01-08 19:22:08 UTC,-73.965374,40.76917,-73.99087,40.750933,1
59695,6.5,2013-02-22 07:39:00 UTC,-73.99459,40.750482,-73.985448,40.752617,2
187592,4.5,2010-02-18 22:52:11 UTC,-74.006123,40.734959,-74.001835,40.727833,1


### Data preprocessing

In [4]:
df.dropna(inplace=True)
df.shape

(199999, 7)

In [5]:
df = df[(df.pickup_latitude<90) & (df.dropoff_latitude<90) &
        (df.pickup_latitude>-90) & (df.dropoff_latitude>-90) &
        (df.pickup_longitude<180) & (df.dropoff_longitude<180) &
        (df.pickup_longitude>-180) & (df.dropoff_longitude>-180)]

df.pickup_datetime=pd.to_datetime(df.pickup_datetime)

df['year'] = df.pickup_datetime.dt.year
df['month'] = df.pickup_datetime.dt.month
df['weekday'] = df.pickup_datetime.dt.weekday
df['hour'] = df.pickup_datetime.dt.hour

In [6]:
df.sample(3)

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,weekday,hour
125145,6.5,2015-04-22 18:15:09+00:00,-73.981293,40.737698,-73.985542,40.724461,2,2015,4,2,18
6408,21.3,2010-10-18 23:14:00+00:00,-73.989035,40.742393,-73.892928,40.750298,1,2010,10,0,23
162655,5.7,2010-04-09 13:27:55+00:00,-73.974442,40.783157,-73.958777,40.77503,1,2010,4,4,13


In [7]:
df['distance'] = 
[round(geopy.
       distance.distance(
           (df.pickup_latitude[i], 
            df.pickup_longitude[i]),
           (df.dropoff_latitude[i], df.dropoff_longitude[i])).m,2
      ) for i in df.index]

df.drop(['pickup_datetime', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude'], axis=1, inplace=True)

In [8]:
df.sample(3)

Unnamed: 0,fare_amount,passenger_count,year,month,weekday,hour,distance
101836,11.5,2,2012,11,4,16,2802.62
175911,6.1,1,2010,6,2,22,1844.26
60426,6.1,2,2010,1,6,11,1774.42


In [9]:
df.shape

(199987, 7)

### Save csv for scala project 

In [10]:
N_TRAIN = 100000
N_TEST = 20000

Firstly let's save 100 thousand for train and 20 thousand for test

In [11]:
df[:N_TRAIN].to_csv("uber_train.csv", header=False, index=False)

In [12]:
df[N_TRAIN:N_TRAIN+N_TEST].to_csv("uber_test.csv", header=False, index=False)

Check if everything is saved properly.

In [15]:
pd.read_csv("./uber_train.csv").sample(10)

Unnamed: 0,7.5,1,2015,5,3,19,1681.11
92274,9.3,1,2011,6,3,11,2495.06
13639,14.0,2,2012,10,1,10,4993.46
53417,6.9,1,2011,1,6,21,2036.83
74576,17.5,1,2015,3,4,14,4117.91
49378,12.5,1,2013,12,4,8,3497.76
6601,8.0,2,2014,5,5,17,2028.26
10938,9.3,1,2012,6,6,17,2513.08
36543,11.0,1,2013,4,4,9,1738.89
68451,6.9,1,2010,5,6,3,2161.47
51300,26.0,1,2015,4,5,16,7041.18


In [16]:
pd.read_csv("./uber_test.csv").sample(10)

Unnamed: 0,10.6,1,2009,1.1,1.2,21,0.0
17750,10.5,1,2015,3,3,8,1790.75
15068,16.1,5,2009,5,3,21,7202.19
4176,8.0,1,2015,6,2,21,1505.21
11641,10.5,1,2015,3,0,5,2741.6
16984,10.5,2,2013,6,3,21,1662.0
668,6.9,1,2011,6,5,19,1770.25
13176,9.0,5,2013,10,5,3,3488.71
3682,8.0,1,2015,5,6,8,2633.24
732,4.0,1,2014,2,0,1,573.4
674,9.7,2,2010,5,3,9,2504.37
