In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
# Load the training and test tables; read_csv is handed the .zip archives directly.
train, test = (
    pd.read_csv(archive)
    for archive in (
        "../input/nyc-taxi-trip-duration/train.zip",
        "../input/nyc-taxi-trip-duration/test.zip",
    )
)

In [None]:
# Count the missing values per column in both datasets
train_missing = train.isna().sum()
test_missing = test.isna().sum()
print(f"na sum of train: {train_missing}")
print(f"na sum of test: {test_missing}")

In [None]:
# Break the pickup timestamp down into finer-grained calendar features
train['pickup_datetime'] = pd.to_datetime(train['pickup_datetime'])

# Use the vectorized `.dt` accessor instead of looping over every timestamp in Python.
pickup_times = train['pickup_datetime']
train['pickup_hour'] = pickup_times.dt.hour
# ISO week number (what Timestamp.week reports); cast to a plain integer dtype.
train['pickup_week'] = pickup_times.dt.isocalendar().week.astype('int64')
train['pickup_month'] = pickup_times.dt.month
train['pickup_weekdays'] = pickup_times.dt.weekday  # Monday=0 ... Sunday=6
# Weekday codes 5 and 6 (Saturday, Sunday) are labelled as weekend.
train['pickup_weekend'] = np.where(
    train['pickup_weekdays'].isin([5, 6]), "Weekend", "Not-weekend"
)

# Preview only the first rows rather than rendering the whole frame.
train.head()

In [None]:
# Also record the day of the month and the day of the year.
# Vectorized `.dt` access replaces the per-row Python loop; 'pickup_datetime'
# must already be a datetime column at this point.
train['mday'] = train['pickup_datetime'].dt.day
train['yday'] = train['pickup_datetime'].dt.dayofyear

# Preview only the first rows rather than rendering the whole frame.
train.head()

In [None]:
# Compute the trip distance
from pyproj import Geod

# Distances are measured on the WGS84 ellipsoid - more accurate than a spherical method
wgs84_geod = Geod(ellps='WGS84')


def Distance(lat1, lon1, lat2, lon2):
    """Return the geodesic distance between two points (or sequences of points).

    The arguments are given latitude-first, whereas ``Geod.inv`` is called
    longitude-first, hence the swapped order below. ``inv`` returns a triple
    (forward azimuth, back azimuth, distance); only the distance is returned.
    The notebook treats the result as metres.
    """
    return wgs84_geod.inv(lon1, lat1, lon2, lat2)[2]

    
# Coordinate columns of both trip endpoints (not read by the code below).
pickups = [train['pickup_longitude'], train['pickup_latitude']]
dropoffs = [train['dropoff_longitude'], train['dropoff_latitude']]

# Pass NumPy arrays instead of Python lists: this avoids materialising every
# coordinate as a separate Python float just to hand the data to pyproj.
train['distance_m'] = Distance(
    train['pickup_latitude'].to_numpy(),
    train['pickup_longitude'].to_numpy(),
    train['dropoff_latitude'].to_numpy(),
    train['dropoff_longitude'].to_numpy(),
)
train['distance_km'] = train['distance_m'] / 1000

train['distance_km']

In [None]:
# Histogram of the number of trips by distance
from plotnine import *

distance_plot = (
    ggplot(train, aes(x='distance_km'))
    + geom_histogram(bins=4000, fill="red")
    + labs(x='Distance in km', y='Amount')
    + coord_cartesian(xlim=[0, 60])  # zoom the x axis to 0-60
)

distance_plot

In [None]:
# Average trip speed: distance in km divided by the duration rescaled by 3600
train['speed'] = train['distance_km'].div(train['trip_duration'].div(3600))
train['speed']

In [None]:
# Histogram of the number of trips by speed
speed_plot = (
    ggplot(train) +
    geom_histogram(aes(x = 'speed'), bins=4000, fill="red") +
    labs(
        y = 'Amount',
        # 'speed' is distance_km per hour, so the unit is km/h rather than km
        x = 'Speed in km/h'
    ) +
    coord_cartesian([0, 60])
)

speed_plot

In [None]:
# Summary statistics for the "speed" column
pd.set_option('float_format', '{:f}'.format)  # render floats in fixed-point notation
desc = train['speed'].describe()
desc

In [None]:
# See how the speed depends on the time of day
tmp = train.groupby('pickup_hour').agg({'speed': ['mean', 'sum']})

# Mean speed per pickup hour; axis labels added so the figure can be read on its own.
fig = (
    ggplot(aes(x=tmp.index,y=tmp['speed']['mean']))+
    geom_point(color="red")+coord_cartesian(ylim=[10,25])+theme_bw()+
    labs(x = "Time of day", y = "Mean speed [km/h]")
)

fig

In [None]:
# Check how the features relate to one another
corr_columns = [
    'pickup_hour',
    'pickup_week',
    'pickup_month',
    'yday',
    'mday',
    'passenger_count',
    'trip_duration',
    'distance_km',
]
corr_features = train.loc[:, corr_columns]
corr_features

In [None]:
# Observation: trip duration barely depends on any of the other variables
import seaborn as sns

# Pairwise correlation matrix of the selected features, drawn as an annotated heatmap
corr = corr_features.corr()
sns.heatmap(data=corr, annot=True, fmt='.2f')

In [None]:
# See how trip length and trip duration change with the time of day

# Mean trip duration per pickup hour
tmp = train.groupby('pickup_hour').agg({'trip_duration': ['mean', 'count']})

figDuration = (
    ggplot(aes(x=tmp.index,y=tmp['trip_duration']['mean']))+
    geom_point(color="red")+coord_cartesian(ylim=[500,1500])+theme_bw()+
    labs(x = "Time of day", y = "Trip duration [s]")
)

# Mean trip distance per pickup hour (tmp is deliberately reassigned; the
# figure above already holds the series it was built from)
tmp = train.groupby('pickup_hour').agg({'distance_km': ['mean', 'count']})

figDistance = (
    ggplot(aes(x=tmp.index,y=tmp['distance_km']['mean']))+
    geom_point(color="red")+coord_cartesian(ylim=[0,10])+theme_bw()+
    labs(x = "Time of day", y = "Direct distance [km]")
)

In [None]:
# Display the mean trip duration per pickup hour
figDuration

In [None]:
# Display the mean trip distance per pickup hour
figDistance

In [None]:
# Check which records are plausible in terms of trip duration versus distance.
# Work on an independent copy of the two columns needed, so that `train` is not
# modified through an alias.
dim = train[['distance_km', 'trip_duration']].copy()
# Kilometres to metres is a multiplication by 1000 (dividing gave the wrong unit).
dim['distance_m'] = dim['distance_km'] * 1000

In [None]:
# Scatter of trip duration against direct distance, both axes on a log10 scale
fig = (
    ggplot(aes(x=dim['distance_m'], y=dim['trip_duration']))
    + geom_point()
    + scale_x_log10()
    + scale_y_log10()
    + labs(x="Direct distance [m]", y="Trip duration [s]")
)

fig

In [None]:
# Inspect the trips whose computed distance is exactly 0 (this cell only displays them)
zero_km = train.loc[
    train['distance_km'] == 0,
    ['dropoff_latitude', 'dropoff_longitude', 'trip_duration'],
]
zero_km

In [None]:
# Remove the trips whose computed distance is 0
train = train[train['distance_km'].ne(0)]
train

In [None]:
# Rebuild the hourly aggregates and figures from the current `train`

# Mean trip duration per pickup hour
tmp = train.groupby('pickup_hour').agg({'trip_duration': ['mean', 'count']})

figDuration = (
    ggplot(aes(x=tmp.index, y=tmp['trip_duration']['mean']))
    + geom_point(color="red")
    + coord_cartesian(ylim=[500, 1250])
    + theme_bw()
    + labs(x="Time of day", y="Trip duration [s]")
)

# Mean trip distance per pickup hour
tmp = train.groupby('pickup_hour').agg({'distance_km': ['mean', 'count']})

figDistance = (
    ggplot(aes(x=tmp.index, y=tmp['distance_km']['mean']))
    + geom_point(color="red")
    + coord_cartesian(ylim=[0, 7])
    + theme_bw()
    + labs(x="Time of day", y="Direct distance [km]")
)

In [None]:
# Display the refreshed mean trip duration per pickup hour
figDuration

In [None]:
# Display the refreshed mean trip distance per pickup hour
figDistance

In [None]:
# Prepare the test set: derive the pickup hour and the trip distance
test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime'])
# Vectorized `.dt` access instead of a per-row Python loop.
test['pickup_hour'] = test['pickup_datetime'].dt.hour

# Coordinate columns of both trip endpoints (not read by the code below).
pickups = [test['pickup_longitude'], test['pickup_latitude']]
dropoffs = [test['dropoff_longitude'], test['dropoff_latitude']]

# Pass NumPy arrays instead of Python lists: this avoids materialising every
# coordinate as a separate Python float just to hand the data to pyproj.
test['distance_m'] = Distance(
    test['pickup_latitude'].to_numpy(),
    test['pickup_longitude'].to_numpy(),
    test['dropoff_latitude'].to_numpy(),
    test['dropoff_longitude'].to_numpy(),
)
test['distance_km'] = test['distance_m'] / 1000

# Preview only the first rows rather than rendering the whole frame.
test.head()

In [None]:
from sklearn.linear_model import LinearRegression

# Multiple linear regression: pickup hour and raw pickup/drop-off coordinates -> trip duration
feature_columns = [
    'pickup_hour',
    'pickup_latitude',
    'pickup_longitude',
    'dropoff_latitude',
    'dropoff_longitude',
]
x = train[feature_columns]
y = train[['trip_duration']]  # kept two-dimensional (single-column DataFrame)
lr = LinearRegression()
# Train the model
lr.fit(x, y)

In [None]:
# Run the model on the test set.
# Keep the features as a DataFrame with the same column names that were used in
# lr.fit(): predicting on a bare NumPy array after fitting on a DataFrame makes
# recent scikit-learn versions warn that X has no valid feature names.
x = test[['pickup_hour', 'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']]
print(x.head())
# The model was fit on a single-column 2-D target, so the predictions are
# flattened to one dimension before being wrapped in a Series.
y = pd.Series(lr.predict(x).ravel())

# As a sanity check, print the mean predicted value
print(y.mean())

In [None]:
# Display the fitted regression coefficients
lr.coef_