# Libraries

In [177]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import geopy as geo
from geopy.distance import vincenty as geods

%matplotlib inline 

# Check Data Files

In [1]:
# Folder that holds the competition CSV files (trailing slash kept as-is).
data_root = "../input/"
# Strip the trailing slash before joining so the result is "../input/train.csv"
# instead of the non-normalised "../input//train.csv".
train_data_path = data_root.rstrip("/") + "/train.csv"
test_data_path = data_root.rstrip("/") + "/test.csv"

In [124]:
# Read both CSV files into DataFrames, train first and test second.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)

In [125]:
# Show the first five rows of the training frame.
train_df.head(n=5)

In [126]:
# `shape` is (rows, columns). Unlike `count()[0]`, it does not skip null values
# and does not depend on positional indexing into a label-indexed Series.
print("number of rows: ", train_df.shape[0])
print("number of cols: ", train_df.shape[1])

#  Check nulls

In [127]:
# Number of missing values per column. The explicit column-wise reduction avoids
# `np.sum(DataFrame)`, which goes through `DataFrame.sum(axis=None)`; that form is
# deprecated in recent pandas and is announced to return a single scalar instead.
train_df.isnull().sum(axis=0)

# Feature Engineering

In [128]:
# Convert both timestamp columns to pandas datetimes, in place.
for ts_col in ('pickup_datetime', 'dropoff_datetime'):
    train_df[ts_col] = pd.to_datetime(train_df[ts_col])

In [129]:
# Calendar features derived from the pickup timestamp.
train_df['pickup_hour'] = train_df['pickup_datetime'].dt.hour
train_df['pickup_day'] = train_df['pickup_datetime'].dt.dayofweek
# `.dt.weekday_name` was removed in pandas 1.0; `.dt.day_name()` is its replacement.
train_df['pickup_day_name'] = train_df['pickup_datetime'].dt.day_name()

# Calendar features derived from the dropoff timestamp.
train_df['dropoff_day'] = train_df['dropoff_datetime'].dt.dayofweek
# `.dt.week` was removed in pandas 2.0; `.dt.isocalendar().week` gives the ISO week
# number. It is cast back to a plain int64 column, as `.dt.week` used to return.
# NOTE(review): the cast assumes dropoff_datetime has no missing values - confirm.
train_df['trip_week'] = train_df['dropoff_datetime'].dt.isocalendar().week.astype('int64')
train_df['trip_month'] = train_df['dropoff_datetime'].dt.month
train_df['trip_year'] = train_df['dropoff_datetime'].dt.year

In [130]:
# Wrap each (latitude, longitude) pair in a geopy Point, one per row.
train_df['pickup_start_point'] = train_df[['pickup_latitude', 'pickup_longitude']].apply(geo.Point, axis=1)

train_df['pickup_dropoff_point'] = train_df[['dropoff_latitude', 'dropoff_longitude']].apply(geo.Point, axis=1)

# Distance in metres between the two points, using only the first two components
# of each Point. The row is indexed by column label: integer keys (x[0], x[1]) on
# a string-labelled Series rely on a positional fallback that pandas has deprecated.
# NOTE(review): `geods` is geopy's `vincenty`, which geopy 2.0 removed in favour of
# `geodesic`; the import at the top of the notebook needs updating for new geopy.
train_df['raw_distance'] = train_df[['pickup_start_point', 'pickup_dropoff_point']].apply(
    lambda row: geods(row['pickup_start_point'][:2], row['pickup_dropoff_point'][:2]).meters,
    axis=1,
)

TODO:
   - get the trip distance from Google Maps

# Basic Checks and Stat

### data intervals 

In [131]:
# Print the observed (min, max) range of each calendar feature, one per line.
for interval_col in ('trip_year', 'trip_month', 'pickup_hour'):
    print(train_df[interval_col].min(), train_df[interval_col].max())

In [132]:
# Summary statistics of the computed distance column.
train_df.loc[:, 'raw_distance'].describe()

In [133]:
# Summary statistics of the trip duration column.
train_df.loc[:, 'trip_duration'].describe()

# Let's start our EDA trip

## Check trip duration distribution 

In [134]:
# Density curve of trip duration without a histogram. `sns.distplot` is deprecated
# since seaborn 0.11; with hist=False it only drew the KDE, which `kdeplot` does directly.
sns.kdeplot(train_df['trip_duration'])

Hmm, we have some extreme trip durations.

Check whether these durations are plausible by plotting them against distance.

In [135]:
# Scatter of duration (x) against distance (y); no regression line is fitted.
sns.regplot(
    data=train_df,
    x="trip_duration",
    y="raw_distance",
    fit_reg=False,
)

There are also some illogical distances.

### Remove outliers

Compute the ratio between duration and distance, and remove the extreme cases.

The outliers here may be legitimate, but we don't have enough information to tell.

In [136]:
# Duration divided by distance (duration per metre travelled), despite the
# "distance_duration" order in the column name.
train_df['distance_duration_ratio'] = train_df['trip_duration'].div(train_df['raw_distance'])

In [137]:
# 2nd and 98th percentiles of the ratio, used as the outlier cut-offs.
lower_bound, upper_bound = train_df['distance_duration_ratio'].quantile([0.02, 0.98])

In [138]:
# Keep only rows whose ratio lies inside [lower_bound, upper_bound], both ends inclusive.
ratio_in_range = train_df['distance_duration_ratio'].between(lower_bound, upper_bound)
train_df = train_df[ratio_in_range]

In [139]:
# Same duration-vs-distance scatter, now drawn from the filtered frame.
sns.regplot(
    data=train_df,
    x="trip_duration",
    y="raw_distance",
    fit_reg=False,
)

We are in reasonably good shape now.

In [145]:
# Histogram plus density curve of trip duration.
# NOTE(review): seaborn >= 0.11 marks distplot as deprecated; histplot/displot are the successors.
sns.distplot(a=train_df['trip_duration'])

A log transform should fix this, I think.

In [142]:
# Distribution of the natural log of trip duration.
sns.distplot(a=np.log(train_df['trip_duration']))

A nice, roughly normal distribution.

## Check distance distribution

In [147]:
# Histogram plus density curve of the computed distance.
sns.distplot(a=train_df['raw_distance'])

In [148]:
# Distribution of the natural log of the computed distance.
sns.distplot(a=np.log(train_df['raw_distance']))

## Check Other Variables

In [159]:
# Number of trips per month.
sns.countplot(data=train_df, x='trip_month')

In [222]:
# Number of trips per passenger count.
sns.countplot(data=train_df, x='passenger_count')

In [161]:
# Number of trips per week of the year.
sns.countplot(data=train_df, x='trip_week')

In [164]:
# Number of trips per pickup weekday.
sns.countplot(data=train_df, x='pickup_day_name')

In [172]:
# Number of trips per pickup hour.
sns.countplot(data=train_df, x='pickup_hour')

In [165]:
# Number of trips per vendor.
sns.countplot(data=train_df, x='vendor_id')

In [171]:
# Bar chart of the flag values, followed by the exact number of 'Y' rows.
sns.countplot(data=train_df, x='store_and_fwd_flag')
is_flagged = train_df['store_and_fwd_flag'] == 'Y'
print('Y count:', is_flagged.sum())

In [None]:
# Pickups per weekday, split by pickup hour.
# The axes are created here: before this point no cell defines `ax`, so on a
# fresh kernel (Restart & Run All) the bare `ax=ax` raised NameError.
_, ax = plt.subplots(1, 1, figsize=(10, 10))
sns.countplot(x='pickup_day_name', hue='pickup_hour', data=train_df, ax=ax)

## bivariate analysis

In [188]:
# Pickups per weekday, one bar per pickup hour, on a 10x10 inch figure.
_, ax = plt.subplots(figsize=(10, 10))
sns.countplot(data=train_df, x='pickup_day_name', hue='pickup_hour', ax=ax)
# Anchor the legend just outside the right edge of the axes.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

Demand increases in the late hours on weekends.

In [193]:
# Trips per month, one bar per pickup weekday, on a 10x10 inch figure.
_, ax = plt.subplots(figsize=(10, 10))
sns.countplot(data=train_df, x='trip_month', hue='pickup_day_name', ax=ax)
# Anchor the legend just outside the right edge of the axes.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

In [209]:
# Trip duration per vendor; the y-axis is clipped to 0-3000 to keep the boxes readable.
vendor_duration_ax = sns.boxplot(data=train_df, x='vendor_id', y='trip_duration')
vendor_duration_ax.set_ylim(0, 3000)

In [220]:
# Computed distance per vendor; the y-axis is clipped to 0-10000.
vendor_distance_ax = sns.boxplot(data=train_df, x='vendor_id', y='raw_distance')
vendor_distance_ax.set_ylim(0, 10000)

Both vendors show similar trip durations and raw distances.

It will be easier and more visually appealing to work with categorized trips.

In [195]:
# Split duration and distance into three equal-frequency bins with shared labels.
tercile_labels = ['short', 'medium', 'long']
for source_col, category_col in (
    ('trip_duration', 'trip_duration_categorized'),
    ('raw_distance', 'trip_distance_categorized'),
):
    train_df[category_col] = pd.qcut(train_df[source_col], 3, labels=tercile_labels)

In [210]:
# Duration spread inside each duration category; the y-axis is clipped to 0-3000.
category_ax = sns.boxplot(data=train_df, x='trip_duration_categorized', y='trip_duration')
category_ax.set_ylim(0, 3000)

In [208]:
# Duration per duration category, split by distance category; y-axis clipped to 0-2000.
split_ax = sns.boxplot(
    data=train_df,
    x='trip_duration_categorized',
    y='trip_duration',
    hue='trip_distance_categorized',
)
split_ax.set_ylim(0, 2000)
# Anchor the legend just outside the right edge of the axes.
split_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0., title='distance')

In [211]:
# Trips per month, one bar per duration category, on a 10x10 inch figure.
_, ax = plt.subplots(figsize=(10, 10))
sns.countplot(data=train_df, x='trip_month', hue='trip_duration_categorized', ax=ax)
# Anchor the legend just outside the right edge of the axes.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

In [214]:
# Pickups per weekday, one bar per duration category, on a 10x10 inch figure.
_, ax = plt.subplots(figsize=(10, 10))
sns.countplot(data=train_df, x='pickup_day_name', hue='trip_duration_categorized', ax=ax)
# Anchor the legend just outside the right edge of the axes.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

Nothing notable here.

In [215]:
# Pickups per weekday, one bar per distance category, on a 10x10 inch figure.
_, ax = plt.subplots(figsize=(10, 10))
sns.countplot(data=train_df, x='pickup_day_name', hue='trip_distance_categorized', ax=ax)
# Anchor the legend just outside the right edge of the axes.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

Nothing notable here either.

In [221]:
# Trips per passenger count, one bar per distance category, on a 10x10 inch figure.
_, ax = plt.subplots(figsize=(10, 10))
sns.countplot(data=train_df, x='passenger_count', hue='trip_distance_categorized', ax=ax)
# Anchor the legend just outside the right edge of the axes.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

Almost the same here as well.

TODO:

- prettify figures
- add model
- more description
- more feature engineering on locations and more of the EDA trip

Waiting for your feedback, and I hope you like it.

