In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id="top"></a>
<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" role="tab" aria-controls="home">Table of Content</h3>
    
   * [1. Data Exploration](#1)
   * [2. Destination (Starting and Stopping)](#2)
   * [3. Exploring date and time object](#3)
   * [4. Category and Purpose](#4)
   * [5. Conclusion](#5)


<a id="1"></a>
<font color="darkslateblue" size=+2.5><b>1. Data Exploration </b></font>
<a href="#top" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover">Go to TOC</a>


In [None]:
# Load the raw trip log; every later cell reads from (or reassigns) `uber_df`
uber_df = pd.read_csv("/kaggle/input/uberdrives/My Uber Drives - 2016.csv")

In [None]:
# Preview the first 5 records (head() defaults to 5 rows)
uber_df.head()

In [None]:
# Preview the last 5 records (tail() defaults to 5 rows)
uber_df.tail()

In [None]:
# Dimensions as (rows, columns), then the total number of cells (rows * columns)
frame_shape, cell_count = uber_df.shape, uber_df.size
print(frame_shape)
print(cell_count)

* The dataset has 1156 rows and 7 columns. Its total size (rows × columns) is 8092 values.

In [None]:
# Column names of the dataset
uber_df.columns

In [None]:
# Data type pandas inferred for each column

uber_df.dtypes

In [None]:
# Per-column non-null counts and dtypes, plus memory usage
uber_df.info()

**The dataset has 1 numerical variable and 6 categorical variables** <br>
**The "PURPOSE" column has many missing values**

In [None]:
# For each column: True if it contains at least one missing value
uber_df.isnull().any()


In [None]:
# Number of missing values in each column
uber_df.isnull().sum()

**From the above it is clear that there is one record with a null `END_DATE*`, which needs to be removed**

In [None]:
# Show the record(s) that have no END_DATE* value
uber_df[uber_df['END_DATE*'].isnull()]

In [None]:
# Remove the record(s) without an END_DATE* by condition rather than by a
# hard-coded row position: this still works if the file changes and can be
# re-run safely (a positional drop raises once the row is already gone).
uber_df = uber_df.dropna(subset=['END_DATE*'])


In [None]:
# Show fully duplicated records (rows identical to an earlier row in every column)

uber_df[uber_df.duplicated()]

In [None]:
# Keep the first occurrence of each record and discard exact repeats

uber_df = uber_df.drop_duplicates()


In [None]:
# Keep only fully populated rows: dropna() with no arguments removes every row
# that has a missing value in any column
uber_df = uber_df.dropna()

# Shape of the dataframe after removing rows with missing values
uber_df.shape

**The dataset now contains 653 rows of non-null values**

In [None]:
# Summary statistics of the numeric columns, transposed so each column is one row
uber_df.describe().T

- The miles column is right-skewed (its mean is greater than its median)
- Ride distances range from 0.5 miles to 310.3 miles

In [None]:
# Distribution of trip distance.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement.
plt.figure(figsize=(10,6))
sns.histplot(data=uber_df, x="MILES*", kde=True, stat="density")
plt.show()

<a id="2"></a>
<font color="darkslateblue" size=+2.5><b>2. Destination(Starting and Stopping)</b></font>

<a href="#top" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover">Go to TOC</a>



In [None]:
# Distinct start locations: list them, then count them
start_points = uber_df['START*'].unique()
print(start_points)
print(len(start_points))

**There are 131 unique starting points in the dataset**

In [None]:
# Distinct stop locations: list them, then count them
stop_points = uber_df['STOP*'].unique()
print(stop_points)
print(len(stop_points))

**There are 137 unique destinations in the dataset**

In [None]:
# Ten most frequent start locations with their trip counts
uber_df['START*'].value_counts().head(10)

In [None]:
# Horizontal bar chart of the ten most frequent start locations.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
plt.figure(figsize=(15,10))
top_starts = uber_df['START*'].value_counts().iloc[:10].index
sns.countplot(data=uber_df, y="START*", order=top_starts)
plt.show()

**We can say that Cary is the most popular starting point for this driver.**

In [None]:
# Ten most frequent stop locations with their trip counts
uber_df['STOP*'].value_counts().head(10)

In [None]:
# Horizontal bar chart of the ten most frequent stop locations.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
plt.figure(figsize=(15,10))
top_stops = uber_df['STOP*'].value_counts().iloc[:10].index
sns.countplot(data=uber_df, y="STOP*", order=top_stops)
plt.show()

**Cary also features in the most popular stop destinations**

In [None]:
# Top 10 start/stop pairs ranked by the total miles driven between them.
# NOTE(review): .sum() adds up the miles of every trip on a pair, so this ranks
# cumulative mileage (frequency x distance), not the single longest distance;
# confirm whether .max() or .mean() was intended for "farthest".
# Exclude rows whose start or stop is the 'Unknown Location' value
uber_df2 = uber_df[uber_df['START*']!= 'Unknown Location']
uber_df2 = uber_df2[uber_df2['STOP*']!= 'Unknown Location']

uber_df2.groupby(['START*','STOP*'])['MILES*'].sum().sort_values(ascending=False).head(10)

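**The Cary–Durham pair has the highest total mileage. Because this is a sum over all trips, it reflects how often the route is driven as well as how long it is.**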

In [None]:
# Top 10 start/stop pairs by number of trips (rows per pair), most frequent first
uber_df2.groupby(['START*','STOP*']).size().sort_values(ascending=False).head(10)

**The most popular start-to-stop pair is Cary–Morrisville**

In [None]:
# Flag each trip as 'YES' when it starts and stops at the same location, else 'NO'.
# A vectorised np.where replaces the row-wise apply, which also removes the need
# for a helper function named `round` that shadowed Python's built-in round().
uber_df['ROUND_TRIP*'] = np.where(uber_df['START*'] == uber_df['STOP*'], 'YES', 'NO')

plt.figure(figsize=(8,5))
# Pass the column through x=: newer seaborn releases accept only `data` positionally
sns.countplot(x=uber_df['ROUND_TRIP*'], order=uber_df['ROUND_TRIP*'].value_counts().index, palette='viridis')
plt.show()

**Round trips (same start and stop location) are fewer than one-way trips**

<a id="3"></a>
<font color="darkslateblue" size=+2.5><b>3.Exploring date and time object</b></font>

<a href="#top" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover">Go to TOC</a>


In [None]:
# Convert START_DATE* and END_DATE* from strings to datetime64 columns.
# pd.datetime has been removed from pandas; pd.to_datetime parses the whole
# column at once and is safe to re-run on an already converted column.
# Plain column assignment (not .loc[:, col] = ...) lets the dtype actually change.
DATE_FORMAT = '%m/%d/%Y %H:%M'

uber_df['START_DATE*'] = pd.to_datetime(uber_df['START_DATE*'], format=DATE_FORMAT)
uber_df['END_DATE*'] = pd.to_datetime(uber_df['END_DATE*'], format=DATE_FORMAT)

In [None]:
# Ride duration as the difference between end and start timestamps
uber_df['DIFF'] = uber_df['END_DATE*'] - uber_df['START_DATE*']

In [None]:
# Ride duration in minutes.
# The earlier formula divided whole days by 24*60 instead of multiplying, so any
# ride spanning a day boundary was under-counted; total_seconds() covers days and
# seconds together. Recomputing from the two date columns keeps the cell
# re-runnable (DIFF is overwritten, never read).
ride_span = pd.to_datetime(uber_df['END_DATE*']) - pd.to_datetime(uber_df['START_DATE*'])
uber_df['DIFF'] = ride_span.dt.total_seconds() / 60

In [None]:
# Peek at the first few DIFF values
uber_df['DIFF'].head()

In [None]:
# Summary statistics of the DIFF column
uber_df['DIFF'].describe()

**Ride durations range from 2 minutes to 330 minutes with an average duration of 23 minutes**

In [None]:
# Split the ride start timestamp into month, year, day and hour columns.
# The timestamp is parsed once and each part is read from its .dt accessor.
start_ts = pd.to_datetime(uber_df['START_DATE*'])
for column, part in (('month', 'month'), ('Year', 'year'), ('Day', 'day'), ('Hour', 'hour')):
    uber_df[column] = getattr(start_ts.dt, part)

In [None]:
# Weekday of the ride start: pandas numbers Monday as 0 through Sunday as 6,
# and each number is translated to its short name through the `days` lookup
days = {0:'Mon',1:'Tue',2:'Wed',3:'Thur',4:'Fri',5:'Sat',6:'Sun'}

weekday_number = pd.to_datetime(uber_df['START_DATE*']).dt.dayofweek
uber_df['day_of_week'] = weekday_number.map(days)

In [None]:
# Store the calendar month abbreviation (Jan, Feb, ...) in the 'month' column.
# The month number is taken from START_DATE* rather than from the existing
# 'month' column, so re-running the cell does not try to look up 'Jan' as an index.
import calendar
month_number = pd.to_datetime(uber_df['START_DATE*']).dt.month
uber_df['month'] = month_number.map(lambda m: calendar.month_abbr[m])
uber_df.head()

In [None]:
# Number of trips per month, then per weekday (most frequent first)
print(uber_df['month'].value_counts())
print(uber_df['day_of_week'].value_counts())

In [None]:
# Trips per month, bars ordered from busiest to quietest.
# Columns are passed by keyword (newer seaborn accepts only `data` positionally)
# and Series.value_counts() replaces the deprecated pd.value_counts().
plt.figure(figsize=(12,7))
sns.countplot(data=uber_df, x='month', order=uber_df['month'].value_counts().index)
plt.show()

In [None]:
# Trips per weekday, bars ordered from busiest to quietest.
# Columns are passed by keyword (newer seaborn accepts only `data` positionally)
# and Series.value_counts() replaces the deprecated pd.value_counts().
plt.figure(figsize=(12,7))
sns.countplot(data=uber_df, x='day_of_week', order=uber_df['day_of_week'].value_counts().index)
plt.show()

In [None]:
# Average trip distance per month, longest first.
# Selecting MILES* before .mean() avoids averaging every column; on recent
# pandas a frame-wide groupby mean raises for the text and date columns.
uber_df.groupby('month')['MILES*'].mean().sort_values(ascending=False)

In [None]:
# Bar chart of the average trip distance per month, with the overall mean
# distance drawn as a dashed reference line.
# MILES* is selected before .mean() so the text and date columns are never averaged.
plt.figure(figsize=(12,5))
monthly_avg_miles = uber_df.groupby('month')['MILES*'].mean().sort_values(ascending=False)
monthly_avg_miles.plot.bar(color=['maroon','darksalmon','green','blue','yellow','teal'])
plt.axhline(uber_df['MILES*'].mean(), linestyle='--', color='green', label='Mean distance')
plt.legend()
plt.show()

**Longest average distance is covered in Oct and least in Dec**

In [None]:
# Number of trips per start hour (most frequent first)
uber_df['Hour'].value_counts()

In [None]:
# Trips per start hour, bars ordered from busiest to quietest.
# Columns are passed by keyword (newer seaborn accepts only `data` positionally)
# and Series.value_counts() replaces the deprecated pd.value_counts().
plt.figure(figsize=(12,7))
sns.countplot(data=uber_df, x='Hour', order=uber_df['Hour'].value_counts().index)
plt.show()

**Afternoons and evenings seem to have the maximum number of trips**

In [None]:
# Trips per start hour, with one bar colour per weekday

plt.figure(figsize=(20,10))
sns.countplot(data=uber_df, x='Hour', hue='day_of_week', palette="Set1")
plt.show()

In [None]:
# Label each ride 'Night Ride' when it starts after 18:00, otherwise 'Day Ride'.
# The cut-off is a scalar Timedelta compared against the time elapsed since
# midnight, computed for the whole column at once. This replaces a row-wise
# apply against a one-element array stored in a short name (`a`) that a later
# cell reassigns.
NIGHT_START = pd.Timedelta(hours=18)
start_ts = pd.to_datetime(uber_df['START_DATE*'])
time_of_day = start_ts - start_ts.dt.normalize()  # time elapsed since midnight
uber_df['DAY/NIGHT*'] = np.where(time_of_day > NIGHT_START, 'Night Ride', 'Day Ride')
# Pass the column through x=: newer seaborn releases accept only `data` positionally
sns.countplot(x=uber_df['DAY/NIGHT*'], palette='Set2' , order = uber_df['DAY/NIGHT*'].value_counts().index)
plt.show()

**Maximum trips were made during Day Time**


In [None]:
# Trips per weekday split into day and night rides.
# Columns are passed by keyword because newer seaborn accepts only `data` positionally.
plt.figure(figsize=(9,7))
sns.countplot(data=uber_df, x='day_of_week', hue='DAY/NIGHT*', palette='Set1' ,
              order=uber_df['day_of_week'].value_counts().index)
plt.show()

**Highest number of Day rides were on Friday while Night rides were on Sunday/Thursday**


In [None]:
# Assign every trip to a distance bucket and plot the number of trips per bucket.
# pd.cut replaces a duplicated nested loop that walked the column once for every
# row just to fill a lookup dict. right=False makes each bin [low, high), so
# 10.0 falls in '10-20 miles' and anything below 10 in '0-10 miles'.
bucket_edges = [-np.inf, 10, 20, 30, 40, 50, np.inf]
bucket_labels = ['0-10 miles', '10-20 miles', '20-30 miles',
                 '30-40 miles', '40-50 miles', 'Above 50 miles']

# astype(object) keeps plain string labels, so value_counts() lists only the
# buckets that actually occur
uber_df['MILES_BUCKET*'] = pd.cut(uber_df['MILES*'], bins=bucket_edges,
                                  labels=bucket_labels, right=False).astype(object)

plt.figure(figsize=(10,6))
sns.countplot(x=uber_df['MILES_BUCKET*'], palette='Set1' ,
              order = uber_df['MILES_BUCKET*'].value_counts().index)
plt.show()

**One can see that there are more number of short distanced trips**


In [None]:
# Trips per distance bucket split into day and night rides.
# Columns are passed by keyword because newer seaborn accepts only `data` positionally.
plt.figure(figsize=(10,7))
sns.countplot(data=uber_df, x='MILES_BUCKET*', hue='DAY/NIGHT*', palette='cubehelix',
              order = uber_df['MILES_BUCKET*'].value_counts().index)
plt.show()

**Both highest number of Day and Night Rides were below 10miles of a distance**

In [None]:
# Average speed of each trip in km/h.
# The column is named Speed_KM, so miles are converted to kilometres before
# dividing by the duration; previously it held miles per hour.
KM_PER_MILE = 1.609344

uber_df['Duration_hours'] = uber_df['DIFF'] / 60
# Rides with a zero (or negative) recorded duration have no meaningful speed;
# mask them to NaN instead of producing inf
moving_hours = uber_df['Duration_hours'].where(uber_df['Duration_hours'] > 0)
uber_df['Speed_KM'] = uber_df['MILES*'] * KM_PER_MILE / moving_hours
uber_df['Speed_KM'].describe()

In [None]:
# Distribution of trip speed.
# histplot replaces the deprecated sns.distplot; infinite and missing speeds
# are removed first because they cannot be binned.
plt.figure(figsize=(12,7))
finite_speed = uber_df['Speed_KM'].replace([np.inf, -np.inf], np.nan).dropna()
sns.histplot(x=finite_speed, kde=True, stat="density")
plt.show()

**The speed distribution is slightly right-skewed**

<a id="4"></a>
<font color="darkslateblue" size=+2.5><b>4.Category and Purpose </b></font>

<a href="#top" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover">Go to TOC</a>


In [None]:
# Number of trips per CATEGORY* value
uber_df['CATEGORY*'].value_counts()

**Most trips are in the business category**

In [None]:
# Trips per category.
# The column is passed by keyword because newer seaborn accepts only `data` positionally.
plt.figure(figsize=(8,6))
sns.countplot(data=uber_df, x='CATEGORY*')
plt.show()

In [None]:
# Number of trips per PURPOSE* value (most frequent first)
uber_df['PURPOSE*'].value_counts()

**Most trips are for meetings**

In [None]:
# Trips per purpose, bars ordered from most to least frequent.
# The column is passed through x= because newer seaborn accepts only `data` positionally.
plt.figure(figsize=(15,6))
sns.countplot(x=uber_df['PURPOSE*'], order=uber_df['PURPOSE*'].value_counts().index, palette='Set2')
plt.show()

In [None]:
# Average distance travelled per purpose, longest first.
# MILES* is selected before .mean() so the text and date columns are never
# averaged (a frame-wide groupby mean raises for them on recent pandas).
uber_df.groupby('PURPOSE*')['MILES*'].mean().sort_values(ascending=False)

In [None]:
# Total miles driven per purpose, largest first.
# MILES* is selected before .sum() so the date and text columns are never
# summed (datetime columns do not support sum).
uber_df.groupby('PURPOSE*')['MILES*'].sum().sort_values(ascending=False)

In [None]:
# Total miles driven per category, largest first.
# MILES* is selected before .sum() so the date and text columns are never
# summed (datetime columns do not support sum).
uber_df.groupby('CATEGORY*')['MILES*'].sum().sort_values(ascending=False)

In [None]:
# Trips per weekday split by category, with the legend placed outside the axes.
# Columns are passed by keyword (newer seaborn accepts only `data` positionally);
# the returned Axes is not stored because nothing uses it.
plt.figure(figsize=(9,5))
sns.countplot(data=uber_df, x='day_of_week', hue='CATEGORY*', palette=["#FF3333" ,"#00CC00"],
              order=uber_df['day_of_week'].value_counts().index)
plt.legend(bbox_to_anchor=(1.05, 0.95), loc=2)
plt.show()

In [None]:
# Trips per month split by category, months ordered from busiest to quietest.
# Columns are passed by keyword because newer seaborn accepts only `data` positionally.
plt.figure(figsize=(9,5))
sns.countplot(data=uber_df, x='month', hue='CATEGORY*', palette="Set1",
              order=uber_df['month'].value_counts().index)
plt.show()

<a id="5"></a>
<font color="darkslateblue" size=+2.5><b>5. Conclusion</b></font>

<a href="#top" class="btn btn-primary btn-sm" role="button" aria-pressed="true" style="color:white" data-toggle="popover">Go to TOC</a>


- The miles driven per trip range from 0.5 miles to 310.3 miles with an average of 21 miles
- There are 131 unique start destinations in the dataset
- There are 137 unique stop destinations in the dataset
- Cary is the most popular starting point for this driver.
- Cary also features in the most popular stop destinations
- Cary and Durham have the highest total mileage of any start/stop pair
- The most popular start to destination pair is Cary-Morrisville
- Ride durations range from 2 minutes to 330 minutes with an average duration of 23 minutes
- December has maximum number of trips and August has the least
- Friday has the maximum number of trips
- Longest average distance is covered in Oct and least in Dec
- Afternoons and evenings seem to have the maximum number of trips
- Most trips are in the business category
- Most trips are for meetings
- There are more number of short distanced trips
- There are more number of day trips
- Both highest number of Day and Night Rides were below 10miles of a distance
- Highest number of Day rides were on Friday while Night rides were on Sunday/Thursday