In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the trip log into a DataFrame.
csv_path = '/kaggle/input/uberdrives/My Uber Drives - 2016.csv'
data = pd.read_csv(csv_path)

### First, we need to understand the data

In [None]:
# Preview the first rows of the loaded frame.
data.head()

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Print the column names, non-null counts and dtypes.
data.info()

#### Above info shows the data types

In [None]:
# Count the missing values in each column.
data.isna().sum()

#### From the output above, it is clear that there is one null row; removing it helps the further analysis

In [None]:
# Show the rows whose END_DATE* is missing.
data.loc[data['END_DATE*'].isna()]

In [None]:
# Drop the row(s) with a missing END_DATE* by condition instead of by the
# hard-coded position 1155: this does not depend on the file length and the
# cell can be re-run safely (a second positional drop would hit another row
# or raise IndexError).
data.dropna(subset=['END_DATE*'], inplace=True)

### Duplicated rows need to be found and removed

In [None]:
# Show the rows that are exact duplicates of an earlier row.
data.loc[data.duplicated()]

In [None]:
# Remove fully duplicated rows in place (the index is not reset, so row labels keep their gaps).
data.drop_duplicates(inplace=True)

In [None]:
# Re-count the missing values per column after the drops above.
data.isna().sum()

### There are 4 rows whose start and end times are exactly equal, i.e. zero trip time, while having non-zero miles (distance) - which is not possible.

In [None]:
# Show the trips whose start and end timestamps are identical.
same_start_end = data['START_DATE*'].eq(data['END_DATE*'])
data.loc[same_start_end]

#### Removing them from our dataset

In [None]:
# Remove the zero-duration trips by condition. Hard-coded positions into
# data.index stop matching the displayed row labels once earlier cells have
# dropped rows (the index is never reset), so positional drops can remove
# the wrong trips and leave the zero-duration ones in place.
zero_duration = data['START_DATE*'] == data['END_DATE*']
data.drop(data[zero_duration].index, inplace=True)

In [None]:
# (rows, columns) after the cleaning steps above.
data.shape

### Converting 'START_DATE*' and 'END_DATE*' to Datetime format

In [None]:
# Parse both timestamp columns from text into datetime64 values.
for date_col in ('START_DATE*', 'END_DATE*'):
    data[date_col] = pd.to_datetime(data[date_col])

### Plotting Business vs Personal Trips

In [None]:
plt.figure(figsize=(8,5))
# The column is passed through the `x` keyword: seaborn >= 0.12 treats the
# only positional argument as `data`, so a positional Series is no longer
# interpreted as the x variable.
sns.countplot(data=data, x='CATEGORY*')
plt.show()

### Plotting the Frequency for the Purpose of Trip

In [None]:
# Bars are ordered from the most to the least frequent purpose.
purpose_order = data['PURPOSE*'].value_counts().index

plt.figure(figsize=(15,6))
# `x` is given by keyword because seaborn >= 0.12 treats the only positional
# argument as `data`.
sns.countplot(data=data, x='PURPOSE*', order=purpose_order, palette='viridis')
plt.show()

### Checking how many are the ROUND TRIPS i.e. start and stop points are same

In [None]:
# For this purpose, we need to make a function.
# The figure is opened here; the countplot at the end of this cell draws into it.
plt.figure(figsize=(8,5))
def round(x):
    """Label a trip row as a round trip or not.

    Returns 'YES' when the row's 'START*' and 'STOP*' values compare equal,
    and 'NO' otherwise.

    Note: this definition shadows Python's builtin ``round`` for every cell
    that runs after it.
    """
    same_place = x['START*'] == x['STOP*']
    return 'YES' if same_place else 'NO'
    
# Flag every trip as a round trip ('YES') or not ('NO').
data['ROUND_TRIP*'] = data.apply(round, axis=1)

# `x` is given by keyword because seaborn >= 0.12 treats the only positional
# argument as `data`.
sns.countplot(data=data, x='ROUND_TRIP*', order=data['ROUND_TRIP*'].value_counts().index, palette='viridis')
plt.show()

#### Round trips are fewer in number

### Entering Month Name and evaluating Frequency of Trip in each Month 

In [None]:
# Month number (1-12) of each trip's start date.
data['MONTH*'] = data['START_DATE*'].dt.month

In [None]:
# Month labels in calendar order; position i (1-based) is the label for month i.
month_labels = ['Jan', 'Feb', 'Mar', 'April', 'May', 'June',
                'July', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
dic = dict(enumerate(month_labels, start=1))

# Replace the month numbers with their labels.
data['MONTH*'] = data['MONTH*'].map(dic)

In [None]:
# Trips per month, sorted from busiest to quietest; reused for the bar order
# and for the mean reference line.
trips_per_month = data['MONTH*'].value_counts()

plt.figure(figsize=(12,7))
# `x` is given by keyword because seaborn >= 0.12 treats the only positional
# argument as `data`.
sns.countplot(data=data, x='MONTH*', order=trips_per_month.index, palette='magma')
plt.axhline(trips_per_month.mean(), linestyle='--', color='darkred', label='Mean Trips across Months')
plt.legend()
plt.show()

#### The plot above shows that December has the highest number of trips

### Round Trip against Months

In [None]:
plt.figure(figsize=(12,7))
# `x` is given by keyword because seaborn >= 0.12 treats the only positional
# argument as `data`. The returned Axes was never used, so it is not bound
# to a name.
sns.countplot(data=data, x='ROUND_TRIP*', hue='MONTH*')
# Place the legend outside the plotting area, to the right of the axes.
plt.legend(bbox_to_anchor=(1.05, 0.95), loc=2)
plt.show()

### Mean distance of Trip w.r.t Category

In [None]:
# Mean trip distance per category, and the overall mean for the reference line.
mean_miles_by_category = data.groupby('CATEGORY*')['MILES*'].mean()
overall_mean_miles = data['MILES*'].mean()

plt.figure(figsize=(8,5))
mean_miles_by_category.plot.bar(color=['maroon','darksalmon'])
plt.axhline(overall_mean_miles, linestyle='--', color='green', label='Mean distance')
plt.legend()
plt.show()

### Now we need to know which hour of the day has the highest number of trips

In [None]:
# Hour of day (0-23) in which each trip started.
start_hour = data['START_DATE*'].dt.hour

plt.figure(figsize=(15,8))
# `x` is given by keyword because seaborn >= 0.12 treats the only positional
# argument as `data`. Bars are ordered from the busiest hour downwards.
sns.countplot(x=start_hour, palette='cubehelix',
              order=start_hour.value_counts().index)
plt.show()

#### From the above graph, the highest number of trips occur between 12 p.m. and 3 p.m., with 3 p.m. having the most trips

### Now we want to determine the speed for each trip

#### For that we will first find the trip duration and then speed

In [None]:
def timings(x):
    """Return the duration of a trip row in seconds.

    Parameters
    ----------
    x : row-like mapping holding 'START_DATE*' and 'END_DATE*' datetime values.

    Returns
    -------
    float
        Seconds elapsed between start and end.

    ``total_seconds()`` is used instead of the ``.seconds`` attribute because
    ``.seconds`` only holds the sub-day component of a timedelta, so a trip
    lasting 24 hours or more would silently lose its whole days.
    """
    elapsed = x['END_DATE*'] - x['START_DATE*']
    return elapsed.total_seconds()

# Trip duration in hours, rounded to two decimals.
trip_seconds = data.apply(timings, axis=1)
data['TRIP_HOURS*'] = (trip_seconds / 3600).round(2)

In [None]:
def speed(x):
    """Return the average speed of a trip row in km/h.

    Parameters
    ----------
    x : row-like mapping holding 'START_DATE*', 'END_DATE*', 'MILES*' and
        'TRIP_HOURS*'.

    Returns
    -------
    float
        ``MILES* * 1.61 / TRIP_HOURS*``, or NaN when no speed can be computed:
        start and end are equal, or the duration is zero, negative or missing.
        An explicit NaN replaces the implicit ``None`` return, and the
        duration guard avoids dividing by zero when a very short trip rounds
        to 0 hours.
    """
    hours = x['TRIP_HOURS*']
    # `not hours > 0` is also true for NaN, which `hours <= 0` would miss.
    if x['START_DATE*'] == x['END_DATE*'] or not hours > 0:
        return float('nan')
    # 1.61 converts miles to kilometres.
    return x['MILES*'] * 1.61 / hours
# Per-trip speed, rounded to two decimals.
trip_speeds = data.apply(speed, axis=1)
data['SPEED_KM/HR*'] = trip_speeds.round(2)

In [None]:
# Inspect the computed speed column.
data['SPEED_KM/HR*']

### Now we are interested in getting to know which day of week has highest number of trips

In [None]:
def day(x):
    """Return the weekday name (e.g. 'Friday') of the row's 'START_DATE*'."""
    return x['START_DATE*'].day_name()

# Store the weekday name of each trip's start date, computed row by row with day().
data['DAY*'] = data.apply(day, axis=1)

In [None]:
plt.figure(figsize=(7,5))
# `x` is given by keyword because seaborn >= 0.12 treats the only positional
# argument as `data`. Bars are ordered from the busiest weekday downwards.
sns.countplot(data=data, x='DAY*', order=data['DAY*'].value_counts().index)
plt.show()

#### The figure above shows that the highest number of trips were made on Friday

### Category wise trip per day

In [None]:
plt.figure(figsize=(9,6))
# `x` is given by keyword because seaborn >= 0.12 treats the only positional
# argument as `data`. The returned Axes was never used, so it is not bound
# to a name.
sns.countplot(data=data, x='DAY*', hue='CATEGORY*', palette='rocket',
              order=data['DAY*'].value_counts().index)
# Place the legend outside the plotting area, to the right of the axes.
plt.legend(bbox_to_anchor=(1.05, 0.95), loc=2)
plt.show()

### Category of Trip per Month

In [None]:
plt.figure(figsize=(9,6))
# `x` is given by keyword because seaborn >= 0.12 treats the only positional
# argument as `data`. Bars are ordered from the busiest month downwards.
sns.countplot(data=data, x='MONTH*', hue='CATEGORY*', palette='viridis_r',
              order=data['MONTH*'].value_counts().index)
plt.show()

#### It can be seen clearly from the above graph that most of the trips made are Business across the month, only few Personal trips were made. Also, highest Business trips were made in the month of December.

#### Meanwhile, the highest number of Personal trips were made in March/July

### Next, we find the hours of the day in which the most trips are made

In [None]:
# Hour of day (0-23) in which each trip started.
data['TIME*'] = data['START_DATE*'].dt.hour

# Trips per start hour, split by weekday; columns are referenced by name
# through the `data` frame.
plt.figure(figsize=(20,10))
sns.countplot(data=data, x='TIME*', hue='DAY*', palette="viridis")
plt.show()

#### From the above plot, it can be seen that on Friday, 11am - 12pm are the peak hours for trips

### Now we want to know how many trips were made during the day and how many at night

In [None]:
# Trips starting strictly after 18:00 count as night rides. The threshold is
# a scalar datetime.time; the previous one-element array only worked because
# a length-1 comparison result happens to be usable as a truth value.
night_start = pd.Timestamp('18:00:00').time()

data['DAY/NIGHT*'] = data['START_DATE*'].map(
    lambda started: 'Night Ride' if started.time() > night_start else 'Day Ride'
)

# `x` is given by keyword because seaborn >= 0.12 treats the only positional
# argument as `data`.
sns.countplot(data=data, x='DAY/NIGHT*', palette='cubehelix', order=data['DAY/NIGHT*'].value_counts().index)
plt.show()

#### Maximum trips were made during Day Time

### Day/Night against Days

In [None]:
plt.figure(figsize=(9,7))
# `x` is given by keyword because seaborn >= 0.12 treats the only positional
# argument as `data`. Bars are ordered from the busiest weekday downwards.
sns.countplot(data=data, x='DAY*', hue='DAY/NIGHT*', palette='viridis',
              order=data['DAY*'].value_counts().index)
plt.show()

#### Highest number of Day rides were on Friday while Night rides were on Sunday/Thursday

### Converting the continuous Miles into buckets

In [None]:
# Bin the trip distance into 10-mile buckets with pd.cut. This replaces a
# doubly nested loop over the same column (quadratic work to fill a lookup
# dict) with one vectorised call.
# right=False makes every bin half-open [low, high), matching the original
# thresholds: < 10, 10 <= m < 20, ..., and 50 or more in the last bucket.
bucket_edges = [-np.inf, 10, 20, 30, 40, 50, np.inf]
bucket_labels = ['0-10 miles', '10-20 miles', '20-30 miles',
                 '30-40 miles', '40-50 miles', 'Above 50 miles']

# Cast back to plain strings so the column behaves as before downstream
# (value_counts() lists only the buckets that actually occur).
data['MILES_BUCKET*'] = pd.cut(
    data['MILES*'], bins=bucket_edges, labels=bucket_labels, right=False
).astype(object)

plt.figure(figsize=(10,6))
# `x` is given by keyword because seaborn >= 0.12 treats the only positional
# argument as `data`. Bars are ordered from the most common bucket downwards.
sns.countplot(data=data, x='MILES_BUCKET*', palette='cubehelix',
              order=data['MILES_BUCKET*'].value_counts().index)
plt.show()

#### One can see that short-distance trips are the most numerous

In [None]:
plt.figure(figsize=(10,7))
# `x` is given by keyword because seaborn >= 0.12 treats the only positional
# argument as `data`. Bars are ordered from the most common bucket downwards.
sns.countplot(data=data, x='MILES_BUCKET*', hue='DAY/NIGHT*', palette='YlOrBr',
              order=data['MILES_BUCKET*'].value_counts().index)
plt.show()

#### For both Day and Night rides, the highest number of trips were under 10 miles in distance