In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import os

In [None]:
uber_apr14= pd.read_csv(r'../input/uber-pickups-in-ny-city/uber-pickups-in-new-york-city/uber-raw-data-apr14.csv',
                        encoding='utf-8')
uber_apr14.head()

In [None]:
files=os.listdir(r'../input/uber-pickups-in-ny-city/uber-pickups-in-new-york-city')
files

In [None]:
# Keep only the raw pickup files; anything else in the folder is ignored.
files = [name for name in files if 'uber-raw-data' in name]
files

In [None]:
# The Jan-June 2015 file uses a different schema (it has a 'Pickup_date'
# column instead of 'Date/Time') and is loaded on its own further down.
# Filtering instead of list.remove() keeps this cell safe to re-run:
# remove() raises ValueError once the entry is already gone.
files = [name for name in files if name != 'uber-raw-data-janjune-15.csv']
files

In [None]:
path=r'../input/uber-pickups-in-ny-city/uber-pickups-in-new-york-city'

# Read every monthly file once and concatenate in a single call.
# Calling pd.concat inside the loop re-copies all rows accumulated so far
# on every iteration, which is needlessly quadratic for files this large.
monthly_frames = [pd.read_csv(path + "/" + file, encoding='latin-1') for file in files]

# The original loop prepended each new file, so the last file read ended up
# first; reversing the list keeps that row order.
# pd.concat raises on an empty list, so fall back to a blank dataframe.
final = pd.concat(monthly_frames[::-1]) if monthly_frames else pd.DataFrame()
final.head()

In [None]:
final.shape

In [None]:
df=final.copy()
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
# Parse the pickup timestamp strings (month/day/year hour:minute:second)
# into real datetimes so the .dt accessors can be used afterwards.
pickup_times = pd.to_datetime(df['Date/Time'], format="%m/%d/%Y %H:%M:%S")
df['Date/Time'] = pickup_times
df.info()

In [None]:
# Derive calendar features from the parsed pickup timestamp.
stamp = df['Date/Time'].dt
df['weekday'] = stamp.day_name()
df['day'] = stamp.day
df['minute'] = stamp.minute
df['month'] = stamp.month
df['hour'] = stamp.hour
df.info()

In [None]:
df.head()

In [None]:
df['Base'].unique()

In [None]:
df['day'].unique()

In [None]:
df['weekday'].unique()

# Analysis of journeys by weekday

In [None]:
import plotly.express as px

# Count journeys per weekday once and reuse the result for both axes.
weekday_counts = df['weekday'].value_counts()
px.bar(x=weekday_counts.index, y=weekday_counts.values)

# Analysis by Hour

In [None]:
# Count journeys per hour of day once and reuse the result for both axes.
hour_counts = df['hour'].value_counts()
px.bar(x=hour_counts.index, y=hour_counts.values)

# Analysis of the hourly rush in each month

In [None]:
# List the months present in the data, in order of first appearance.
for month in df['month'].unique():
    print(month)

In [None]:
# One histogram of pickup hours per month, laid out on a 3x2 grid
# (the grid has room for at most six months).
plt.figure(figsize=(20, 20))
for position, month in enumerate(df['month'].unique(), start=1):
    plt.subplot(3, 2, position)
    plt.title(str(month))
    hours_in_month = df.loc[df['month'] == month, 'hour']
    hours_in_month.hist(bins='auto')

# Analysis of which month has max rides

In [None]:
!pip install chart_studio

In [None]:
import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [None]:
# Number of rides per month. Each row of df is one pickup, so the ride count
# is the group size. Summing the 'hour' column (as done previously) adds up
# hour-of-day values and does not measure how many rides took place.
rides_per_month = df.groupby('month').size()
trace1 = go.Bar(
        x = rides_per_month.index,
        y = rides_per_month,
        name= 'Priority')
iplot([trace1])

# Analysis of Journey of Each Day

In [None]:
# Histogram of journeys per day of the month.
# Days run from 1 to 31, so use 31 unit-wide bins centred on each day;
# a range ending at 30.5 silently discards every pickup made on the 31st.
plt.figure(figsize=(10,6))
plt.hist(df['day'], bins=31, rwidth=.8, range=(0.5, 31.5))
plt.xlabel('date of the month')
plt.ylabel('Total Journeys')
plt.title('Journeys by Month Day')
plt.grid()

# Analysis of Total rides month wise

In [None]:
# One histogram of day-of-month per calendar month, on a 3x2 grid.
plt.figure(figsize=(20, 20))
for position, month in enumerate(df['month'].unique(), start=1):
    df_out = df[df['month'] == month]
    plt.subplot(3, 2, position)
    plt.title(str(month))
    plt.hist(df_out['day'], bins='auto')
    plt.xlabel('days in month')
    plt.ylabel('total rides')
    plt.grid()

# Getting Rush in hour

In [None]:
# Apply the seaborn style before the figure exists so it is in effect for
# everything drawn in this cell.
sns.set_style(style='whitegrid')
plt.figure(figsize=(10,6))
# Point estimate of pickup latitude for each hour of the day.
sns.pointplot(x="hour",y="Lat",data=df)
# plt.grid() without arguments *toggles* the grid; under 'whitegrid' that
# switches already-visible grid lines off. Ask for the grid explicitly.
plt.grid(True)

In [None]:
# Same hour-vs-latitude view, split into one line per weekday.
plt.figure(figsize=(10,6))
ax=sns.pointplot(x="hour",y="Lat", hue="weekday",data=df)
# Title previously contained typos ('hoursoffday', 'latiitide').
ax.set_title('Hour of day vs latitude of passenger')
# Explicit True: a bare plt.grid() toggles and can hide an existing grid.
plt.grid(True)

# Analysing which base number is most popular in each month

In [None]:
df.head()

In [None]:
df['Base'].head()

In [None]:
df.groupby(['Base','month'])['Date/Time'].count()

In [None]:
# Rides per (base, month); after reset_index the count sits in the
# 'Date/Time' column of the resulting frame.
rides_by_base_month = df.groupby(['Base', 'month'])['Date/Time'].count()
base = rides_by_base_month.reset_index()
base

# Analysing which base number is most popular in each month

In [None]:
plt.figure(figsize=(10,6))
sns.lineplot(x='month',y='Date/Time',hue='Base',data=base);

# Heatmap by Hour and Weekday.

In [None]:
def count_rows(rows):
    """Return the number of items in ``rows`` (anything that supports ``len``)."""
    n_rows = len(rows)
    return n_rows

In [None]:
# Number of pickups for every (weekday, hour) pair.
# .size() is the built-in group counter: it yields the same counts as
# applying a len() wrapper to each group, without running Python code per
# group and without the grouping-column warning that apply() triggers on
# recent pandas versions.
by_cross = df.groupby(['weekday','hour']).size()
by_cross

In [None]:
pivot=by_cross.unstack()
pivot

In [None]:
plt.figure(figsize=(10,6))
sns.heatmap(pivot, annot=False);

In [None]:
df.head()

In [None]:
def heatmap(col1, col2, data=None):
    """Draw a heatmap of row counts for every (col1, col2) combination.

    Parameters
    ----------
    col1 : str
        Column whose values become the heatmap rows.
    col2 : str
        Column whose values become the heatmap columns.
    data : pandas.DataFrame, optional
        Frame to count rows in. Defaults to the notebook-level ``df`` so
        existing two-argument calls keep working.

    Returns
    -------
    The Axes object returned by ``sns.heatmap``.
    """
    frame = df if data is None else data
    # size() counts the rows in each group directly; no per-group lambda needed.
    by_cross = frame.groupby([col1, col2]).size()
    # Rows indexed by col1, one column per col2 value.
    pivot = by_cross.unstack()
    plt.figure(figsize=(10, 6))
    return sns.heatmap(pivot, annot=False)

In [None]:
## validating above Analysis through Heatmap
heatmap('day','hour');

In [None]:
heatmap('day','month');

We observe that the number of trips increases each month. From April to September 2014, Uber's ridership grew continuously.

In [None]:
df[df['month']==4].head()

In [None]:
heatmap('weekday','month');

# Analysis of Location data points

In [None]:
from folium.plugins import HeatMap

In [None]:
import folium
from folium.plugins import HeatMap
basemap=folium.Map()

In [None]:
# Bounding box of all pickups as (lon_min, lon_max, lat_min, lat_max).
lon_bounds = (df['Lon'].min(), df['Lon'].max())
lat_bounds = (df['Lat'].min(), df['Lat'].max())
BBox = lon_bounds + lat_bounds
BBox

In [None]:
ny_m = plt.imread('../input/new-york-city/map.png')

In [None]:
# Scatter every pickup over the map image, with both stretched to the
# bounding box computed above.
lon_min, lon_max, lat_min, lat_max = BBox
fig, ax = plt.subplots(figsize=(20, 6))
ax.plot(df['Lon'], df['Lat'], 'r+', ms=0.5)
ax.set_title('Plotting Spatial Data on New York Map')
ax.set_xlim(lon_min, lon_max)
ax.set_ylim(lat_min, lat_max)
ax.imshow(ny_m, zorder=0, extent=BBox, aspect='equal')
plt.show()

# Spatial analysis using a heatmap to get a clear picture of the rush on Sunday (weekend)

In [None]:
df.head()

In [None]:
df_out=df[df['weekday']=='Sunday']
df_out.head()

In [None]:
df_out.groupby(['Lat','Lon'])['weekday'].count().reset_index()

In [None]:
# Pickup count per (Lat, Lon) for the Sunday subset, added to the shared
# base map as a heat layer.
sunday_counts = df_out.groupby(['Lat', 'Lon'])['weekday'].count().reset_index()
sunday_layer = HeatMap(sunday_counts, zoom=20, radius=15)
sunday_layer.add_to(basemap)
basemap

In [None]:
# Bounding box of the Sunday pickups as (lon_min, lon_max, lat_min, lat_max).
lon_bounds = (df_out['Lon'].min(), df_out['Lon'].max())
lat_bounds = (df_out['Lat'].min(), df_out['Lat'].max())
BBox = lon_bounds + lat_bounds
BBox

In [None]:
# Scatter the Sunday pickups over the map image, with both stretched to the
# Sunday bounding box.
lon_min, lon_max, lat_min, lat_max = BBox
fig, ax = plt.subplots(figsize=(20, 6))
ax.plot(df_out['Lon'], df_out['Lat'], 'r+', ms=0.5)
ax.set_title('Plotting Spatial Data on New York Map')
ax.set_xlim(lon_min, lon_max)
ax.set_ylim(lat_min, lat_max)
ax.imshow(ny_m, zorder=0, extent=BBox, aspect='equal')
plt.show()

In [None]:
def plot(df, day, base_map=None):
    """Return a folium map with a heat layer of pickups for one weekday.

    Parameters
    ----------
    df : pandas.DataFrame
        Pickups with 'weekday', 'Lat' and 'Lon' columns.
    day : str
        Weekday name to keep, compared against the 'weekday' column.
    base_map : folium.Map, optional
        Map to draw on. When omitted a new, empty map is created.

    Returns
    -------
    folium.Map
        The map carrying the new heat layer.

    Notes
    -----
    This name shadows ``plot`` imported from ``plotly.offline`` earlier in
    the notebook.
    """
    # Use a fresh map unless the caller supplies one: adding to the shared
    # notebook-level `basemap` stacked each weekday on top of the layers that
    # earlier cells had already put there, mixing several days in one picture.
    target_map = folium.Map() if base_map is None else base_map
    day_rides = df[df['weekday'] == day]
    # Pickup count per (Lat, Lon), passed to HeatMap as the third column.
    pickup_counts = day_rides.groupby(['Lat', 'Lon'])['weekday'].count().reset_index()
    HeatMap(pickup_counts, zoom=20, radius=15).add_to(target_map)
    return target_map

In [None]:
plot(df,'Saturday')

# Analysis of Jan-June uber_15

In [None]:
uber_15 = pd.read_csv(r'../input/uber-pickups-in-ny-city/uber-pickups-in-new-york-city/uber-raw-data-janjune-15.csv',
                      encoding='utf-8')
uber_15.head()

In [None]:
uber_15.shape

In [None]:
#Checking the minimum date in the uber_15
uber_15['Pickup_date'].min()

In [None]:
#Checking the maximum date in the uber_15
uber_15['Pickup_date'].max()

In [None]:
uber_15['Pickup_date'] =  pd.to_datetime(uber_15['Pickup_date'], format='%Y-%m-%d %H:%M:%S')
uber_15.head()

In [None]:
# Derive calendar features from the parsed 2015 pickup timestamp.
pickup_stamp = uber_15['Pickup_date'].dt
uber_15['weekday'] = pickup_stamp.day_name()
uber_15['day'] = pickup_stamp.day
uber_15['minute'] = pickup_stamp.minute
uber_15['month'] = pickup_stamp.month
uber_15['hour'] = pickup_stamp.hour
uber_15.head()

In [None]:
# Count 2015 pickups per month once and reuse the result for both axes.
month_counts_15 = uber_15['month'].value_counts()
px.bar(x=month_counts_15.index, y=month_counts_15.values)

We can see that the number of Uber pickups in NYC increased steadily throughout the first half of 2015.


# Analysing the rush in New York City

In [None]:
# Number of 2015 pickups for each hour of the day.
plt.figure(figsize=(20,6))
ax=sns.countplot(data=uber_15,x='hour')
# Explicit True: a bare plt.grid() toggles the grid, which switches it off
# when the active seaborn style already draws one.
plt.grid(True)

Interestingly, after the morning rush, the number of Uber pickups doesn't dip much throughout the rest of the morning and early afternoon. There is significantly more demand in the evening than the daytime. Let's investigate to see if there's a difference in hourly pattern for different days of the week.

# In-depth analysis of the rush in New York City by day and hour

In [None]:
uber_15.groupby(['weekday', 'hour'])['Pickup_date'].count()

In [None]:
summary=uber_15.groupby(['weekday', 'hour'])['Pickup_date'].count().reset_index()
summary

In [None]:
summary=summary.rename(columns = {'Pickup_date':'Counts'})
summary

In [None]:
# Pickup counts per hour, one line per weekday.
plt.figure(figsize=(20,6))
sns.pointplot(x="hour", y="Counts", hue="weekday", data=summary)
# Explicit True: a bare plt.grid() toggles the grid, which switches it off
# when the active seaborn style already draws one.
plt.grid(True)

In [None]:
uber_foil=pd.read_csv(r'../input/uber-pickups-in-ny-city/uber-pickups-in-new-york-city/Uber-Jan-Feb-FOIL.csv')
uber_foil.head()

In [None]:
uber_foil['dispatching_base_number'].unique()

In [None]:
plt.figure(figsize=(20,6))
sns.boxplot(x = 'dispatching_base_number', y = 'active_vehicles', data = uber_foil)
plt.show()

# Base B02764 seems to have the highest number of active vehicles

In [None]:
plt.figure(figsize=(20,6))
sns.violinplot(x = 'dispatching_base_number', y = 'active_vehicles', data = uber_foil)
plt.show()

Base B02764 seems to have the highest number of trips.

In [None]:
# Trips served per active vehicle, computed row by row.
trips_per_vehicle = uber_foil['trips'].div(uber_foil['active_vehicles'])
uber_foil['trips/vehicle'] = trips_per_vehicle
uber_foil.head()

In [None]:
uber_foil.set_index('date').head()

# How average trips per vehicle increases or decreases over time for each base number

In [None]:
# One line per dispatching base: trips per vehicle against date.
plt.figure(figsize=(20, 6))
ratio_by_base = uber_foil.set_index('date').groupby(['dispatching_base_number'])['trips/vehicle']
ratio_by_base.plot()
plt.ylabel('Average trips/vehicle')
plt.title('Demand vs Supply chart (Date-wise)')
plt.legend();