In [None]:
'''
1 ) "uber-raw-data-janjune-15.csv" ->> this data contains all the entries/pickups from 'January' to 'June'
    Quite huge dataset having approx 15M data pts , so lets consider its sample which have approx 1M


2 ) "uber-raw-data-janjune-15_sample.csv" ->> this data is a sample of "uber-raw-data-janjune-15.csv"
    'Since above data is quite huge ~15 Million data pts , hence it is good to work with some sample 
     if u do not have good specifications in your systems
    
'''

# 1.. Lets Read data for Analysis

In [3]:
### lets import all the necessary packages !

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
import os

In [None]:
os.listdir(r"Datasets/")

In [None]:
uber_15 = pd.read_csv(r"Datasets/uber-raw-data-janjune-15_sample.csv")

In [None]:
uber_15.shape

## 2.. Lets Perform Data pre-processing/Data cleaning !
        check data-type , check missing values , check whether duplicated values or not !
        ie Prepare Data for Analysis !

In [None]:
type(uber_15)

In [None]:
uber_15.duplicated().sum()

In [None]:
uber_15.drop_duplicates(inplace=True)

In [None]:
uber_15.duplicated().sum()

In [None]:
uber_15.shape

In [None]:
uber_15.dtypes

In [None]:
uber_15.isnull().sum()

In [None]:
uber_15['Pickup_date'][0]

In [None]:
type(uber_15['Pickup_date'][0])

In [None]:
uber_15['Pickup_date'] = pd.to_datetime(uber_15['Pickup_date'])

In [None]:
uber_15['Pickup_date'].dtype

In [None]:
uber_15['Pickup_date'][0]

In [None]:
type(uber_15['Pickup_date'][0])

In [None]:
uber_15.dtypes

# 3.. Which month have max. Uber pickups in New York City ?

In [None]:
uber_15

In [None]:
uber_15['month'] = uber_15['Pickup_date'].dt.month_name()

In [None]:
uber_15['month']

In [None]:
uber_15['month'].value_counts().plot(kind='bar')

In [None]:
'''
Inference : June seems to have max Uber Pickups 
'''

In [None]:
## extracting dervied features (weekday ,day ,hour ,month ,minute) from 'Pickup_date'..

uber_15['weekday'] = uber_15['Pickup_date'].dt.day_name()
uber_15['day'] = uber_15['Pickup_date'].dt.day
uber_15['hour'] = uber_15['Pickup_date'].dt.hour
uber_15['minute'] = uber_15['Pickup_date'].dt.minute

In [None]:
uber_15.head(4)

In [None]:
## pd.crosstab() is used to create pivot table ..

pivot = pd.crosstab(index=uber_15['month'] , columns=uber_15['weekday'])

In [None]:
pivot

In [None]:
## grouped-bar plot using Pandas ..
pivot.plot(kind='bar' , figsize=(8,6))

In [None]:
'''

On Saturday & Friday, u are getting more Uber pickups in each month , it seems that New Yorkers used to go for 
shopping , Malls , fun activities alot on these days

'''

# 4.. Lets Find out Hourly Rush in New york city on all days

In [None]:
summary = uber_15.groupby(['weekday' , 'hour'] , as_index=False).size()

In [None]:
summary

In [None]:
## pointplot between 'hour' & 'size' for all the weekdays..

plt.figure(figsize=(8,6))
sns.pointplot(x="hour" , y="size" , hue="weekday" , data=summary)

In [None]:
'''
It's interesting to see that Saturday and Sunday exhibit similar demand throughout the late night/morning/afternoon, 
but it exhibits opposite trends during the evening. In the evening, Saturday pickups continue to increase throughout the evening,
but Sunday pickups takes a downward turn after evening..

We can see that there the weekdays that has the most demand during the late evening is Friday and Saturday, 
which is expected, but what strikes me is that Thursday nights also exhibits very similar trends as Friday and Saturday nights.

It seems like New Yorkers are starting their 'weekends' on Thursday nights. :)


'''

# 5.. Which Base_number has most number of Active Vehicles ??

In [None]:
uber_15.columns

In [None]:
os.listdir(r"Datasets/")

In [None]:
uber_foil = pd.read_csv(r"Datasets/Uber-Jan-Feb-FOIL.csv")

In [None]:
uber_foil.shape

In [None]:
uber_foil.head(3)

In [None]:
### establishing the entire set-up of Plotly..

In [None]:
# pip install chart-studio ## chart_studio provides a web-service for hosting graphs!
# pip install plotly
# pip install --upgrade nbformat
# 

In [None]:
import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly.express as px

from plotly.offline import download_plotlyjs , init_notebook_mode , plot , iplot 
## iplot() when working in a Jupyter Notebook to display the plot in the notebook.
## U have to do a proper setup of plotly , otherwise plotly plots gets open in a web-browser instead of Jupyter notebook


In [None]:
init_notebook_mode(connected=True)

In [None]:
uber_foil.columns

In [None]:
px.box(x='dispatching_base_number' , y='active_vehicles' , data_frame=uber_foil)

In [None]:
### if u need distribution +  5-summary stats of data , its good to go with violinplot
px.violin(x='dispatching_base_number' , y='active_vehicles' , data_frame=uber_foil)

# 6.. Collect entire data & Make it ready for the Data Analysis..

In [5]:
files = os.listdir(r"Datasets/")[-8:]

In [None]:
files.remove('uber-raw-data-janjune-15.csv')

In [None]:
files

In [None]:
files.remove('uber-raw-data-janjune-15_sample.csv')

In [6]:
files

['uber-raw-data-apr14.csv',
 'uber-raw-data-aug14.csv',
 'uber-raw-data-janjune-15.csv',
 'uber-raw-data-janjune-15_sample.csv',
 'uber-raw-data-jul14.csv',
 'uber-raw-data-jun14.csv',
 'uber-raw-data-may14.csv',
 'uber-raw-data-sep14.csv']

In [1]:
#blank dataframe
final = pd.DataFrame()

path = r"Datasets/"

for file in files :
    current_df = pd.read_csv(path+'/'+file)
    final = pd.concat([current_df , final])

NameError: name 'pd' is not defined

In [None]:
final.shape

In [None]:
### After Collecting entire data ,u might ask is : Do we have duplicate entires in data ?
### We are going to remove duplicates data when the entire row is duplicated

In [None]:
### first lets figure out total observations where we have duplicate values..
final.duplicated().sum()

In [None]:
## drop duplicate rows ..
final.drop_duplicates(inplace=True)

In [None]:
final.shape

In [None]:
final.head(3)

## Dataset Information : 

### The dataset contains information about the Datetime, Latitude, Longitude and Base of each uber ride that happened in the month of July 2014 at New York City, USA

##### Date/Time : The date and time of the Uber pickup

##### Lat : The latitude of the Uber pickup

##### Lon : The longitude of the Uber pickup

##### Base : The TLC base company code affiliated with the Uber pickup

    The Base codes are for the following Uber bases:
    B02512 : Unter
    B02598 : Hinter
    B02617 : Weiter
    B02682 : Schmecken
    B02764 : Danach-NY


# 7.. at what locations of New York City we are getting rush ??

In [None]:
### ie where-ever we have more data-points or more density, it means more rush is at there !

In [None]:
rush_uber = final.groupby(['Lat' , 'Lon'] , as_index=False).size()

In [None]:
rush_uber.head(6)

In [None]:
#pip install folium

In [None]:
import folium

In [None]:
basemap = folium.Map()

In [None]:
basemap

In [None]:
from folium.plugins import HeatMap

In [None]:
HeatMap(rush_uber).add_to(basemap)

In [None]:
basemap

    We can see a number of hot spots here. Midtown Manhattan is clearly a huge bright spot
    & these are made from Midtown to Lower Manhattan followed by Upper Manhattan and the Heights of Brooklyn.

# 8.. Examine rush on Hour and Weekday ( Perform Pair wise Analysis )

In [None]:
final.columns

In [None]:
final.head(3)

In [None]:
final.dtypes

In [None]:
final['Date/Time'][0]

In [None]:
### converting 'Date/Time' feature into date-time..

final['Date/Time'] = pd.to_datetime(final['Date/Time'] , format="%m/%d/%Y %H:%M:%S")

In [None]:
final['Date/Time'].dtype

In [None]:
### extracting 'weekday' & 'hour' from 'Date/Time' feature..

final['day'] = final['Date/Time'].dt.day
final['hour'] = final['Date/Time'].dt.hour

In [None]:
final.head(4)

In [None]:
'''
Earlier we have learnt how to create pivot table using pd.crosstab() , now let me show u one more way to build
pivot_table without pd.crosstab()

'''

In [None]:
pivot = final.groupby(['day' , 'hour']).size().unstack()

In [None]:
pivot

### pivot table is all about  , we have Rows*columns & having value in each cell !

In [None]:
### styling dataframe

pivot.style.background_gradient()