In [None]:
%pylab inline
import pandas as pd
import seaborn as sns

In [None]:
# Read the Aug-2014 pickup CSV into a DataFrame; the path is relative to the
# notebook's working directory.
RAW_CSV_PATH = '../input/uber-pickups-in-new-york-city/uber-raw-data-aug14.csv'
uber_data = pd.read_csv(RAW_CSV_PATH)

## Loading data

In [None]:
# Preview the first rows to see which columns the raw file provides.
uber_data.head()

## Converting date column to datetime

In [None]:
# Parse the whole column in one vectorized call. Mapping pd.to_datetime over
# the Series invokes the parser once per row, which is much slower on a large
# frame and yields the same timestamps.
uber_data['Date/Time'] = pd.to_datetime(uber_data['Date/Time'])

In [None]:
# Look at the first rows again now that 'Date/Time' has been converted.
uber_data.head()

In [None]:
# Print the column dtypes and non-null counts of the frame.
uber_data.info()

## Let's take day of month and create a column out of it

In [None]:
# Day of month (1-31) taken with the vectorized .dt accessor instead of
# calling a Python lambda on every row.
uber_data['Day'] = uber_data['Date/Time'].dt.day

In [None]:
# Day of week as an integer (Monday=0 ... Sunday=6), the same convention as
# Timestamp.weekday(), computed with the vectorized .dt accessor.
uber_data['WeekDay'] = uber_data['Date/Time'].dt.weekday

In [None]:
# Hour of day (0-23) taken with the vectorized .dt accessor instead of a
# per-row lambda.
uber_data['hour'] = uber_data['Date/Time'].dt.hour

In [None]:
# Show the last rows to check the newly added 'Day', 'WeekDay' and 'hour' columns.
uber_data.tail()

## We now have all the additional features we need, so let's start the basic analysis.

## Analysis of Day of Month

In [None]:

# Histogram of pickups per day of month. August has 31 days, so use one
# unit-wide bin per day with edges at 0.5, 1.5, ..., 31.5. The previous
# 30 bins over (0.5, 30.5) silently discarded every pickup on the 31st,
# because values outside `range` are ignored by hist.
plt.figure(figsize=(10,6))
uber_data['Day'].hist(bins=31, rwidth=0.9, range=(0.5, 31.5))
plt.xlabel('Day of Month')
plt.ylabel('frequency')
plt.title('Uber - Daily Frequency - Aug 2014')

In [None]:
# Print one (day, number_of_pickups) tuple for each day-of-month group.
for day, day_rows in uber_data.groupby('Day'):
    day_count = (day, len(day_rows))
    print(day_count)

In [None]:
## Printing the groups one by one is not very useful, so we compute the per-day counts with a single groupby aggregation instead.

In [None]:
# Number of pickups for each day of the month, indexed by 'Day'.
# GroupBy.size() counts the rows of every group directly; it replaces the
# slower apply(lambda x: len(x)) and always returns a Series.
by_date = uber_data.groupby('Day').size()

In [None]:
# Display the pickup count for each day of the month.
by_date

In [None]:
# Line chart of the per-day pickup counts, drawn on an explicit 10x6 inch axes.
fig, ax = plt.subplots(figsize=(10, 6))
by_date.plot(ax=ax)

In [None]:
# Order the daily counts from the quietest day to the busiest one and show them.
by_date_sorted = by_date.sort_values(ascending=True)
by_date_sorted

In [None]:
# Bar chart of daily pickups ordered by count (ascending); every tick shows
# the day of month its bar belongs to.
# Bar and tick positions are derived from the data length. The earlier
# xticks(range(1,31), ...) supplied 30 positions for 31 labels, which either
# shifts every label by one bar or raises, depending on the matplotlib version.
positions = range(len(by_date_sorted))

plt.figure(figsize=(10,6))
plt.bar(positions, by_date_sorted)
plt.xlabel('Day of Month')
plt.ylabel('frequency')
plt.title('Uber - Daily Frequency - Aug 2014')
plt.xticks(positions, by_date_sorted.index)

As the bar plot above shows, the first day of August was the busiest for Uber trips, and the 21st was the runner-up. Maybe there is a weekly trend; we'll explore that as we move along in the analysis.

## Analysis of Hour

In [None]:
plt.figure(figsize=(10,6))

# One unit-wide bin per hour 0-23, centred on the integer hour value.
# With the previous range=(0, 25) the 24 bins were 25/24 wide, so hours 0 and
# 1 fell into the same first bin, every later hour was shifted one bin to the
# left, and the last bin stayed empty.
uber_data.hour.hist(bins=24, range=(-0.5, 23.5))

In [None]:
# Pickups per hour of day; GroupBy.size() counts rows per group directly and
# replaces the slower apply(lambda x: len(x)).
by_hour = uber_data.groupby('hour').size()
by_hour_sorted = by_hour.sort_values()

# Bars are ordered by count (ascending) and labelled with their hour.
# Positions come from the data length so bars and tick labels always line up,
# even if some hour has no pickups at all.
hour_positions = range(len(by_hour_sorted))

plt.figure(figsize=(10,6))
plt.bar(hour_positions, by_hour_sorted)
plt.xlabel('Hour of Day')
plt.ylabel('Frequency')
plt.title('Uber - Hourly - Thru Aug 2014')
plt.xticks(hour_positions, by_hour_sorted.index)


As seen in the bar chart, the busiest time of the day is 5 to 6 pm, which is the rush hour after work.

## Analysis of Weekday

In [None]:
# Histogram of pickups per weekday: seven unit-wide bins centred on the
# integer codes 0-6, with the ticks relabelled Monday through Sunday.
weekday_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

plt.figure(figsize=(10, 6))
plt.hist(uber_data['WeekDay'], bins=7, range=(-0.5, 6.5), rwidth=0.8)
plt.xticks(range(len(weekday_names)), weekday_names)


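As the histogram shows, New Yorkers take Uber most on Fridays. This holds only for the month in question, August 2014. Note that August 2014 contained five Fridays, Saturdays and Sundays but only four of every other weekday, so the raw totals favour those three days.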

## Cross Analysis of WeekDay and Hour 

In [None]:
# Pickup count for every (weekday, hour) pair. GroupBy.size() replaces the
# slower apply(lambda x: len(x)).
by_hour_week = uber_data.groupby(['WeekDay','hour']).size()

# Pivot the 'hour' level into columns to get a weekday x hour table.
# A combination with no pickups is a count of 0, not a missing value, so fill
# with 0 instead of leaving NaN.
# The triple-"e" spelling is kept because the heatmap cell reads this name.
by_hour_weeek = by_hour_week.unstack(fill_value=0)

by_hour_weeek

In [None]:
# Heatmap of the weekday x hour pickup table, drawn on an explicit 10x6 inch
# axes with seaborn's reversed "rocket" colormap and no cell annotations.
fig, ax = plt.subplots(figsize=(10, 6))
cmap = sns.cm.rocket_r
sns.heatmap(by_hour_weeek, cmap=cmap, annot=False, ax=ax)

The heatmap indicates that morning and afternoon hours are the busiest, as expected. People often use Uber as an alternative to public transportation when going to or leaving work.


However, it is interesting to see that people use Uber more on Friday afternoons than on the afternoons of other weekdays. Also, the early-morning hours on weekends are among the busiest time periods, indicating that people often use Uber to go out.

## Analysis of Location data points

In [None]:
# Print the extent of the pickup coordinates: for longitude and then
# latitude, the maximum value followed by the minimum value.
for coord in ('Lon', 'Lat'):
    column = uber_data[coord]
    print(column.max())
    print(column.min())

In [None]:
# Draw every pickup as a tiny dot (longitude on x, latitude on y) and restrict
# the view to the fixed window -74.2..-73.7 by 40.7..41.
plt.figure(figsize=(25, 15))

pickup_lon, pickup_lat = uber_data['Lon'], uber_data['Lat']
plt.plot(pickup_lon, pickup_lat, '.', ms=0.5)
plt.xlim(-74.2, -73.7)
plt.ylim(40.7, 41)


As the plot shows, almost all Uber trips originate in the Manhattan region.