**Please comment for any suggestions or disagreement and upvote if you like my analysis**.  

In [None]:
#import required libraries
import numpy as np

In [None]:
import pandas as pd

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
import matplotlib.cm
import matplotlib.dates as madates
plt.style.use('ggplot')

In [None]:
import seaborn as sns

In [None]:
from datetime import datetime

In [None]:
#package to be used for Basemap

from mpl_toolkits.basemap import Basemap
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
from matplotlib.colors import Normalize


In [None]:
# packages to be used for plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [None]:
init_notebook_mode(connected=True)

In [None]:
import cufflinks as cf

In [None]:
cf.go_offline()

In [None]:
#Load the dataset with Stations information

station = pd.read_csv('../input/station_info.csv')

In [None]:
station.head()

In [None]:
station.info()

In [None]:
station.describe()

In [None]:
# Count missing values per column.
# isnull() yields a boolean frame, so summing it gives the number of nulls
# in each column (indexing the frame with the mask and summing would always
# report 0, whether or not nulls are present).
station.isnull().sum()

There are no null values in the station dataframe.

# BART Station Locations on map

In [None]:
# Split variable 'Location' into Longitude and Latitude
# Pass values to map plotting function to create a map

location = station['Location']
def extractLonLat(location):
    """Split comma-separated coordinate strings into longitude and latitude.

    Parameters
    ----------
    location : pandas.Series of str
        Strings of the form ``"<lon>,<lat>[,<extra>...]"``. The first field is
        read as the longitude and the second as the latitude; any further
        fields are ignored.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        ``(longitude, latitude)`` as float Series sharing the input's index.

    Raises
    ------
    ValueError
        If one of the first two fields cannot be converted to float.
    """
    fields = location.str.split(',')
    longitude = fields.str.get(0).astype(float)
    latitude = fields.str.get(1).astype(float)
    return longitude, latitude

def plot_data(lon, lat):
    """Draw a Mercator basemap and mark every (lon, lat) pair on it.

    Parameters
    ----------
    lon : iterable of float
        Longitudes in degrees.
    lat : iterable of float
        Latitudes in degrees, paired with ``lon`` by position.

    The map extent is the fixed bounding box hard-coded in the Basemap call
    below; points outside that box are not visible.
    """
    fig, ax = plt.subplots(figsize=(10, 20))

    m = Basemap(resolution='c',
                projection='merc',
                lat_0=37.75, lon_0=-122.42,
                llcrnrlon=-122.534855, llcrnrlat=37.513119,
                urcrnrlon=-121.653202, urcrnrlat=38.053082)
    m.drawmapboundary(fill_color='#46bcec')
    m.fillcontinents(color='#f2f2f2', lake_color='aqua')
    m.drawcoastlines()

    # Walk the coordinates that were actually passed in rather than assuming
    # a fixed count of 46 points labelled 0..45.
    for point_lon, point_lat in zip(lon, lat):
        # Project geographic degrees into map coordinates before plotting.
        x, y = m(point_lon, point_lat)
        m.plot(x, y, marker='D', color='m')
   


def main():
    """Parse the station coordinates and draw them on the map."""
    lon, lat = extractLonLat(location)
    plot_data(lon, lat)


if __name__ == '__main__':
    main()




# Year2016 Exploratory Analysis

In [None]:
year2016 = pd.read_csv('../input/date-hour-soo-dest-2016.csv')

In [None]:
year2016.head()

In [None]:
year2016.info()

In [None]:
year2016.describe()

In [None]:
categories = year2016.dtypes[year2016.dtypes == 'object'].index
year2016[categories].describe()

In [None]:
#Check for any missing values
year2016.isnull().sum()

There is no missing/null value.

In [None]:
# Parse 'DateTime' into real timestamps, then derive one column per
# calendar component from it.
year2016['DateTime'] = pd.to_datetime(year2016['DateTime'])
stamp = year2016['DateTime'].dt
year2016['Year'] = stamp.year
year2016['Month'] = stamp.month_name()
year2016['Day of Month'] = stamp.day
year2016['Day of Week'] = stamp.day_name()
year2016['Time/Hour'] = stamp.hour


In [None]:
year2016.head()

**Busiest stations based on Travellers on Yearly basis**

**Origin and Throughput**

In [None]:
#Get theme for cufflinks
cf.getThemes()

In [None]:
# Series o_df: total Throughput per Origin station.
# 'Throughput' is selected before aggregating so only that column is summed;
# summing the whole frame first would also try to aggregate the datetime and
# string columns, which recent pandas versions no longer silently drop.
o_df = year2016.groupby('Origin')['Throughput'].sum()
o_df

In [None]:
# Set theme and plot graph
cf.set_config_file(theme='henanigans')
o_df.iplot(kind='bar')

 **Destination and Throughput**

In [None]:
# Series df_des_sum: total Throughput per Destination station.
# 'Throughput' is selected before aggregating so only that column is summed;
# summing the whole frame first would also try to aggregate the datetime and
# string columns, which recent pandas versions no longer silently drop.
df_des_sum = year2016.groupby('Destination')['Throughput'].sum()

In [None]:
# Set theme and plot graph
cf.set_config_file(theme='pearl')
df_des_sum.iplot(kind ='bar')

**Busiest stations based on BART train trips from one station to another**

In [None]:
# Series df_count: number of Throughput records per Origin station
df_count = year2016.groupby('Origin')['Throughput'].count()

In [None]:
# Set theme and plot graph
cf.set_config_file(theme='henanigans')
df_count.iplot(kind='bar')

**Destination and Throughput**

In [None]:
# Series df_des_cnt: number of Throughput records per Destination station
df_des_cnt = year2016.groupby('Destination')['Throughput'].count()

In [None]:
# Set theme and plot graph
cf.set_config_file(theme='pearl')
df_des_cnt.iplot(kind='bar')

**Busiest Months in 2016 based on Throughput**

In [None]:
# Total Throughput per month name. Select the column before summing so the
# datetime/string columns are never aggregated.
m_df = year2016.groupby('Month')['Throughput'].sum()

In [None]:
# Set theme and plot graph
cf.set_config_file(theme='henanigans')
m_df.iplot(kind='bar')

**Busiest Days in 2016 based on Throughput**

In [None]:
# Total Throughput per day of the week. Select the column before summing so
# the datetime/string columns are never aggregated.
df_day_sum = year2016.groupby('Day of Week')['Throughput'].sum()

In [None]:
# Set theme and plot graph
cf.set_config_file(theme='henanigans')
df_day_sum.iplot(kind='bar')

**As expected, weekdays are busier than weekends. Wednesday is the busiest day of the week, but the differences among the other weekdays are slight.**

**How many people take the BART late at night?**


In [None]:
# Total Throughput for each hour of the day (0-23). Select the column before
# summing so the datetime/string columns are never aggregated.
df_time_sum = year2016.groupby('Time/Hour')['Throughput'].sum()

In [None]:
cf.set_config_file(theme='henanigans')
df_time_sum.iplot(kind='bar')

As expected, people travel the most during office hours. Traffic peaks around 8:00 am, slowly decreases until 10:00 am, and remains roughly constant until 3:00 pm. It peaks again in the evening between 5:00 pm and 6:00 pm and then slowly decreases as the sun sets.
On average, around 6891570 people travel between 10:00 pm and 3:00 am.

**Which is the best time to go to SF from WDUB?**

In [None]:
# Scenario: travelling from WDUB to EMBR (SF) between 7:00 am and 5:00 pm,
# which hour is the quietest? Start by keeping only the WDUB -> EMBR records.
df_dub_SF16 = year2016.loc[
    year2016['Origin'].eq('WDUB') & year2016['Destination'].eq('EMBR')
]

In [None]:
# Hourly WDUB -> EMBR throughput for the hours 7 through 17 (between() is
# inclusive at both ends). 'Throughput' is selected before summing so the
# datetime/string columns are never aggregated.
(
    df_dub_SF16[df_dub_SF16['Time/Hour'].between(7, 17)]
    .groupby('Time/Hour')['Throughput']
    .sum()
    .plot(kind='bar')
)

If you are commuting from WDUB to EMBR (San Francisco) between 7:00 am and 5:00 pm and want to avoid traffic, the best time is 3:00 pm (hour 15 on the chart).

# Year2017 Exploratory Analysis

In [None]:
year2017 = pd.read_csv('../input/date-hour-soo-dest-2017.csv')

In [None]:
 categories = year2017.dtypes[year2017.dtypes == 'object'].index
year2017[categories].describe()

In [None]:
year2017.describe()

In [None]:
# Count missing values per column.
# isnull() yields a boolean frame, so summing it gives the number of nulls
# in each column (indexing the frame with the mask and summing would always
# report 0, whether or not nulls are present).
year2017.isnull().sum()

In [None]:
# Parse 'DateTime' into real timestamps, then derive one column per
# calendar component from it.
year2017['DateTime'] = pd.to_datetime(year2017['DateTime'])
stamp = year2017['DateTime'].dt
year2017['Year'] = stamp.year
year2017['Month'] = stamp.month_name()
year2017['Day of Month'] = stamp.day
year2017['Day of Week'] = stamp.day_name()
year2017['Time/Hour'] = stamp.hour

In [None]:
year2017.head()

**Busiest stations based on Travellers on Yearly basis**


In [None]:
# Total 2017 throughput per Origin station, as a vertical bar chart
year2017.groupby('Origin')['Throughput'].sum().plot.bar(figsize=(10, 8), colormap='PiYG')
plt.title('Busiest BART stations')


In [None]:
# Total 2017 throughput per Destination station, as a vertical bar chart
year2017.groupby('Destination')['Throughput'].sum().plot.bar(figsize=(10, 8), colormap='Set2')
plt.title('Busiest BART stations')

# Busiest BART stations based on train trips from one station to another

In [None]:
# Number of 2017 throughput records per Origin station, as a horizontal bar chart
year2017.groupby('Origin')['Throughput'].count().plot.barh(figsize=(12, 10), colormap='Spectral')
plt.title('Busiest BART stations based on Train Trips')

In [None]:
# Number of 2017 throughput records per Destination station, as a horizontal bar chart
year2017.groupby('Destination')['Throughput'].count().plot.barh(figsize=(12, 10), colormap='Set2')
plt.title('Busiest BART stations based on Train Trips')

**Busiest Months in 2017 based on Throughput**

In [None]:
# Total 2017 throughput per month name
year2017.groupby('Month')['Throughput'].sum().plot.bar(figsize=(10, 8))

**2017 dataset has only 5 months data from which March is the busiest month in terms of BART travellers**

**Busiest Days in 2017 dataset**

In [None]:
# Total 2017 throughput per day of the week
year2017.groupby('Day of Week')['Throughput'].sum().plot.bar(figsize=(10, 8))

In [None]:
# Total 2017 throughput for each hour of the day (table view).
# 'Throughput' is selected before summing so the datetime/string columns are
# never aggregated.
year2017.groupby('Time/Hour')['Throughput'].sum()

In [None]:
# Total 2017 throughput for each hour of the day (bar chart view)
year2017.groupby('Time/Hour')['Throughput'].sum().plot(kind='bar', figsize=(10, 8))

**Insights/Conclusions**

**Year2016**

- Based on Origin, the busiest station is MONT, followed by EMBR.
- Based on Destination, the busiest station is EMBR, followed by MONT.
- Based on train trips, POWL is the busiest as both Origin and Destination. (This approach is used to show the number of times a train has been through various Origins and Destinations.)
- Busiest month: AUGUST; slowest: DECEMBER.
- Busiest day: Wednesday; slowest: Sunday.

**Year2017**

- Based on Origin, the busiest station is MONT, followed by EMBR.
- Based on Destination, the busiest station is EMBR, followed by MONT.
- Based on train trips, POWL is the busiest as both Origin and Destination.
- Busiest month: MARCH; slowest: FEBRUARY.
- Busiest day: Wednesday; slowest: Sunday.