# **IMPORTS**

In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
import plotly

In [None]:
# Set seaborn's global plotting style to 'darkgrid'.
sns.set(style='darkgrid')

def get_label(g):
  """Annotate every bar of a bar/count plot with its rounded height.

  Args:
    g: Axes-like object exposing ``patches`` (each providing ``get_x``,
      ``get_width`` and ``get_height``) and a ``text`` method.

  Each label is drawn in white, centred horizontally on its bar and placed
  at half the bar's height. Bars whose height is NaN are skipped, because
  ``round(nan)`` raises ``ValueError``.
  """
  for p in g.patches:
    height = p.get_height()
    # NaN is the only value that is not equal to itself.
    if height != height:
      continue
    g.text(p.get_x()+p.get_width()/2.,
            height/2,
            '{}'.format(round(height)),
            ha="center", color='white')

# **LOADING DATA**

In [None]:
# Read the three per-city trip files, in the order New York, Washington, Chicago.
new_york, washington, chicago = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/explore-bike-share-data/new_york_city.csv',
        '../input/explore-bike-share-data/washington.csv',
        '../input/explore-bike-share-data/chicago.csv',
    )
)

In [None]:
new_york.head()

In [None]:
# Before combining the datasets, tag each one with the city it belongs to.
# A scalar assignment broadcasts to every row, so no per-row apply is needed.
new_york['location'] = 'New York'
washington['location'] = 'Washington, D.C.'
chicago['location'] = 'Chicago'

In [None]:
# Stack the three city tables row-wise. Each table holds its own trips (and now
# carries a distinct `location`), so this is a concatenation rather than a
# key-based join. Columns that a city does not provide are filled with NaN.
df_all = pd.concat([new_york, washington], ignore_index=True)
df = pd.concat([chicago, df_all], ignore_index=True)

# **DATA DESCRIPTION**

In [None]:
df.sample(5)

## **Data Dimension**

In [None]:
# Report the size of the combined frame.
n_rows, n_cols = df.shape
print('number of rows: {}'.format(n_rows))
print('number of cols: {}'.format(n_cols))

## **DataTypes**

In [None]:
# Parse the raw timestamp text into datetime columns (source column -> new column).
for raw_col, parsed_col in (('Start Time', 'Start_date'), ('End Time', 'End_date')):
    df[parsed_col] = pd.to_datetime(df[raw_col])

In [None]:
# Remove the CSV index column and the raw timestamp text, which the parsed
# Start_date / End_date columns replace.
df.drop(columns=['Unnamed: 0', 'Start Time', 'End Time'], inplace=True)

In [None]:
# Put the columns in a fixed order: city, timestamps, duration, stations, rider info.
ordered_columns = [
    'location',
    'Start_date', 'End_date',
    'Trip Duration',
    'Start Station', 'End Station',
    'User Type', 'Gender', 'Birth Year',
]
df = df[ordered_columns]

## **Rename Columns**

In [None]:
# New column names, given in the same positional order as the current columns;
# they are lower-cased before being applied.
cols = ['location', 'Start_date', 'End_date', 'Trip_Duration', 'Start_station', 'End_Station', 'User_Type', 'Gender','Birth_Year']
df.columns = [name.lower() for name in cols]

## **Check NA**

In [None]:
df.isna().sum()

In [None]:
df.head()

## **Change Types**

In [None]:
df.sample()

In [None]:
df.dtypes

In [None]:
# Make sure both timestamp columns are datetime. They were already parsed with
# pd.to_datetime when first created, so this simply re-applies the conversion.
for ts_col in ('start_date', 'end_date'):
    df[ts_col] = pd.to_datetime(df[ts_col])

# **FEATURE ENGINEERING**

In [None]:
# Work on a copy so the feature columns added below do not modify `df`.
df1 = df.copy()

In [None]:
# Month number (1-12) -> short month label.
month_map = dict(enumerate(
    ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec'],
    start=1,
))

# Label each trip with the month in which it started.
df1['month_Trip'] = df1['start_date'].dt.month.map(month_map)

In [None]:
# Weekday number (0 = Monday ... 6 = Sunday, the pandas `dt.weekday` convention)
# -> weekday name. Bound to `day_week` only, so `month_map` keeps its month labels.
day_week = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}

In [None]:
# Name of the weekday on which each trip started.
start_weekday = df1['start_date'].dt.dayofweek
df1['day_week'] = start_weekday.map(day_week)

In [None]:
# Combine start and end station into a single "<start> at <end>" label so that
# complete trips can be compared. Vectorized string concatenation avoids a
# Python-level join per row, and a missing station yields NaN instead of
# raising a TypeError.
df1['full_trip'] = df1['start_station'] + ' at ' + df1['end_station']

# **EXPLORATORY DATA ANALYSIS**

In [None]:
# Separate copy of the feature-engineered frame for the exploratory analysis.
df2 = df1.copy()

In [None]:
df2.sample()

## **Popular times of travel (i.e., occurs most often in the start time)**

1. most common month
2. most common day of week
3. most common hour of day

In [None]:
# Trips per month for each city. Bars are drawn in calendar order (restricted to
# the months that actually occur) instead of order of first appearance.
calendar_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sept', 'Oct', 'Nov', 'Dec']
months_present = set(df2['month_Trip'].dropna())
fig, ax = plt.subplots(figsize=(15, 9))
sns.countplot(x=df2['month_Trip'], hue=df2['location'],
              order=[m for m in calendar_months if m in months_present], ax=ax)
ax.set(title='Trips per month by city', xlabel='Month', ylabel='Trips');

In [None]:
# Trips per weekday for each city, with bars in Monday-to-Sunday order instead
# of order of first appearance.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
fig, ax = plt.subplots(figsize=(15, 9))
sns.countplot(x=df2['day_week'], hue=df2['location'], order=weekday_order, ax=ax)
ax.set(title='Trips per day of week by city', xlabel='Day of week', ylabel='Trips');

In [None]:
# Trips per start hour for each city. The axis is labelled explicitly because
# the default label would be the column name ('start_date'), not "hour".
fig, ax = plt.subplots(figsize=(20, 9))
sns.countplot(x=df2['start_date'].dt.hour, hue=df2['location'], ax=ax)
ax.set(title='Trips by start hour and city', xlabel='Hour of day (trip start)', ylabel='Trips');

**The busiest hours are in the morning (between 6 and 8 am) and in the afternoon (between 4 pm and 6 pm), which are known as the peak times.**

In [None]:
# Trips per end hour for each city. The axis is labelled explicitly because
# the default label would be the column name ('end_date'), not "hour".
fig, ax = plt.subplots(figsize=(20, 9))
sns.countplot(x=df2['end_date'].dt.hour, hue=df2['location'], ax=ax)
ax.set(title='Trips by end hour and city', xlabel='Hour of day (trip end)', ylabel='Trips');

## **USER INFO**

1. counts of each user type
2. counts of each gender (only available for NYC and Chicago)
3. earliest, most recent, most common year of birth (only available for NYC and Chicago)


In [None]:
df.sample()

In [None]:
# Keep every row whose city is not Washington, D.C.
mask_df = df2['location'].ne('Washington, D.C.')
df_nychicago = df2.loc[mask_df]

In [None]:
# Number of trips per user type, split by city.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(data=df2, x='user_type', hue='location', ax=ax)
ax.set_xlabel("User Type")
ax.set_title('Counts of each user type')

In [None]:
# Number of trips per gender, split by city, with the count written on each bar.
fig, ax = plt.subplots(figsize=(15, 5))
g = sns.countplot(data=df_nychicago, x='gender', hue='location', ax=ax)
g.set_xlabel("Gender")
g.set_title('Counts of each gender')
get_label(g)

In [None]:
df_nychicago['birth_year'].min()

In [None]:
# Birth-year distribution, one box per city. `location` is used as the
# categorical axis rather than as `hue` with no second axis, so the per-city
# split does not depend on how the installed seaborn treats `hue` when only
# `x` is given. NOTE(review): confirm against the seaborn version in use.
fig, ax = plt.subplots(figsize=(15, 5))
sns.boxplot(x='birth_year', y='location', data=df_nychicago, ax=ax)
ax.set(title='Birth year distribution by city', xlabel="Birth year", ylabel='City');

THE BIRTH YEAR DATA HAS MANY OUTLIERS

**I THINK IT DOESN'T MAKE SENSE TO EVALUATE THE TRIPS OF THE 3 CITIES TOGETHER, SO I WILL DO A SEPARATE ANALYSIS FOR EACH CITY**

## **Popular stations and trip**

1. most common start station
2. most common end station
3. most common trip from start to end (i.e., most frequent combination of start station and end station)

# **NEW YORK**

In [None]:
# Keep only the New York trips.
mask_ny = df2['location'].eq('New York')
df_ny = df2.loc[mask_ny]

In [None]:
df_ny.sample()

In [None]:
# Ten most frequent start stations, end stations and start/end pairs in New York.
freq_start, freq_end, freq_full_trip = (
    df_ny.groupby(col)[col].count().nlargest(10)
    for col in ("start_station", "end_station", "full_trip")
)

In [None]:
# Bar chart of the ten busiest New York start stations, with counts on the bars.
g = freq_start.plot.bar(figsize=(15, 5))
g.set_title('10 Start Station most common - New York')
g.set_xlabel('Start Station')
g.set_ylabel("Trips")
get_label(g)

In [None]:
# Bar chart of the ten busiest New York end stations, with counts on the bars.
g = freq_end.plot.bar(figsize=(15, 5))
g.set_title('10 End Station most common - New York')
g.set_xlabel('End Station')
g.set_ylabel("Trips")
get_label(g)

In [None]:
# Bar chart of the ten most frequent New York start/end pairs, with counts on the bars.
g = freq_full_trip.plot.bar(figsize=(15, 5))
g.set_title('10 full trip most common - New York')
g.set_xlabel('Full Trip')
g.set_ylabel("Trips")
get_label(g)

In [None]:
# Mean New York trip duration in minutes (the /60 assumes trip_duration is in
# seconds). `.mean()` ignores missing durations in both numerator and
# denominator, whereas sum()/len() would still count those rows; it also
# matches how the other two cities are computed.
avarege_travel_ny = df_ny['trip_duration'].mean() / 60
print('Avarege Travel time: {} minutes'.format(round(avarege_travel_ny)))

# **WASHINGTON, D.C.**

In [None]:
# Keep only the Washington, D.C. trips.
mask_Wdc = df2['location'].eq('Washington, D.C.')
df_wdc = df2.loc[mask_Wdc]

In [None]:
df_wdc.sample()

In [None]:
# Ten most frequent start stations, end stations and start/end pairs in Washington, D.C.
freq_start_wdc, freq_end_wdc, freq_full_trip_wdc = (
    df_wdc.groupby(col)[col].count().nlargest(10)
    for col in ("start_station", "end_station", "full_trip")
)

In [None]:
# Bar chart of the ten busiest Washington, D.C. start stations, with counts on the bars.
g = freq_start_wdc.plot.bar(figsize=(15, 5))
g.set_title('10 Start Station most common - Washington, D.C.')
g.set_xlabel('Start Station')
g.set_ylabel("Trips")
get_label(g)

In [None]:
# Bar chart of the ten busiest Washington, D.C. end stations, with counts on
# the bars. The x axis shows end stations, so it is labelled 'End Station'
# (it was previously labelled 'Start Station').
g = freq_end_wdc.plot(kind='bar',figsize=[15,5])
plt.title('10 End Station most common - Washington, D.C.')
plt.xlabel('End Station')
plt.ylabel("Trips")
get_label(g)

In [None]:
# Bar chart of the ten most frequent Washington, D.C. start/end pairs, with counts on the bars.
g = freq_full_trip_wdc.plot.bar(figsize=(15, 5))
g.set_title('10 full trip most common - Washington, D.C.')
g.set_xlabel('Full Trip')
g.set_ylabel("Trips")
get_label(g)

In [None]:
# Mean Washington, D.C. trip duration in minutes (the /60 assumes trip_duration
# is in seconds).
avarege_travel_wdc = df_wdc['trip_duration'].mean() / 60
print('Avarege Travel time: {} minutes'.format(round(avarege_travel_wdc)))

# **CHICAGO**

In [None]:
# Keep only the Chicago trips.
mask_ccg = df2['location'].eq('Chicago')
df_chicago = df2.loc[mask_ccg]

In [None]:
# Ten most frequent start stations, end stations and start/end pairs in Chicago.
freq_start_ccg, freq_end_ccg, freq_full_trip_ccg = (
    df_chicago.groupby(col)[col].count().nlargest(10)
    for col in ("start_station", "end_station", "full_trip")
)

In [None]:
# Bar chart of the ten busiest Chicago start stations, with counts on the bars.
g = freq_start_ccg.plot.bar(figsize=(15, 5))
g.set_title('10 Start Station most common - Chicago')
g.set_xlabel('Start Station')
g.set_ylabel("Trips")
get_label(g)

In [None]:
# Bar chart of the ten busiest Chicago end stations, with counts on the bars.
g = freq_end_ccg.plot.bar(figsize=(15, 5))
g.set_title('10 End Station most common - Chicago')
g.set_xlabel('End Station')
g.set_ylabel("Trips")
get_label(g)

In [None]:
# Bar chart of the ten most frequent Chicago start/end pairs, with counts on the bars.
g = freq_full_trip_ccg.plot.bar(figsize=(15, 5))
g.set_title('10 Full Trip most common - Chicago')
g.set_xlabel('Full Trip')
g.set_ylabel("Trips")
get_label(g)

In [None]:
# Mean Chicago trip duration in minutes (the /60 assumes trip_duration is in
# seconds).
avarege_travel_ccg = df_chicago['trip_duration'].mean() / 60
print('Avarege Travel time: {} minutes'.format(round(avarege_travel_ccg)))

**I'm a beginner, forgive the mistakes.**