## Análisis exploratorio - SF Bay Area Bike Share

In [None]:
%matplotlib inline

import datetime as datetime
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Start from matplotlib's stock style and use a wide default canvas for every figure.
plt.style.use('default')
plt.rc('figure', figsize=(15, 5))

In [None]:
# Load the trip records. low_memory=False asks pandas to infer each column's dtype
# from the whole file instead of chunk by chunk.
trips = pd.read_csv('../input/sf-bay-area-bike-share/trip.csv', low_memory=False)

trips.columns.values

# Quick overview: dimensions, which columns contain nulls, numeric summary statistics.
for overview in (trips.shape, trips.isnull().any(), trips.describe()):
    print(overview)

In [None]:
# Convert both timestamp columns from 'month/day/year hour:minute' text to datetimes.
for date_column in ('start_date', 'end_date'):
    trips[date_column] = pd.to_datetime(trips[date_column], format='%m/%d/%Y %H:%M')

trips.dtypes

In [None]:
# NOTE: this is an alias, not a copy -- every column added below also appears on `trips`.
trips_con_fecha = trips

# Split the start timestamp into its calendar and clock components.
start_index = pd.DatetimeIndex(trips_con_fecha['start_date'])
trips_con_fecha['year'] = start_index.year
trips_con_fecha['month'] = start_index.month
trips_con_fecha['day'] = start_index.day
trips_con_fecha['hour'] = start_index.hour
trips_con_fecha['minutes'] = start_index.minute

## Cantidad de trips por año

In [None]:
%matplotlib notebook
tri=trips_con_fecha.groupby(by='year').count()['id']
#tri=trips_con_fecha['anio'].value_counts().describe()
ax1=tri.plot(kind='bar',title='Cantidad por anio',color='green')
ax1.set_ylabel('Cantidad')
ax1.set_xlabel('Anio')

## Cantidad por dia de la semana

In [None]:
%matplotlib notebook
trips_con_fecha['day']= trips_con_fecha['start_date'].dt.dayofweek
tri=trips_con_fecha.groupby(by='day').count()['id']

ax2=tri.plot(kind='bar',title='Cantidad por dia de la semana',color='green')
ax2.set_ylabel('Cantidad')
ax2.set_xlabel('Dia')

## Cantidad por tipo de suscripcion

In [None]:
# Cantidad total de cada suscripcion
%matplotlib notebook
sub=trips_con_fecha['subscription_type'].value_counts()
su=sub.plot(kind='bar',title='Cantidad por suscripcion',fontsize= 8, color='green')
su.set_xlabel('Tipo de Suscripcion')
su.set_ylabel('Cantidad')
plt.xticks(rotation=0)

In [None]:
%matplotlib notebook
#sub=trips_con_fecha['hora'].value_counts()
sub=trips_con_fecha.groupby(by='hour').count()['id']
su2=sub.plot(kind='bar',title='Cantidad por hora',color='green')
su2.set_xlabel('Hora')
su2.set_ylabel('Cantidad')

## Duraciones en minutos

In [None]:
%matplotlib notebook
trips_con_fecha['duration_minutes']=trips_con_fecha['duration']/60

plt.figure(figsize = [9,7])
h = plt.hist(trips_con_fecha['duration_minutes'].values,range = [0,90],alpha = .5,bins=7,facecolor='green')
plt.title('Duraciones en minutos')
for spine in plt.gca().spines.values():
    spine.set_visible(False)

plt.xlabel('Duracion')
plt.ylabel('Frecuencia')
plt.legend()

## Porcentaje de bicicletas sacadas que no cumplieron el maximo de 30 minutos

In [None]:
# Percentage of all trips whose duration exceeds 1800 seconds (30 minutes).
duraciones = trips_con_fecha['duration']
viajes_largos = duraciones[duraciones > 1800].count()
val = viajes_largos / float(duraciones.count())
print(val*100)

## Porcentaje de bicicletas sacadas que no cumplieron el maximo de 30 minutos para 24-hour or 3-day membership

In [None]:
# Trips longer than 1800 s made by 'Customer' riders, as a percentage of ALL trips.
# NOTE(review): the denominator is the total trip count, not the number of Customer
# trips -- confirm this is the intended reading of the section title.
es_largo = trips_con_fecha['duration'] > 1800
es_customer = trips_con_fecha['subscription_type'] == 'Customer'
largos_customer = trips_con_fecha.loc[es_largo & es_customer, 'duration'].count()
val = largos_customer / float(trips_con_fecha['duration'].count())
print(val*100)

## Porcentaje de bicis sacadas que no cumplieron el maximo de 30 minutos para annual membership

In [None]:
# Trips longer than 1800 s made by 'Subscriber' riders, as a percentage of ALL trips.
# NOTE(review): the denominator is the total trip count, not the number of Subscriber
# trips -- confirm this is the intended reading of the section title.
es_largo = trips_con_fecha['duration'] > 1800
es_subscriber = trips_con_fecha['subscription_type'] == 'Subscriber'
largos_subscriber = trips_con_fecha.loc[es_largo & es_subscriber, 'duration'].count()
val = largos_subscriber / float(trips_con_fecha['duration'].count())
print(val*100)

## Ratio: cantidad de bicis devueltas / cantidad de bicis alquiladas en una estación

In [None]:
# Departures per station.
estacion_count = trips_con_fecha['start_station_name'].value_counts()

# Trips that end at the same station they started from, counted per station.
misma_estacion = trips_con_fecha['start_station_name'] == trips_con_fecha['end_station_name']
ES = trips_con_fecha.loc[misma_estacion, :]
trips_con_mismaE = ES['start_station_name'].value_counts()

# Fraction of each station's departures that come back to it; show the 20 largest.
ratioMISMAES = trips_con_mismaE / estacion_count
top20 = ratioMISMAES.sort_values(ascending=False).head(20)
top20

## Top 20 de las estaciones desde las cuales salen mas bicicletas

In [None]:
%matplotlib notebook

count_start_station = trips['start_station_id'].value_counts()
count_start_station

css = count_start_station[:20].plot('bar', title = 'Top 20 estaciones origen')
css.set_ylabel('Cantidad')
css.set_xlabel('Id estacion')

## Top 20 de las estaciones a las cuales llegan más bicicletas

In [None]:
%matplotlib notebook

count_end_station = trips['end_station_id'].value_counts()
count_end_station

ces = count_end_station[:20].plot('bar', title = 'Top 20 estaciones destino' )
ces.set_ylabel('Cantidad')
ces.set_xlabel('Id estacion')

## Las 10 estaciones con mas bicicletas de salida (de lunes a viernes)

In [None]:
#top_salidas_semana=trips_con_fecha[['date','start_station_name']]
# Los dias van de 0 a 6
semana=trips_con_fecha.loc[(trips_con_fecha['day']== 0) | (trips_con_fecha['day']== 1) |(trips_con_fecha['day']== 2) | (trips_con_fecha['day']== 3)| (trips_con_fecha['day']== 4),:]
semana_stats=semana['start_station_name'].value_counts()

In [None]:
top10_salida_lu_vie=semana_stats.sort_values(ascending=False)[:10]
top10_salida_lu_vie

## Las 10 estaciones con menos bicicletas de salida (de lunes a viernes)

In [None]:
# Ten stations with the fewest weekday departures (ascending order).
top10_lu_vie = semana_stats.sort_values(ascending=True).head(10)
top10_lu_vie

## Las 10 estaciones con mas bicicletas de salida (sabado y domingo)

In [None]:
# Weekday numbers 5 and 6 are Saturday and Sunday.
finde = trips_con_fecha.loc[trips_con_fecha['day'].isin([5, 6]), :]
finde_stats = finde['start_station_name'].value_counts()
# Ten stations with the most weekend departures.
top10 = finde_stats.sort_values(ascending=False).head(10)
top10

## Las 10 estaciones con menos bicicletas de salida (sabado y domingo)

In [None]:
# Ten stations with the fewest weekend departures (ascending order).
top10_Sab_Dom = finde_stats.sort_values(ascending=True).head(10)
top10_Sab_Dom

## Top 15 viajes más comunes según origen y destino

In [None]:
%matplotlib notebook

common_trips = trips[['start_station_id','end_station_id']]

common_trips.start_station_id = common_trips.start_station_id.astype(str)
common_trips.end_station_id = common_trips.end_station_id.astype(str)

common_trips['start_end'] = (common_trips.start_station_id + '-' + common_trips.end_station_id)

aux = common_trips['start_end'].value_counts()
ct = aux[:15].plot(kind='bar',title='Top 15 viajes mas comunes',color='green')
ct.set_ylabel('Cantidad')
ct.set_xlabel('Origen - Destino')

## Las 10 estaciones más populares de salida en día laboral en horario pico (7 a 9 y 16 a 18)

In [None]:
# Split the end timestamp into its calendar and clock components.
end_index = pd.DatetimeIndex(trips_con_fecha['end_date'])
trips_con_fecha['yearE'] = end_index.year
trips_con_fecha['monthE'] = end_index.month
trips_con_fecha['dayE'] = end_index.day
trips_con_fecha['hourE'] = end_index.hour
trips_con_fecha['minutesE'] = end_index.minute

# Working days only (Monday=0 ... Friday=4). The weekday is derived from start_date
# here so this cell does not depend on the 'day' column having been overwritten earlier.
es_dia_laboral = pd.DatetimeIndex(trips_con_fecha['start_date']).dayofweek < 5

# Peak windows are evaluated on the hour the trip ENDED (hours 7-9 and 16-18, inclusive).
# NOTE(review): the section is about departures -- confirm whether the start hour
# ('hour') was intended instead of 'hourE'.
pico_manana = trips_con_fecha['hourE'].between(7, 9)
pico_tarde = trips_con_fecha['hourE'].between(16, 18)

semana_hora_pico = trips_con_fecha.loc[es_dia_laboral & (pico_manana | pico_tarde), :]

# Ten departure stations with the most weekday peak-hour trips.
estacionesHorarioPico = semana_hora_pico['start_station_name'].value_counts()
top10estacionesHorarioPico = estacionesHorarioPico.sort_values(ascending=False).head(10)
top10estacionesHorarioPico

## Los 10 viajes más populares de día de semana en horario pico (7 a 9 y 16 a 18)

In [None]:
# Trips per (origin, destination) pair within the peak-hour subset.
ruta = ['start_station_name', 'end_station_name']
viajesPopulares = semana_hora_pico.groupby(ruta)['id'].count()
# Despite its name, `top20` keeps only the 10 most frequent routes.
top20 = viajesPopulares.sort_values(ascending=False).head(10)
top20

## Top10 de viajes con mayor promedio de duracion, de dia de semana y en horario pico

In [None]:
# Per-route duration statistics inside the peak-hour subset: trip count ('size'),
# mean and standard deviation. String aliases are used instead of np.size/np.mean/np.std
# because passing NumPy callables to groupby.agg is deprecated in recent pandas;
# the resulting column names ('size', 'mean', 'std') are the same.
info_viajes_Populares = (
    semana_hora_pico
    .groupby(['start_station_name', 'end_station_name'])['duration']
    .agg(['size', 'mean', 'std'])
)

# Keep only routes with more than 2000 trips, then show the 10 with the longest mean duration.
rutas_frecuentes = info_viajes_Populares.loc[info_viajes_Populares['size'] > 2000, :]
rutas_frecuentes.sort_values('mean', ascending=False).head(10)

## Top10 de viajes de dia de semana y en horario pico con duracion mas variable

In [None]:
# Routes with more than 2000 trips, ranked by the standard deviation of their duration.
info_viajes_Populares[info_viajes_Populares['size'] > 2000].sort_values('std', ascending=False).head(10)

## Los 10 viajes mas populares de fin de semana

In [None]:
# Weekday numbers 5 and 6 are Saturday and Sunday.
weekend = trips_con_fecha.loc[trips_con_fecha['day'].isin([5, 6]), :]
# Trips per (origin, destination) pair on weekends; keep the 10 most frequent.
weekend_popular_trips = weekend.groupby(['start_station_name', 'end_station_name'])['id'].count()
top10weekend = weekend_popular_trips.sort_values(ascending=False).head(10)
top10weekend

# Weather

# Dates with the highest temperatures:

In [None]:
# Read the weather file in 3000-row chunks and stitch the chunks back together.
chunks2 = pd.read_csv('../input/sf-bay-area-bike-share/weather.csv', sep=',', iterator=True, chunksize=3000)
weather = pd.concat(list(chunks2))

# Normalise the event labels: unify the lowercase 'rain' spelling and give
# days without any recorded event the explicit label "None".
weather['events'] = weather['events'].replace('rain', 'Rain').fillna("None")

# NOTE: weather2 is an alias of weather, not a copy.
weather2 = weather
print('El valor máximo de temperatura es:')
print(weather['max_temperature_f'].max())

In [None]:
max_temp_weather = weather.groupby(['max_temperature_f'], sort=True)['date'].max()
max_temp_weather = max_temp_weather.reset_index()
max_temp_weather.tail()

In [None]:
indexed_max_temp_weather = max_temp_weather.set_index(max_temp_weather['date'])
indexed_max_temp_weather.tail()

In [None]:
indexed_max_temp_weather_last5 = indexed_max_temp_weather.tail(5)

In [None]:
# DataFrame.sort() was removed in pandas 0.20; sort_values() is its replacement.
weather2.sort_values("max_temperature_f", ascending=False).head()

In [None]:
# Derive the hottest record from the data instead of hard-coding the date and value,
# so the message stays correct if the input file changes.
hottest_row = weather2.loc[weather2['max_temperature_f'].idxmax()]
print('El dia con mayor temperatura fue el {} con {:.0f}'.format(
    hottest_row['date'], hottest_row['max_temperature_f']))

In [None]:
%matplotlib notebook
plt.rcParams['figure.figsize'] = (10, 10)

su = indexed_max_temp_weather_last5.plot(kind='bar',title='Maximas temperaturas segun el dia',color='grey')

plt.xlabel('Dia')
plt.ylabel('Temperatura')

# Dates with the lowest temperatures:

In [None]:
# Smallest 'date' value recorded for each distinct minimum temperature.
fecha_por_min_temp = weather.groupby(['min_temperature_f'], sort=True)['date'].min()
print(fecha_por_min_temp)

In [None]:
# DataFrame.sort() was removed in pandas 0.20; sort_values() is its replacement.
weather.sort_values("min_temperature_f", ascending=True).head()

In [None]:
min_temp_weather = weather.groupby(['min_temperature_f'], sort=True)['date'].max()
min_temp_weather = min_temp_weather.reset_index()
min_temp_weather.head()

In [None]:
indexed_min_temp_weather = min_temp_weather.set_index(min_temp_weather['date'])
indexed_min_temp_weather.head()

In [None]:
indexed_min_temp_weather_first5 = indexed_min_temp_weather.head(5)
indexed_min_temp_weather_first5.head()

In [None]:
%matplotlib notebook
plt.rcParams['figure.figsize'] = (10, 10)

su = indexed_min_temp_weather_first5.plot(kind='bar',title='Minimas temperaturas segun su fecha',color='grey')

plt.xlabel('Fecha')
plt.ylabel('Temperatura')

In [None]:
# NOTE: alias, not a copy -- the columns added below also appear on `weather`.
weatherWithDates = weather
weatherWithDates['dates'] = pd.to_datetime(weather['date'], format='%m/%d/%Y')

# Calendar/clock components of the observation date (Spanish column names:
# anio=year, mes=month, dia=day, hora=hour, minutos=minute).
fecha_index = pd.DatetimeIndex(weather['date'])
weatherWithDates['anio'] = fecha_index.year
weatherWithDates['mes'] = fecha_index.month
weatherWithDates['dia'] = fecha_index.day
weatherWithDates['hora'] = fecha_index.hour
weatherWithDates['minutos'] = fecha_index.minute

print(weatherWithDates.describe())

In [None]:
weatherWithDates.head()

In [None]:
weather_2015 = weatherWithDates[weatherWithDates['anio'] == 2015]
weather_2015.head()
weather_2015['events'].unique()

In [None]:
# Rain - 2015

weather_2015_rain = weather_2015[weather_2015['events'] == 'Rain']
weather_2015_rain.head()

In [None]:
weather_2015_rain.shape # Hay 76 dias de lluvia en todo el 2015

In [None]:
weather_2015_rain_enero = weather_2015_rain[weather_2015_rain['mes'] == 1]

In [None]:
weather_2015_rain_enero.head()

In [None]:
weather_2015_rain_enero.shape #Llovio solo 1 dia en enero del 2015

In [None]:
weather_2015_rain_febrero = weather_2015_rain[weather_2015_rain['mes'] == 2]
weather_2015_rain_febrero.shape #Llovieron 17 dias en Febrero del 2015

In [None]:
weather_2015_rain_marzo = weather_2015_rain[weather_2015_rain['mes'] == 3]
weather_2015_rain_marzo.shape #Llovieron 15 dias en marzo 2015

In [None]:
weather_2015_rain_abril = weather_2015_rain[weather_2015_rain['mes'] == 4]
weather_2015_rain_abril.shape #Llovieron 15 dias en abril 2015

In [None]:
weather_2015_rain_mayo = weather_2015_rain[weather_2015_rain['mes'] == 5]
weather_2015_rain_mayo.shape #Llovieron 12 dias en mayo 2015

In [None]:
weather_2015_rain_junio = weather_2015_rain[weather_2015_rain['mes'] == 6]
weather_2015_rain_junio.shape #Llovieron 5 dias en junio 2015

In [None]:
weather_2015_rain_julio = weather_2015_rain[weather_2015_rain['mes'] == 7]
weather_2015_rain_julio.shape #Llovieron 8 dias en julio 2015

In [None]:
weather_2015_rain_agosto = weather_2015_rain[weather_2015_rain['mes'] == 8]
weather_2015_rain_agosto.shape #Llovieron 3 dias en agosto 2015

In [None]:
weather_2015_rain_septiembre = weather_2015_rain[weather_2015_rain['mes'] == 9]
weather_2015_rain_septiembre.shape #No llovio en septiembre 2015

In [None]:
weather_2015_rain_octubre = weather_2015_rain[weather_2015_rain['mes'] == 10]
weather_2015_rain_octubre.shape #No llovio en octubre 2015

In [None]:
weather_2015_rain_noviembre = weather_2015_rain[weather_2015_rain['mes'] == 11]
weather_2015_rain_noviembre.shape #No llovio en noviembre 2015

In [None]:
weather_2015_rain_diciembre = weather_2015_rain[weather_2015_rain['mes'] == 12]
weather_2015_rain_diciembre.shape #No llovio en diciembre 2015

In [None]:
weather_2015_months_rain = weather_2015_rain.groupby(by='mes')['mes'].count()

In [None]:
weather_2015_months_rain.head()

In [None]:
%matplotlib notebook
plt.rcParams['figure.figsize'] = (10, 10)

su = weather_2015_months_rain.plot(kind='bar',title='Cantidad de veces que llovio en cada mes del 2015',color='grey')

plt.ylabel('Cantidad de dias que llovio')
plt.xlabel('Mes')

In [None]:
# Rain Thunderstorm - 2015

weather_2015_rain_thunder = weather_2015[weather_2015['events'] == 'Rain-Thunderstorm']
weather_2015_rain_thunder.shape # Hubo 2 dias de tormenta en todo el 2015
weather_2015_rain_thunder.head() #Como son 2 no hace falta el head()

In [None]:
# Fog Rain - 2015

weather_2015_fog_rain = weather_2015[weather_2015['events'] == 'Fog-Rain']
weather_2015_fog_rain.shape # Hubo 3 dias de fog - rain en todo el 2015
weather_2015_fog_rain.head()

In [None]:
# Fog - 2015

weather_2015_fog = weather_2015[weather_2015['events'] == 'Fog']
weather_2015_fog.shape # Hubo 34 dias de fog en todo el 2015
weather_2015_fog.head()

In [None]:
weather_2015_months_fog = weather_2015_fog.groupby(by='mes')['mes'].count()
weather_2015_months_fog.head()

In [None]:
%matplotlib notebook
plt.rcParams['figure.figsize'] = (10, 10)

su = weather_2015_months_fog.plot(kind='bar',title='Cantidad de veces que hubo niebla en cada mes del 2015',color='grey')

plt.ylabel('Cantidad de dias que hubo niebla')
plt.xlabel('Mes')

## Cantidad de eventos climáticos registrados en cada ciudad

In [None]:
weatherSF = weather.loc[weather.zip_code == 94107,:]

%matplotlib notebook
sub_weatherSF = weatherSF['events'].value_counts()
sub_plot=sub_weatherSF.plot(kind='bar',title='Eventos registrados en la ciudad de San Francisco',color='orange')
sub_plot.set_xlabel('Evento')
sub_plot.set_ylabel('Cantidad')
plt.xticks(rotation=0)

In [None]:
weatherSJ = weather.loc[weather.zip_code == 95113,:]

%matplotlib notebook
sub_weatherSJ = weatherSJ['events'].value_counts()
sub_plot=sub_weatherSJ.plot(kind='bar',title='Eventos registrados en la ciudad de San Jose',color='orange')
sub_plot.set_xlabel('Evento')
sub_plot.set_ylabel('Cantidad')
plt.xticks(rotation=0)

In [None]:
weatherMV = weather.loc[weather.zip_code == 94041,:]

%matplotlib notebook
sub_weatherMV = weatherMV['events'].value_counts()
sub_plot=sub_weatherMV.plot(kind='bar',title='Eventos registrados en la ciudad de Mountain View',color='orange')
sub_plot.set_xlabel('Evento')
sub_plot.set_ylabel('Cantidad')
plt.xticks(rotation=0)

In [None]:
weatherRC = weather.loc[weather.zip_code == 94063,:]

%matplotlib notebook
sub_weatherRC = weatherRC['events'].value_counts()
sub_plot=sub_weatherRC.plot(kind='bar',title='Eventos registrados en la ciudad de Redwood City',color='orange')
sub_plot.set_xlabel('Evento')
sub_plot.set_ylabel('Cantidad')
plt.xticks(rotation=0)

In [None]:
weatherPA = weather.loc[weather.zip_code == 94301,:]

%matplotlib notebook
sub_weatherPA = weatherPA['events'].value_counts()
sub_plot=sub_weatherPA.plot(kind='bar',title='Eventos registrados en la ciudad de Palo Alto',color='orange')
sub_plot.set_xlabel('Evento')
sub_plot.set_ylabel('Cantidad')
plt.xticks(rotation=0)

In [None]:
weather.events.value_counts()

# Trips y Weather

In [None]:
weather[:2]
weather.isnull().sum()
weather.zip_code.unique()

In [None]:
# Para filtrar aquellos viajes que duran menos de 24 hs (menos de 86400 segundos)
trips_menos_1 = trips.loc[trips.duration < 86400,:]

trips_menos_1.tail(12)

In [None]:
# Year, month and day of month of each trip's start; these are the merge keys used
# later to attach the weather of that date.
inicio_index = pd.DatetimeIndex(trips_menos_1['start_date'])
trips_menos_1['year'] = inicio_index.year
trips_menos_1['month'] = inicio_index.month
trips_menos_1['day'] = inicio_index.day

trips_menos_1.head(15)

In [None]:
# Parse the observation date (month/day/year) in place.
weather['date'] = pd.to_datetime(weather['date'], format='%m/%d/%Y')

# Year, month and day of each observation, named to match the trip merge keys.
clima_index = pd.DatetimeIndex(weather['date'])
weather['year'] = clima_index.year
weather['month'] = clima_index.month
weather['day'] = clima_index.day

In [None]:
# Print the per-column null counts of each zip code, to see which one has the fewest
# missing values.
for zip_code in weather.zip_code.unique():
    nulos_por_columna = weather.loc[weather.zip_code == zip_code].isnull().sum()
    print (zip_code)
    print (nulos_por_columna)

In [None]:
# Me quedo con el zip code 94107 pues es el que presenta menos cantidad de valores nulos 
weather_94107 = weather.loc[weather['zip_code'] == 94107 , :]

weather_94107.head(10)

In [None]:
trips_weather_94107 = pd.merge(trips_menos_1,weather_94107,how='left',on=['year','month','day'])

trips_weather_94107.head(20)

## Correlacion entre duracion promedio en minutos y temperatura promedio en Fahrenheit

In [None]:
# Convierto la serie obtenida en un Dataframe para el plot
trips_weather_94107['duration_minutos']=trips_weather_94107['duration']/60
trip6965 = trips_weather_94107.loc[(trips_weather_94107['start_station_id'] == 69) & (trips_weather_94107['end_station_id'] == 65), :]

aux_df = trip6965.groupby('mean_temperature_f')['duration_minutos'].mean().to_frame()

aux_df.reset_index(inplace=True)
aux_df.columns = ['mean_temperature_f','mean_duration']
%matplotlib notebook

plt.figure(figsize = [9,7])

ax =sns.regplot(x=aux_df['mean_temperature_f'], y=aux_df['mean_duration'], data=aux_df)
ax.set_title('Duracion promedio por temperatura promedio')
plt.ylabel('Duracion promedio (minutos)')
plt.xlabel('Temperatura promedio (Farenheit)')

## Correlacion entre la velocidad promedio del viento y la duracion promedio del trip mas popular

In [None]:
aux_df2 = trip6965.groupby('mean_wind_speed_mph')['duration_minutos'].mean().to_frame()

aux_df2.reset_index(inplace=True)
aux_df2.columns = ['mean_wind_speed_mph','mean_duration']

%matplotlib notebook

plt.figure(figsize = [9,7])

bx =sns.regplot(x=aux_df2['mean_wind_speed_mph'], y=aux_df2['mean_duration'], data=aux_df2)
bx.set_title('Duracion promedio por velocidad promedio del viento')
plt.ylabel('Duracion promedio (minutos)')
plt.xlabel('Velocidad promedio del viento (mph)')

In [None]:
## Correlacion entre la velocidad maxima de las rafagas de viento y la duracion promedio en minutos para el trip mas popular

In [None]:
aux_df3 = trip6965.groupby('max_gust_speed_mph')['duration_minutos'].mean().to_frame()

aux_df3.reset_index(inplace=True)
aux_df3.columns = ['max_gust_speed_mph','mean_duration']

%matplotlib notebook

plt.figure(figsize = [9,7])

cx =sns.regplot(x=aux_df3['max_gust_speed_mph'], y=aux_df3['mean_duration'], data=aux_df3)
cx.set_title('Duracion promedio por maxima velocidad de las rafagas de viento')
plt.ylabel('Duracion promedio (minutos)')
plt.xlabel('Maxima velocidad de las rafagas de viento (mph)')

## Correlacion de precipitation inches y promedio de duracion en minutos para el trip mas popular

In [None]:
trip6965['precipitation_inches'] = pd.to_numeric(trip6965['precipitation_inches'], errors='coerce').fillna(0)

aux_df3 = trip6965.groupby('precipitation_inches')['duration_minutos'].mean().to_frame()

aux_df3.reset_index(inplace=True)
aux_df3.columns = ['precipitation_inches','mean_duration']

%matplotlib notebook

plt.figure(figsize = [9,7])

cx =sns.regplot(x=aux_df3['precipitation_inches'], y=aux_df3['mean_duration'], data=aux_df3)
cx.set_title('Duracion promedio y precipitacion')
plt.ylabel('Duracion promedio (minutos)')
plt.xlabel('Precipitation inches')

## Correlacion entre cloud cover y duracion promedio en minutos para el trip mas popular

In [None]:
aux_df3 = trip6965.groupby('cloud_cover')['duration_minutos'].mean().to_frame()

aux_df3.reset_index(inplace=True)
aux_df3.columns = ['cloud_cover','mean_duration']

%matplotlib notebook

plt.figure(figsize = [9,7])

cx =sns.regplot(x=aux_df3['cloud_cover'], y=aux_df3['mean_duration'], data=aux_df3)
cx.set_title('Duracion promedio y Cloud Cover')
plt.ylabel('Duracion promedio (minutos)')
plt.xlabel('Cloud Cover')

## Correlación entre las distintas variables del clima 

In [None]:
# For trips shorter than one day (weather taken from zip code 94107), show how strongly
# trip duration and the main weather variables are related. The colour encodes the
# absolute value of the pairwise correlation (see the colour bar).

fig, ax = plt.subplots(figsize=(16, 5))  # figure size in inches

columnas_clima = ['duration', 'mean_temperature_f', 'mean_dew_point_f', 'mean_humidity',
                  'mean_visibility_miles', 'mean_wind_speed_mph']
cor = trips_weather_94107.loc[:, columnas_clima].corr().abs()

# Zero the whole diagonal (every variable correlates perfectly with itself) so it does
# not dominate the colour scale. np.fill_diagonal covers all six entries and avoids
# list-of-arrays indexing, which newer NumPy versions interpret as a row selection.
cor_sin_diagonal = cor.to_numpy(copy=True)
np.fill_diagonal(cor_sin_diagonal, 0)
cor = pd.DataFrame(cor_sin_diagonal, index=cor.index, columns=cor.columns)

g = sns.heatmap(cor, ax=ax)
g.set_title('Correlacion entre las distintas variables del clima y la duracion')
g.set_xticklabels(g.get_xticklabels(), rotation=0)
g.set_yticklabels(g.get_yticklabels(), rotation=45)

In [None]:
# Load the station table.
station = pd.read_csv('../input/sf-bay-area-bike-share/station.csv')

# Quick overview: dimensions, which columns contain nulls, numeric summary statistics.
for overview in (station.shape, station.isnull().any(), station.describe()):
    print(overview)