In [None]:
#Importing libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output

In [None]:
%matplotlib inline
import random
import datetime
import pandas as pd
import matplotlib.pyplot as plt
import statistics
import numpy as np
import scipy
from scipy import stats
import seaborn
import warnings
warnings.filterwarnings(action="ignore")

In [None]:
# Load the trip records, skipping any row the CSV parser cannot tokenize.
# `error_bad_lines` was removed from read_csv in pandas 2.0 in favour of
# `on_bad_lines` (added in pandas 1.3), so try the current keyword first and
# fall back to the legacy one on installs that predate it.
try:
    data = pd.read_csv('../input/cycle-share-dataset/trip.csv', on_bad_lines='skip')
except TypeError:
    # Older pandas rejects the unknown `on_bad_lines` keyword with a TypeError.
    data = pd.read_csv('../input/cycle-share-dataset/trip.csv', error_bad_lines=False)
data.head()

In [None]:
# Number of trip rows that were loaded into `data`.
len(data)

In [None]:
# `starttime` is text in month/day/year hour:minute form (the same format the
# time-series cell parses it with), so sorting the raw column would order rows
# alphabetically rather than chronologically. Parse it once and reorder the
# rows by the parsed timestamps instead; the column itself stays as text.
start_timestamps = pd.to_datetime(data['starttime'], format='%m/%d/%Y %H:%M')
chronological_positions = np.argsort(start_timestamps.values, kind='stable')

# Assign the re-indexed frame back (reset_index is not in-place) and drop the
# old labels so that position 0 / len-1 really are the first / last trip.
data = data.iloc[chronological_positions].reset_index(drop=True)

# .ix no longer exists in pandas; after the reset, label and position agree,
# so .loc on 0 and len(data)-1 picks the earliest and latest started trips.
print('Date range of dataset: %s - %s' % (data.loc[0, 'starttime'], data.loc[len(data) - 1, 'stoptime']))

**UNIVARIATE ANALYSIS**


Plotting Distribution by User-Type

In [None]:
# Count the trips recorded for each user type, then draw the counts as bars.
groupby_user = data.groupby('usertype').size()
groupby_user.plot(kind='bar', title='Distribution of user types ')

The plot above shows that members take more trips than short-term pass holders.

In [None]:
# Count the trips recorded for each gender value, then draw the counts as bars.
groupby_gender = data.groupby('gender').size()
groupby_gender.plot(kind='bar', title='Distribution of Genders ')

 Males seem to
dominate the trips taken as part of the program

In [None]:
# Plotting the distribution of birth years.
# groupby() returns its group keys in sorted order by default, so the bars come
# out in ascending birth year without re-sorting (and overwriting) the shared
# `data` frame, which would discard the row order set by the earlier sort.
groupby_birthyear = data.groupby('birthyear').size()
groupby_birthyear.plot.bar(title='Distribution of birth years', figsize=(15, 4))

 The majority of the people who subscribed to this program were
born between the early 1980s and the mid-to-late 1990s,
a cohort also known as millennials.

In [None]:
# Plot how often each member type occurs among riders born 1977-1994 (inclusive)
born_1977_or_later = data['birthyear'] >= 1977
born_1994_or_earlier = data['birthyear'] <= 1994
data_mil = data[born_1977_or_later & born_1994_or_earlier]

groupby_mil = data_mil.groupby('usertype').size()
groupby_mil.plot(kind='bar', title='Distribution of user types')

**Multivariate Analysis**

In [None]:
# Trip counts per (birth year, gender) pair; unstacking moves each gender value
# into its own column and pairs with no trips are filled with 0.
groupby_birthyear_gender = (
    data.groupby(['birthyear', 'gender'])['birthyear']
    .count()
    .unstack('gender')
    .fillna(0)
)

# Fix the column order so the stacked segments are always Male, Female, Other.
gender_columns = ['Male', 'Female', 'Other']
groupby_birthyear_gender[gender_columns].plot(
    kind='bar',
    title='Distribution of birth years by Gender',
    stacked=True,
    figsize=(15, 4),
)


We at first transformed the data frame by unstacking, that is, splitting,
the gender column into three columns, that is, Male, Female, and Other.
This meant that for each of the birth years we had the trip count for all
three gender types. Finally, a stacked bar graph was created by using this
transformed data frame.

It seemed as if males were dominating the distribution. It made sense as well. No?
Well, it did; as seen earlier, that majority of the trips were availed by males, hence this
skewed the distribution in favor of males. However, subscribers born in 1947 were all
females. Moreover, those born in 1964 and 1994 were dominated by females as well.

In [None]:
# Birth-year distribution split by user type: count trips per
# (birth year, user type) pair and spread the user types into columns.
groupby_birthyear_users = (
    data.groupby(['birthyear', 'usertype'])['birthyear']
    .count()
    .unstack('usertype')
    .fillna(0)
)

# Only the 'Member' column is drawn.
member_trips_by_birthyear = groupby_birthyear_users['Member']
member_trips_by_birthyear.plot(
    kind='bar',
    title='Distribution of birth years by User Types',
    stacked=True,
    figsize=(15, 4),
)


In [None]:
# Check whether birth year is missing on every short-term pass holder row.
is_short_term = data['usertype'] == 'Short-Term Pass Holder'
data.loc[is_short_term, 'birthyear'].isnull().values.all()

We first sliced the data frame to keep only the short-term pass holders, and then checked whether every value in the birth year column is missing for this slice. This indicates that birth year is only available for members. Members have to provide details like birth year when applying for the membership,
something which is not a prerequisite for short-term pass holders.

In [None]:
# Check whether gender is missing on every short-term pass holder row.
short_term_rows = data['usertype'] == 'Short-Term Pass Holder'
data.loc[short_term_rows, 'gender'].isnull().values.all()

In [None]:
# Time series analysis for the trips.
# Parse the text start times in a single vectorized call instead of running
# strptime row by row over Python lists; the result shares `data`'s index, so
# the derived columns align with their rows without rebuilding a Series each time.
start_times = pd.to_datetime(data['starttime'], format='%m/%d/%Y %H:%M')
data['starttime_mod'] = start_times
data['starttime_date'] = start_times.dt.date
data['starttime_year'] = start_times.dt.year
data['starttime_month'] = start_times.dt.month
data['starttime_day'] = start_times.dt.day
data['starttime_hour'] = start_times.dt.hour

# Mean trip duration for each calendar day, one bar per day.
data.groupby('starttime_date')['tripduration'].mean().plot.bar(
    title='Distribution of Trip duration by date', figsize=(15, 4))

Since the pattern repeats over a fixed interval of time, we can apply seasonal time series analysis here.


In [None]:
# Plain Python lists of the two columns; `trip_duration` is reused by the
# outlier cells further down.
trip_duration = list(data['tripduration'])
station_from = list(data['from_station_name'])

# Each entry: (message template, statistics function, values to summarize).
# Every summary is computed and printed before the next one is evaluated.
summaries = [
    ('Mean of trip duration: %f', statistics.mean, trip_duration),
    ('Median of trip duration: %f', statistics.median, trip_duration),
    ('Mode of station originating from: %s', statistics.mode, station_from),
]
for template, summarize, values in summaries:
    print(template % summarize(values))

The output above revealed that most trips originated from the Pier 69/Alaskan
Way & Clay St station. Hence this was the ideal location for running promotional
campaigns targeted to existing customers. Moreover, the output showed the mean to
be greater than the median.

In [None]:
# Histogram of the raw trip durations using 100 bins.
data['tripduration'].plot(kind='hist', bins=100, title='Frequency distribution of Trip duration')
plt.show()

 The distribution is
not symmetric and has majority of values toward the right-hand side of the mode. These
extreme values toward the right are negligible in quantity, but their extreme nature tends
to pull the mean toward themselves. Thus the reason why the mean is greater than the
median.

In [None]:
# Box plot of trip duration; the returned plot object is kept in `box`.
boxplot_columns = ['tripduration']
box = data.boxplot(column=boxplot_columns)
plt.show()

The box plot shows a huge number of outliers in trip duration.

In [None]:
# Quartiles and interquartile range of the trip durations.
q75, q25 = np.percentile(trip_duration, [75, 25])
iqr = q75 - q25

# A value is treated as an outlier when it falls outside
# [q25 - 1.5*iqr, q75 + 1.5*iqr].
fence_low = q25 - (1.5 * iqr)
fence_high = q75 + (1.5 * iqr)
inlier_count = sum(1 for x in trip_duration if fence_low <= x <= fence_high)
outlier_percent = (len(data) - inlier_count) * 100 / float(len(data))
print('Proportion of values as outlier: %f percent' % outlier_percent)

Only 9.42% of the values within the dataset turned out to be outliers.
Considering the time series nature of the dataset, we knew that removing these outliers
wouldn’t be an option. Hence the only option we could rely on was to
apply a transformation to these outliers to negate their extreme nature. Before doing so, however,
we were interested in observing the mean of the non-outlier values of trip duration.

In [None]:
# Upper whisker of the box plot: 1.5 IQRs above the third quartile.
upper_whisker = q75 + (1.5 * iqr)

# Mean of the durations that lie within 1.5 IQRs of the quartiles.
non_outlier_durations = [
    x for x in trip_duration
    if q75 + (1.5 * iqr) >= x >= q25 - (1.5 * iqr)
]
mean_trip_duration = np.mean(non_outlier_durations)
print('Mean of trip duration: %f' % mean_trip_duration)



In [None]:
def transform_tripduration(x):
    """Replace a duration above the upper whisker with the non-outlier mean.

    Reads the notebook-level `upper_whisker` and `mean_trip_duration`.
    Values at or below the whisker are returned unchanged.
    """
    return mean_trip_duration if x > upper_whisker else x


# Apply the replacement element-wise and plot the transformed column.
data['tripduration_mean'] = data['tripduration'].apply(transform_tripduration)
data['tripduration_mean'].plot(kind='hist', bins=100, title='Frequency distribution of mean Transformed Trip Duration')
plt.show()


In [None]:
# Summary statistics of the mean-transformed trip duration column.
transformed_duration = data['tripduration_mean']
print('Mean of trip duration: %f' % transformed_duration.mean())
print('Standard deviation of trip duration: %f' % transformed_duration.std())
print('Median of trip duration: %f' % transformed_duration.median())


**Correlation**

In [None]:
# Display settings for the printed correlation table. The precision option is
# addressed by its full key: the bare 'precision' shorthand is no longer
# accepted by current pandas, while 'display.precision' is the option's real name.
pd.set_option('display.width', 100)
pd.set_option('display.precision', 3)

# Rider age at the time of the trip: trip start year minus birth year.
data['age'] = data['starttime_year'] - data['birthyear']

# Pearson correlation between trip duration and age.
correlations = data[['tripduration', 'age']].corr(method='pearson')
print(correlations)

In [None]:
# For each categorical column, compare the daily trip counts of every pair of
# its categories with an independent two-sample t-test.
for cat in ['gender', 'usertype']:
    print('Category:%s\n' % cat)

    # Rows are dates, columns are category values, cells are trips that day;
    # dates on which any category has no count are dropped.
    daily_counts = data.groupby(['starttime_date', cat])['starttime_date'].count()
    groupby_category = daily_counts.unstack(cat).dropna()
    category_names = list(groupby_category.columns)

    # Visit each unordered pair once: pair every name with the names after it.
    for position, first_name in enumerate(category_names):
        for second_name in category_names[position + 1:]:
            print('%s %s' % (first_name, second_name))
            t_statistics = stats.ttest_ind(
                list(groupby_category[first_name]),
                list(groupby_category[second_name]),
            )
            print('Statistic: %f, P value: %f' % (t_statistics.statistic, t_statistics.pvalue))
            print('\n')


Central Limit Theorem

In [None]:
# Number of trips on each calendar day, as a plain list we can draw from.
daily_tickets = list(data.groupby('starttime_date').size())
sample_tickets = []
checkpoints = [1, 10, 100, 300, 500, 1000]  # sample sizes at which a panel is drawn
plot_count = 1

# Shuffle with a fixed seed on a private generator so the draw order (and so
# every intermediate panel) is identical on each run, without touching the
# module-level `random` state.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(daily_tickets)

plt.figure(figsize=(15, 7))
binrange = np.linspace(0, 700, 101)  # 100 bins spanning 0 to 700

for i in range(1000):
    # Draw one more day into the sample while any remain.
    if daily_tickets:
        sample_tickets.append(daily_tickets.pop())
    # Draw a panel at each checkpoint, and once more when the days run out,
    # so the last panel always shows the full sample.
    if i + 1 in checkpoints or not daily_tickets:
        plt.subplot(2, 3, plot_count)
        plt.hist(sample_tickets, binrange)
        plt.title('n=%d' % (i + 1), fontsize=15)
        plot_count += 1
    if not daily_tickets:
        break

plt.show()
        

*****FINAL CONCLUSION*****
 The insights were derived from data collected from 2014
to 2016, with demographic information available only for members and not for short-term pass holders. Hence, in order to get information about the short-term pass holders,
we need to go through a market research exercise where the central limit
theorem would come in handy. Trip duration follows a definite seasonal pattern that
repeats over time. Forecasting this time series can help predict the times when
the company needs to push its marketing efforts, and anticipating the times when most trips occur
can help ensure operational efficiencies. As for the promotions,
the best station at which to kick off the campaign would be Pier 69/Alaskan Way & Clay
St. Outliers were a tiny portion of the dataset; however, their time series nature meant
that those outliers couldn’t be removed, and a transformation was thus applied. Regarding
further analysis, because the features are not homogeneous, the
analysis would have to be done at the individual category level.