# Hypothesis testing in Python

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# location of the hourly records file (hour.csv) read by this notebook
DATA_URL = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Analysis-Workshop/master/Chapter01/data/hour.csv'

# download the CSV and parse it into a DataFrame
hourly_data = pd.read_csv(DATA_URL)

In [None]:
# overview of the dataset: its dimensions and the total number of missing cells
data_shape = hourly_data.shape
n_missing = hourly_data.isnull().sum().sum()
print(f"Shape of data: {data_shape}")
print(f"Number of missing values in the data: {n_missing}")

# summary statistics (count, mean, std, quartiles, min/max) of the numerical columns
hourly_data.describe()

Shape of data: (17379, 17)
Number of missing values in the data: 0


Unnamed: 0,instant,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
count,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0
mean,8690.0,2.50164,0.502561,6.537775,11.546752,0.02877,3.003683,0.682721,1.425283,0.496987,0.475775,0.627229,0.190098,35.676218,153.786869,189.463088
std,5017.0295,1.106918,0.500008,3.438776,6.914405,0.167165,2.005771,0.465431,0.639357,0.192556,0.17185,0.19293,0.12234,49.30503,151.357286,181.387599
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.02,0.0,0.0,0.0,0.0,0.0,1.0
25%,4345.5,2.0,0.0,4.0,6.0,0.0,1.0,0.0,1.0,0.34,0.3333,0.48,0.1045,4.0,34.0,40.0
50%,8690.0,3.0,1.0,7.0,12.0,0.0,3.0,1.0,1.0,0.5,0.4848,0.63,0.194,17.0,115.0,142.0
75%,13034.5,3.0,1.0,10.0,18.0,0.0,5.0,1.0,2.0,0.66,0.6212,0.78,0.2537,48.0,220.0,281.0
max,17379.0,4.0,1.0,12.0,23.0,1.0,6.0,1.0,4.0,1.0,1.0,1.0,0.8507,367.0,886.0,977.0


In [None]:
# work on a copy so the raw hourly_data frame stays untouched
preprocessed_data = hourly_data.copy()

# lookup tables translating the integer codes into readable labels
seasons_mapping = {1: 'winter', 2: 'spring', 3: 'summer', 4: 'fall'}
yr_mapping = {0: 2011, 1: 2012}
weekday_mapping = {
    0: 'Sunday', 1: 'Monday', 2: 'Tuesday', 3: 'Wednesday',
    4: 'Thursday', 5: 'Friday', 6: 'Saturday',
}
weather_mapping = {1: 'clear', 2: 'cloudy', 3: 'light_rain_snow', 4: 'heavy_rain_snow'}

# decode each coded column in turn; mapping.__getitem__ is the same strict
# dictionary lookup as mapping[x], so an unexpected code still raises KeyError
for column, mapping in (
    ('season', seasons_mapping),
    ('yr', yr_mapping),
    ('weekday', weekday_mapping),
    ('weathersit', weather_mapping),
):
    preprocessed_data[column] = preprocessed_data[column].map(mapping.__getitem__)

# rescale hum by 100 and windspeed by 67
# (presumably undoing the dataset's normalisation -- confirm against the data dictionary)
preprocessed_data['hum'] = preprocessed_data['hum'].mul(100)
preprocessed_data['windspeed'] = preprocessed_data['windspeed'].mul(67)

# preview ten reproducibly sampled rows of the transformed columns
cols = ['season', 'yr', 'weekday', 'weathersit', 'hum', 'windspeed']
preprocessed_data.loc[:, cols].sample(n=10, random_state=123)

Unnamed: 0,season,yr,weekday,weathersit,hum,windspeed
5792,summer,2011,Saturday,clear,74.0,8.9981
7823,fall,2011,Sunday,clear,43.0,31.0009
15426,fall,2012,Tuesday,cloudy,77.0,6.0032
15028,fall,2012,Sunday,clear,51.0,22.0028
12290,spring,2012,Friday,cloudy,89.0,12.998
3262,spring,2011,Friday,clear,64.0,7.0015
10763,spring,2012,Thursday,clear,42.0,23.9994
12384,spring,2012,Tuesday,light_rain_snow,82.0,11.0014
6051,summer,2011,Wednesday,clear,52.0,19.0012
948,winter,2011,Saturday,clear,80.0,0.0


In [None]:
# sanity check: in every row, casual + registered rides must equal the total count
rides_sum = preprocessed_data['casual'] + preprocessed_data['registered']
totals_match = rides_sum.eq(preprocessed_data['cnt']).all()
assert totals_match, 'Sum of casual and registered rides not equal to total number of rides'

# display the preprocessed frame
preprocessed_data

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,winter,2011,1,0,0,Saturday,0,clear,0.24,0.2879,81.0,0.0000,3,13,16
1,2,2011-01-01,winter,2011,1,1,0,Saturday,0,clear,0.22,0.2727,80.0,0.0000,8,32,40
2,3,2011-01-01,winter,2011,1,2,0,Saturday,0,clear,0.22,0.2727,80.0,0.0000,5,27,32
3,4,2011-01-01,winter,2011,1,3,0,Saturday,0,clear,0.24,0.2879,75.0,0.0000,3,10,13
4,5,2011-01-01,winter,2011,1,4,0,Saturday,0,clear,0.24,0.2879,75.0,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,17375,2012-12-31,winter,2012,12,19,0,Monday,1,cloudy,0.26,0.2576,60.0,11.0014,11,108,119
17375,17376,2012-12-31,winter,2012,12,20,0,Monday,1,cloudy,0.26,0.2576,60.0,11.0014,8,81,89
17376,17377,2012-12-31,winter,2012,12,21,0,Monday,1,clear,0.26,0.2576,60.0,11.0014,7,83,90
17377,17378,2012-12-31,winter,2012,12,22,0,Monday,1,clear,0.26,0.2727,56.0,8.9981,13,48,61


Hypothesis #3: Detecting whether there is a statistically significant difference between registered rides in clear weather and in other weather conditions

Null hypothesis: proportion(clears/all) = 0.25, since there are 4 weather conditions, and a proportion of 0.25 would mean that the number of rides is independent of the weather.


In [None]:
# boolean masks: rows recorded in clear weather vs. rows recorded in any other condition
clearDays = preprocessed_data['weathersit'].eq('clear')
otherWeather = preprocessed_data['weathersit'].ne('clear')

# split the records into the two groups
clears = preprocessed_data.loc[clearDays]
others = preprocessed_data.loc[otherWeather]

In [None]:
# summary statistics of the numerical columns, restricted to the clear-weather rows
clears.describe()

Unnamed: 0,instant,yr,mnth,hr,holiday,workingday,temp,atemp,hum,windspeed,casual,registered,cnt
count,11413.0,11413.0,11413.0,11413.0,11413.0,11413.0,11413.0,11413.0,11413.0,11413.0,11413.0,11413.0,11413.0
mean,8696.750723,2011.505389,6.511872,11.731709,0.029878,0.668711,0.511054,0.488387,57.37466,12.767059,40.545431,164.323841,204.869272
std,4910.639765,0.499993,3.335956,7.015996,0.170259,0.470697,0.200775,0.178958,17.846308,8.293805,53.126559,156.890453,189.487773
min,1.0,2011.0,1.0,0.0,0.0,0.0,0.02,0.0,8.0,0.0,0.0,0.0,1.0
25%,4494.0,2011.0,4.0,6.0,0.0,0.0,0.34,0.3333,43.0,7.0015,5.0,39.0,46.0
50%,8715.0,2012.0,7.0,12.0,0.0,1.0,0.52,0.5,56.0,12.998,20.0,128.0,159.0
75%,12956.0,2012.0,9.0,18.0,0.0,1.0,0.68,0.6364,71.0,16.9979,56.0,232.0,304.0
max,17379.0,2012.0,12.0,23.0,1.0,1.0,1.0,1.0,100.0,54.002,367.0,886.0,977.0


In [None]:
# total number of rides (cnt) recorded in clear weather -- a count, not a proportion
clears_p = clears['cnt'].sum()
clears_p

2338173

In [None]:
# total number of rides (cnt) across all weather conditions
all_p = preprocessed_data['cnt'].sum()
all_p

3292679

In [None]:
# share of all rides that took place in clear weather
proportion = clears_p / all_p
proportion

0.7101126468750826

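The observed proportion (about 0.71) is far above 0.25, so the number of rides is clearly greater in clear weather and we reject the null hypothesis. Note, however, that this compares raw proportions rather than running a formal significance test, and that clear weather accounts for 11,413 of the 17,379 hourly records, so the 0.25 baseline does not reflect how often each weather condition occurs.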