In [None]:
# Activity: Explore hypothesis testing

import numpy as np
import pandas as pd
from scipy import stats

In [None]:
aqi = pd.read_csv("c4_epa_air_quality.csv")

In [None]:
aqi.describe()

Unnamed: 0.1,Unnamed: 0,arithmetic_mean,aqi
count,260.0,260.0,260.0
mean,129.5,0.403169,6.757692
std,75.199734,0.317902,7.061707
min,0.0,0.0,0.0
25%,64.75,0.2,2.0
50%,129.5,0.276315,5.0
75%,194.25,0.516009,9.0
max,259.0,1.921053,50.0


In [None]:
aqi.columns

Index(['Unnamed: 0', 'date_local', 'state_name', 'county_name', 'city_name',
       'local_site_name', 'parameter_name', 'units_of_measure',
       'arithmetic_mean', 'aqi'],
      dtype='object')

In [None]:
aqi["county_name"].unique()

array(['Maricopa', 'Belmont', 'Teton', 'Philadelphia', 'Polk', 'Honolulu',
       'Erie', 'Larimer', 'Dakota', 'Marion', 'Hamilton', 'Jackson',
       'Hartford', 'San Bernardino', 'Washoe', 'Sacramento', 'Pima',
       'Roanoke', 'Linn', 'Stark', 'Providence', 'Harris', 'Clark',
       'Orange', 'Alameda', 'Cook', 'Wyandotte', 'Baltimore', 'Hennepin',
       'Bergen', 'Los Angeles', 'Fresno', 'Anoka', 'Suffolk',
       'Cumberland', 'Ada', 'San Mateo', 'La Plata', 'Contra Costa',
       'Arlington', 'Butte', 'Riverside', 'Chittenden', 'Hillsborough',
       'Shelby', 'DeKalb', 'Summit', 'Hampden', 'Minnehaha',
       'Mecklenburg', 'Kanawha', 'Denver', 'Essex', 'Kern', 'Cuyahoga',
       'Scott', 'Allegheny', 'Wayne', 'Douglas', 'St. Louis City',
       'Jefferson', 'King', 'Wake', 'San Joaquin', 'Garrett',
       'Santa Barbara', 'Laramie', 'Edmonson', 'Tulsa', 'Santa Clara',
       'Humboldt', 'Union', 'Howard', 'El Paso', 'Oklahoma', 'Pinellas',
       'Mesa', 'Adams', 'New Haven',

In [None]:
la = aqi[aqi["county_name"] == "Los Angeles"]
not_la = aqi[aqi["county_name"] != "Los Angeles"]

In [None]:
la.shape

(14, 10)

In [None]:
not_la.shape

(246, 10)

In [None]:
sample_la = la.sample(n=10, replace=False, random_state=42)

In [None]:
sample_not_la = not_la.sample(n=10, replace=False, random_state=42)

In [None]:
sample_la.columns

Index(['Unnamed: 0', 'date_local', 'state_name', 'county_name', 'city_name',
       'local_site_name', 'parameter_name', 'units_of_measure',
       'arithmetic_mean', 'aqi'],
      dtype='object')

In [None]:
stats.ttest_ind(a=sample_la["aqi"], b=sample_not_la["aqi"], equal_var=False)

TtestResult(statistic=np.float64(1.8187954468925667), pvalue=np.float64(0.08876287834333375), df=np.float64(15.150720568684816))

In [None]:
# the p value is 0.0887 so it's 8.87% which means that, Since the p-value (0.0887) is greater than the significance level (0.05):

# You fail to reject the null hypothesis.
# This means there isn't enough evidence to conclude that the observed result is statistically significant at the 5% significance level.
# In simple terms, the data doesn't provide strong enough evidence to reject the null hypothesis, meaning the results could have occurred by random chance under the assumption that the null hypothesis is true.

# you fail to reject the null hypothesis, so null hypothesis is true.

In [None]:
# #Hypothesis 2: With limited resources, ROA has to choose between New York
# and Ohio for their next regional office. Does New York have a lower AQI than
# Ohio?

In [None]:
aqi["state_name"].unique()

array(['Arizona', 'Ohio', 'Wyoming', 'Pennsylvania', 'Iowa', 'Hawaii',
       'Colorado', 'Minnesota', 'Indiana', 'Missouri', 'Connecticut',
       'California', 'Nevada', 'Virginia', 'Rhode Island', 'Texas',
       'Illinois', 'Kansas', 'Maryland', 'New Jersey', 'Massachusetts',
       'Maine', 'Idaho', 'Florida', 'Vermont', 'Tennessee', 'Georgia',
       'South Dakota', 'North Carolina', 'West Virginia', 'Michigan',
       'Nebraska', 'Alabama', 'Washington', 'Kentucky', 'Oklahoma',
       'New York', 'North Dakota', 'Montana', 'Utah', 'Delaware',
       'New Hampshire', 'Louisiana', 'Mississippi', 'New Mexico',
       'Oregon', 'District Of Columbia', 'Arkansas', 'Puerto Rico',
       'South Carolina', 'Alaska', 'Wisconsin'], dtype=object)

In [None]:
ny = aqi[aqi["state_name"] == "New York"]
ohio = aqi[aqi["state_name"] == "Ohio"]

In [None]:
ny.shape

(10, 10)

In [None]:
ohio.shape

(12, 10)

In [None]:
sample_ny = ny.sample(n=50, replace=True, random_state=42)
sample_ohio = ohio.sample(n=50, replace=True, random_state=42)

In [None]:
stats.ttest_ind(a=sample_ny["aqi"], b=sample_ohio["aqi"], equal_var=False)

TtestResult(statistic=np.float64(-3.9130296393824104), pvalue=np.float64(0.0002210681483284815), df=np.float64(64.86787455319335))

In [None]:
# since the pvalue is less than 5% so we can accept the null hypothesis that The mean AQI of New York is greater than or equal to that of Ohio.

In [None]:
# Hypothesis 3: A new policy will affect those states with a mean AQI of 10 or
# greater. Will Michigan be affected by this new policy?

In [None]:
mich = aqi[aqi["state_name"] == "Michigan"]

In [None]:
sample_mich = mich.sample(n=50, replace=True)

In [None]:
stats.ttest_1samp(a=sample_mich["aqi"].mean(), b=10, alternative="greater")

TypeError: ttest_1samp() got an unexpected keyword argument 'b'

In [None]:
# check documentation for 1sample. https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_1samp.html

In [None]:
stats.ttest_1samp(sample_mich["aqi"], 10, alternative="greater")

TtestResult(statistic=np.float64(-4.016733292299032), pvalue=np.float64(0.9998987958539883), df=np.int64(49))

In [None]:
# here p value is 99.98% so we failed to reject null hypothesis which means that the mean AQI of Michigan is NOT less than or equal to 10.