In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import linregress
import scipy.stats as stats

In [None]:
# Import the cleaned diabetes / fast food data, one row per county (FIPS) and year
diabetes_fastfood = 'diab_fastfood_clean_df.csv'
diabetes_fastfood_pd = pd.read_csv(diabetes_fastfood)

# Keep only the columns the analysis uses. The explicit .copy() makes this an
# independent frame, so derived columns added to it in later cells are not
# written onto a slice of the raw frame (which can raise SettingWithCopyWarning).
diabetes_fastfood_df = diabetes_fastfood_pd[
    ["FIPS", "State", "County", "% Diabetic", "Population", "Year", "Diabetic Population", "Restaurant Count"]
].copy()
diabetes_fastfood_df.head()

In [None]:
# Exploration: per-county mean of the numeric columns

county_stats = diabetes_fastfood_df.groupby("County")
# numeric_only=True skips non-numeric columns (presumably "State" holds text);
# pandas >= 2.0 raises TypeError instead of silently dropping them.
# NOTE(review): the mean of FIPS is still computed but is not a meaningful statistic.
county_stats.mean(numeric_only=True)

In [None]:
# Spot check: keep every row whose County is one of the three named below
counties_to_check = ['DuPage', 'Will', 'Lake']
in_spot_check = diabetes_fastfood_df["County"].isin(counties_to_check)
check_county = diabetes_fastfood_df[in_spot_check]

check_county

In [None]:
# Rows for the most populated counties, every year included.
# nlargest ranks individual county-year rows, so on its own it can keep some
# years of a county and drop others. It is used here only to identify the
# counties; all rows of those counties are then kept so each series is complete.
largest_rows = diabetes_fastfood_df.nlargest(12, 'Population')
top_counties_dp = (
    diabetes_fastfood_df
    .loc[diabetes_fastfood_df["County"].isin(largest_rows["County"].unique()), :]
    .sort_values("Population", ascending=False)  # largest first, as nlargest returned them
)
top_counties_dp

In [None]:
# Restaurant count over time, one line per top-populated county
top_counties = top_counties_dp["County"].unique()

for county in top_counties:
    # top_counties_dp is ordered by population, not by time; sort by Year so the
    # line is drawn chronologically instead of doubling back on itself.
    county_by_year = top_counties_dp.loc[top_counties_dp["County"] == county, :].sort_values("Year")

    years = county_by_year["Year"]
    fast_food = county_by_year["Restaurant Count"]
    plt.plot(years, fast_food, label=county, marker='o')
plt.xlabel("Year")
plt.ylabel("Restaurant Count")
plt.title("Fast Food Restaurant Count by Year, Most Populated Counties")
plt.legend()
plt.show()


In [None]:
# Restaurant count over the years for the spot-check counties only
# (Cook is not among them).
top_counties = check_county["County"].unique()
county_by_year = pd.DataFrame()

for county in top_counties:
    is_this_county = check_county["County"] == county
    county_by_year = check_county[is_this_county]

    years, fast_food = county_by_year["Year"], county_by_year["Restaurant Count"]
    plt.plot(years, fast_food, marker='o', label=county)

plt.legend()
plt.show()

In [None]:
# Population of Cook County over the available years

is_cook = diabetes_fastfood_df["County"] == 'Cook'
cook_county = diabetes_fastfood_df[is_cook]
years, population = cook_county["Year"], cook_county["Population"]
plt.plot(years, population, marker='o', label="Cook")
plt.legend()
plt.show()

In [None]:
# Totals per year across all counties
diabetes_fastfood_group_year = diabetes_fastfood_df.groupby("Year")
# numeric_only=True keeps the text columns (State, County) out of the
# aggregation; without it their values are "summed" as well.
# NOTE(review): the summed FIPS and "% Diabetic" columns remain in the result
# but are not meaningful as totals.
diabetes_fastfood_group_year_df = diabetes_fastfood_group_year.sum(numeric_only=True)
diabetes_fastfood_group_year_df.head()

In [None]:
# Add the yearly diabetic population divided by (population / 1000).
# NOTE(review): the column label says "per 1000 Restaurants", but the divisor is
# Population / 1000, i.e. this is a rate per 1,000 residents. The label looks
# like a misnomer; it is left unchanged because other cells look it up by name.
population_in_thousands = diabetes_fastfood_group_year_df["Population"] / 1000
diabetes_fastfood_group_year_df["Diabetics per 1000 Restaurants"] = (
    diabetes_fastfood_group_year_df["Diabetic Population"] / population_in_thousands
)
diabetes_fastfood_group_year_df.head()

In [None]:
# Pull the yearly totals out as plain lists for the 'by year' plotting cells

yearly_totals = diabetes_fastfood_group_year_df
years = yearly_totals.index.to_list()
restaurant_count, diabetic_population, diabetics_per_1000_restaturants = (
    yearly_totals[column].to_list()
    for column in ("Restaurant Count", "Diabetic Population", "Diabetics per 1000 Restaurants")
)

In [None]:
# Bar chart: total Illinois fast food restaurant count by year

plt.bar(years, restaurant_count, color='blue', alpha=0.5, align="center", width=3)
plt.title("Total Illinois Fast Food Restaurant Count by Year")
plt.xlabel("Year")
plt.ylabel("Fast Food Restaurants")
plt.show()  # render the figure before the written observation below

print("This plot shows a significant drop in total fast food restaurant count in 2021")

# This shows that there is a disconnect between the data from 2011-2016 and the data for 2021, which can be explained by:
# 1 - different methodologies between www.ers.usda.gov and yelp for defining fast food restaurants and/or missing yelp reviews
# 2 - the yelp api rate limit
# 3 - covid might have caused fast food restaurants to shut down or not be opened at the same rate

In [None]:
# More exploration by year.
# Bar chart of the "Diabetics per 1000 Restaurants" series, one bar per year.

bar_style = dict(color='blue', alpha=0.5, align="center", width=3)
plt.bar(years, diabetics_per_1000_restaturants, **bar_style)

plt.xlabel("Year")
plt.title("Diabetics per 1000 by Year")
plt.ylabel("Diabetic Population per 1000")



In [None]:
# Line chart of the total diabetic population per year
# (x-axis: Year, y-axis: total diabetic population)
plt.plot(years, diabetic_population)

plt.grid()
plt.title("Total Illinois Diabetic Population by Year")
plt.xlabel("Year")
plt.ylabel("Diabetic Populations")

In [None]:
# Add fast food density to the county-level frame: number of restaurants per
# 1,000 people, stored under the existing "Restaurant per Capita" label.
# .assign builds a new frame instead of writing into diabetes_fastfood_df, which
# was created as a column selection of the raw frame; this avoids pandas'
# SettingWithCopyWarning.
diabetes_fastfood_df = diabetes_fastfood_df.assign(**{
    "Restaurant per Capita":
        diabetes_fastfood_df["Restaurant Count"] / (diabetes_fastfood_df["Population"] / 1000)
})
diabetes_fastfood_df.head()

In [None]:
# One scatter plot per year: restaurants per capita on x, % diabetic on y

for year in years:
    in_year = diabetes_fastfood_df["Year"] == year
    year_data = diabetes_fastfood_df[in_year]
    restaurants, diabetes = year_data["Restaurant per Capita"], year_data["% Diabetic"]

    plt.scatter(restaurants, diabetes)
    plt.grid()
    plt.title(year)
    plt.xlabel("Restaurants per Capita")
    plt.ylabel("% Diabetic")
    plt.show()


In [None]:
# Combined (all years) scatter of % diabetic against restaurants per capita,
# overlaid with the fitted linear regression line
x_values = diabetes_fastfood_df['Restaurant per Capita']
y_values = diabetes_fastfood_df['% Diabetic']

fit = linregress(x_values, y_values)
slope, intercept, rvalue, pvalue, stderr = fit
regress_values = x_values * slope + intercept
line_eq = "y = {}x + {}".format(str(round(slope, 2)), str(round(intercept, 2)))

plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
# The equation label is placed at fixed data coordinates (0.7, 17)
plt.annotate(line_eq, (0.7, 17), fontsize=15, color="red")
plt.title("% Diabetic vs. Restaurants per Capita, All Years")
plt.xlabel('Restaurant per Capita')
plt.ylabel('% Diabetic')
print(f"The r-squared is: {rvalue**2}")
plt.show()

print("The low r-squared value indicates that variation in the % diabetic population\n  is not very well explained by the restaurants per capita in Illinois")

In [None]:
# ----- Section marker ("Hai's queue"): preview the first five rows of the county-level frame
diabetes_fastfood_df.head(5)

In [None]:
# Now we want to perform a hypothesis test
# Our hypothesis is that the number of fast food restaurants per capita in Illinois has an effect on 
# the percentage of the population with diabetes

# The null hypothesis is:  The number of fast food restaurants per capita in Illinois has NO effect on 
# the percentage of the population with diabetes

# We did two tests:  First an independent T-test, then an ANOVA

In [None]:
# Low-density sample: rows with fewer than 0.5 restaurants per capita
# (0.5 is the cut-off chosen for this comparison)

is_low_density = diabetes_fastfood_df["Restaurant per Capita"] < 0.5
lower_rest_per_head_df = diabetes_fastfood_df[is_low_density]
lower_rest_per_head_df

In [None]:
# High-density sample: rows with 0.5 or more restaurants per capita
is_high_density = diabetes_fastfood_df["Restaurant per Capita"] >= 0.5
higher_rest_per_head_df = diabetes_fastfood_df[is_high_density]
higher_rest_per_head_df

In [None]:
# % Diabetic values of the low-density sample (first group for the t-test)
lower_restaurant_group = lower_rest_per_head_df.loc[:, "% Diabetic"]
lower_restaurant_group

In [None]:
# % Diabetic values of the high-density sample (second group for the t-test)
higher_restaurant_group = higher_rest_per_head_df.loc[:, "% Diabetic"]
higher_restaurant_group

In [None]:
# Independent two-sample t-test on % Diabetic between the two groups.
# equal_var=False means the two groups are not assumed to share a variance.

stats.ttest_ind(
    lower_restaurant_group,
    higher_restaurant_group,
    equal_var=False,
    axis=0,
)

In [None]:
# Interpretation of the independent t-test in the previous cell. The significance
# level is 0.05 (the same threshold the ANOVA conclusion in this notebook uses),
# not 0.5.
# NOTE(review): the conclusion is hard-coded; re-check it against the reported
# p-value whenever the data or the 0.5 restaurants-per-capita split changes.
print("The pvalue is > 0.05, meaning we cannot reject the null hypothesis that number of fast food restaurants \n per capita in Illinois has NO effect on the percentage of the population with diabetes")

In [None]:
    # Overlaid density histograms of % Diabetic for the two groups, with a
    # dashed vertical line at each group's mean, to show how much they overlap
    plt.subplot(2, 1, 2)
    labelled_groups = (
        (lower_restaurant_group, "lower_diabetic_mean"),
        (higher_restaurant_group, "higher_diabetic_mean"),
    )
    for group, tag in labelled_groups:
        plt.hist(group, 10, density=True, alpha=0.7, label=tag)
    for group, _ in labelled_groups:
        plt.axvline(group.mean(), color='k', linestyle='dashed', linewidth=1)
    plt.legend()

In [None]:
# Now on Dom's advice we are trying an ANOVA using three groups based on the Restaurant per Capita
# First assigning buckets of three roughly equal groups of low (<0.44), medium (0.44 <= med < 0.62), and high (>= 0.62)

# .copy() gives the bucket its own data, so adding the label column below does
# not write onto a slice of diabetes_fastfood_df (SettingWithCopyWarning).
low_rest_per_head_df = diabetes_fastfood_df.loc[diabetes_fastfood_df["Restaurant per Capita"] < 0.44, :].copy()
low_rest_per_head_df["Rest_per_Capita_Bucket"] = "Low"
low_rest_per_head_df.head()

In [None]:
# Medium bucket: 0.44 <= Restaurant per Capita < 0.62
# .copy() gives the bucket its own data, so adding the label column below does
# not write onto a slice of diabetes_fastfood_df (SettingWithCopyWarning).
is_medium = (diabetes_fastfood_df["Restaurant per Capita"] >= 0.44) & (diabetes_fastfood_df["Restaurant per Capita"] < .62)
med_rest_per_head_df = diabetes_fastfood_df.loc[is_medium, :].copy()
med_rest_per_head_df["Rest_per_Capita_Bucket"] = "Med"
med_rest_per_head_df.head()

In [None]:
# High bucket: Restaurant per Capita >= 0.62
# .copy() gives the bucket its own data, so adding the label column below does
# not write onto a slice of diabetes_fastfood_df (SettingWithCopyWarning).
high_rest_per_head_df = diabetes_fastfood_df.loc[diabetes_fastfood_df["Restaurant per Capita"] >= 0.62, :].copy()
high_rest_per_head_df["Rest_per_Capita_Bucket"] = "High"
high_rest_per_head_df.head()

In [None]:
# One-way ANOVA of % Diabetic across the low, medium and high buckets

bucket_samples = [
    bucket_df["% Diabetic"]
    for bucket_df in (low_rest_per_head_df, med_rest_per_head_df, high_rest_per_head_df)
]
stats.f_oneway(*bucket_samples)

In [None]:
# Interpretation of the three-bucket one-way ANOVA run in the previous cell.
# NOTE(review): the conclusion is hard-coded; re-check it against the reported
# p-value if the data or the bucket cut-offs change.
print("The pvalue is less than 0.05, meaning we reject the null hypothesis using this test")

In [None]:
# Compare bucket means to see which of the three groups differs from the others.
# Mean % Diabetic of the low bucket:

low_rest_per_head_df.loc[:, "% Diabetic"].mean()

In [None]:
# Mean % Diabetic of the medium bucket
med_rest_per_head_df.loc[:, "% Diabetic"].mean()

In [None]:
# Mean % Diabetic of the high bucket
high_rest_per_head_df.loc[:, "% Diabetic"].mean()

In [None]:
# The high group looks the most out of line, so repeat the ANOVA without it:
# low bucket vs. medium bucket only.
# NOTE(review): this and the following pairwise comparisons are not adjusted for
# multiple testing; consider a post-hoc test such as Tukey's HSD — confirm with the team.

low_sample = low_rest_per_head_df["% Diabetic"]
med_sample = med_rest_per_head_df["% Diabetic"]
stats.f_oneway(low_sample, med_sample)

In [None]:
# Interpretation of the low-vs-medium ANOVA run in the previous cell.
# NOTE(review): hard-coded conclusion; re-check against the reported p-value if the data change.
print("There is not a statistically significant difference between the low group and medium group.")

In [None]:
# Pairwise one-way ANOVA: low bucket vs. high bucket
low_sample = low_rest_per_head_df["% Diabetic"]
high_sample = high_rest_per_head_df["% Diabetic"]
stats.f_oneway(low_sample, high_sample)

In [None]:
# Interpretation of the low-vs-high ANOVA run in the previous cell.
# NOTE(review): hard-coded conclusion; re-check against the reported p-value if the data change.
print("There is a statistically significant difference between the low group and high group.")

In [None]:
# Pairwise one-way ANOVA: medium bucket vs. high bucket
med_sample = med_rest_per_head_df["% Diabetic"]
high_sample = high_rest_per_head_df["% Diabetic"]
stats.f_oneway(med_sample, high_sample)

In [None]:
# Interpretation of the medium-vs-high ANOVA run in the previous cell.
# NOTE(review): hard-coded conclusion (described as borderline); re-check
# against the reported p-value if the data change.
print("There is not a statistically significant difference between the medium group and high group. (barely)")

In [None]:
# Finally, want to do a box plot to visually see the differences between the three groups

In [None]:
# Stack the three bucket frames back into a single frame with a fresh row index
low_med_df = pd.concat([low_rest_per_head_df, med_rest_per_head_df], ignore_index=True)
low_med_high_df = pd.concat(
    [low_rest_per_head_df, med_rest_per_head_df, high_rest_per_head_df],
    ignore_index=True,
)
low_med_high_df

In [None]:
# Box plot of % Diabetic, one box per restaurant-per-capita bucket
low_med_high_df.boxplot(
    column='% Diabetic',
    by='Rest_per_Capita_Bucket',
    figsize=(20, 10),
)
plt.xlabel("Restaurant per Capita Bucket")
plt.ylabel("% Diabetic")