In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import linregress

In [None]:
# Load the cleaned diabetes / fast-food table (per county (FIPS) and year)
diabetes_fastfood = 'diab_fastfood_clean_df.csv'
diabetes_fastfood_pd = pd.read_csv(filepath_or_buffer=diabetes_fastfood)

# Preview the first five rows
diabetes_fastfood_pd.head(n=5)

In [None]:
# Exploration: per-county average of every numeric column across all years.
county_stats = diabetes_fastfood_pd.groupby("County")

# numeric_only=True keeps any text columns out of the aggregation; without it
# pandas 2.x raises a TypeError instead of silently dropping them.
county_stats.mean(numeric_only=True)

In [None]:
# Pull every row for a handful of counties so they can be inspected by eye
spot_check_names = ['DuPage', 'Will', 'Lake']
in_spot_check = diabetes_fastfood_pd["County"].isin(spot_check_names)
check_county = diabetes_fastfood_pd.loc[in_spot_check, :]

check_county

In [None]:
# Select the 10 most populated counties and keep ALL of their yearly rows.
#
# Ranking the raw rows with nlargest(10, 'Population') ranks county-year
# records rather than counties, so one very large county can take most of the
# ten slots and each county keeps only some of its years. Rank counties by
# their peak population first, then filter the full table.
# NOTE(review): assumes county names are unique in this dataset -- confirm
# (switch the grouping key to the FIPS column if they are not).
county_peak_population = diabetes_fastfood_pd.groupby("County")["Population"].max()
largest_counties = county_peak_population.nlargest(10).index
top_counties_dp = diabetes_fastfood_pd.loc[
    diabetes_fastfood_pd["County"].isin(largest_counties), :
]
top_counties_dp.head(10)

In [None]:
# Distinct county names present in the top-population selection
top_counties = pd.unique(top_counties_dp["County"])
top_counties

In [None]:
# Restaurant count per year, one line per county in `top_counties`.
fig, ax = plt.subplots()

for county in top_counties:
    # Sort chronologically: the selected rows are not guaranteed to be in year
    # order, and an unsorted series makes the connecting line double back.
    county_by_year = (
        top_counties_dp
        .loc[top_counties_dp["County"] == county, :]
        .sort_values("Year")
    )

    years = county_by_year["Year"]
    fast_food = county_by_year["Restaurant Count"]
    ax.plot(years, fast_food, label=county, marker='o')

ax.set_xlabel("Year")
ax.set_ylabel("Restaurant Count")
ax.set_title("Restaurant count by year - top populated counties")
ax.legend()
plt.show()


In [None]:
# Restaurant count per year for the spot-check counties (a selection that
# excludes Cook), to see the progress of restaurants over the years.
#
# The county list gets its own name instead of overwriting `top_counties`, so
# re-running the top-populated-counties plot still iterates the right counties.
selected_counties = check_county["County"].unique()

fig, ax = plt.subplots()

for county in selected_counties:
    # Sort chronologically so the connecting line never doubles back.
    county_by_year = (
        check_county
        .loc[check_county["County"] == county, :]
        .sort_values("Year")
    )

    years = county_by_year["Year"]
    fast_food = county_by_year["Restaurant Count"]
    ax.plot(years, fast_food, label=county, marker='o')

ax.set_xlabel("Year")
ax.set_ylabel("Restaurant Count")
ax.set_title("Restaurant count by year - selected counties")
ax.legend()
plt.show()

In [None]:
# This shows a disconnect between the 2011-2016 data and the 2021 data, which could be explained by:
# 1 - different data-collection methodologies
# 2 - the Yelp API rate limit
# 3 - COVID, which might have caused fast food restaurants to shut down or not open at the same rate

In [None]:
# Peek at the table again ahead of the diabetes-per-year chart (bar chart still to be added)
diabetes_fastfood_pd.head(n=5)

In [None]:
# Yearly totals across all counties (the x-axis of the chart built from this
# table is Year).
#
# numeric_only=True restricts the sum to numeric columns; without it the text
# "County" column is string-concatenated into one long value per year.
# NOTE(review): the summed "% Diabetic" column is not a meaningful percentage;
# use the summed "Diabetic Population" (optionally divided by the summed
# "Population") for the y-axis instead.
diabetic_progress = diabetes_fastfood_pd.groupby("Year")
diabetic_progress_dp = diabetic_progress.sum(numeric_only=True)
diabetic_progress_dp

In [None]:

# Expose the Year index of the yearly totals as a regular "Years" column.
#
# The guard keeps this cell idempotent: once the index has been reset, a second
# run would otherwise overwrite "Years" with the positional index 0..n-1.
# No figure is created here: with the inline backend a figure opened in one
# cell is rendered (empty) when that cell finishes and is not reused by a plot
# issued from a later cell.
if "Years" not in diabetic_progress_dp.columns:
    diabetic_progress_dp = (
        diabetic_progress_dp
        .assign(Years=diabetic_progress_dp.index)
        .reset_index(drop=True)
    )
years = diabetic_progress_dp["Years"]

In [None]:
# Total diabetic population per year, summed over all counties.
# The figure is created in the same cell as the plot so the wide 20x3 size
# actually applies to this chart.
fig, ax = plt.subplots(figsize=(20, 3))
ax.plot(diabetic_progress_dp["Years"], diabetic_progress_dp["Diabetic Population"], marker='o')
ax.set_xticks(diabetic_progress_dp["Years"])  # one tick per year present in the data
ax.set_xlabel("Year")
ax.set_ylabel("Diabetic Population")
ax.set_title("Diabetic population by year")
plt.show()

In [None]:
# Add a fast-food density column. Despite the column name, the value is the
# number of restaurants per 1,000 residents (Population is divided by 1000).
# The column is written onto the shared table because later cells read it by name.
diabetes_fastfood_pd["Restaurant per Capita"] = (
    diabetes_fastfood_pd["Restaurant Count"] / (diabetes_fastfood_pd["Population"] / 1000)
)

# Show a preview rather than dumping the whole table
diabetes_fastfood_pd.head()

In [None]:
# Scatter of fast food per capita (x) against % diabetic (y), one chart per year.
#
# The years are read from the data itself rather than from the notebook-level
# `years` variable, which other plotting cells also assign; this cell therefore
# gives the same charts regardless of which cell ran last.
for year in sorted(diabetes_fastfood_pd["Year"].dropna().unique()):
    year_data = diabetes_fastfood_pd.loc[diabetes_fastfood_pd["Year"] == year, :]
    restaurants = year_data["Restaurant per Capita"]
    diabetes = year_data["% Diabetic"]

    fig, ax = plt.subplots()
    ax.scatter(restaurants, diabetes)
    ax.set_title(year)
    ax.set_xlabel('Restaurant per Capita')
    ax.set_ylabel('% Diabetic')
    plt.show()


In [None]:
# Linear regression for all years: % diabetic as a function of restaurants per capita.
#
# Rows with a missing or infinite value are dropped first: the per-capita column
# is a division by Population, and a single NaN/inf in the inputs would
# propagate into every regression statistic.
regression_data = (
    diabetes_fastfood_pd[['Restaurant per Capita', '% Diabetic']]
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)
x_values = regression_data['Restaurant per Capita']
y_values = regression_data['% Diabetic']

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# Render a negative intercept as "- b" instead of "+ -b"
intercept_sign = "+" if intercept >= 0 else "-"
line_eq = "y = " + str(round(slope,2)) + "x " + intercept_sign + " " + str(abs(round(intercept,2)))

fig, ax = plt.subplots()
ax.scatter(x_values, y_values)
ax.plot(x_values, regress_values, "r-")
# Position the equation in axes-fraction coordinates so it stays inside the
# plot whatever the data range is.
ax.annotate(line_eq, (0.05, 0.9), xycoords="axes fraction", fontsize=15, color="red")
ax.set_xlabel('Restaurant per Capita')
ax.set_ylabel('% Diabetic')
ax.set_title('Restaurant per Capita vs. % Diabetic (all years)')
print(f"The r-squared is: {rvalue**2}")
plt.show()