In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import linregress

In [None]:
# Load the cleaned diabetes / fast food dataset (one row per county FIPS and year)
diabetes_fastfood = 'diab_fastfood_clean_df.csv'
diabetes_fastfood_pd = pd.read_csv(filepath_or_buffer=diabetes_fastfood)

# Preview the first rows to confirm the file loaded
diabetes_fastfood_pd.head(n=5)

In [None]:
# Keep only the columns used in this analysis.
# The explicit .copy() makes this an independent frame, so adding columns to it
# later does not raise SettingWithCopyWarning or depend on the raw frame.
diabetes_fastfood_df = diabetes_fastfood_pd[
    [
        "FIPS",
        "State",
        "County",
        "% Diabetic",
        "Population",
        "Year",
        "Diabetic Population",
        "Restaurant Count",
    ]
].copy()
diabetes_fastfood_df.head()

In [None]:
# Time for some exploration

# Average every numeric column per county name across all available years.
# numeric_only=True skips text columns such as "State": they cannot be averaged,
# and pandas 2.x raises a TypeError on them instead of silently dropping them.
county_stats = diabetes_fastfood_df.groupby("County")
county_stats.mean(numeric_only=True)

In [None]:
# Spot check: pull every row belonging to a handful of counties
spot_check_names = ['DuPage', 'Will', 'Lake']
is_spot_check = diabetes_fastfood_df["County"].isin(spot_check_names)
check_county = diabetes_fastfood_df[is_spot_check]

check_county

In [None]:
# Ten largest records by population, used to pick the counties plotted by year below.
# Rows are per county and year, so the same county can appear more than once here.
top_counties_dp = diabetes_fastfood_df.nlargest(n=10, columns='Population')
top_counties_dp

In [None]:
# Counties that appear among the most populated records
top_counties = top_counties_dp["County"].unique()

# Plot the restaurant count over time for each of those counties.
fig, ax = plt.subplots()
for county in top_counties:
    # Read from the full dataset rather than top_counties_dp: the latter holds only
    # the ten largest rows, so a county could be missing years. Sorting by Year keeps
    # each line running left to right instead of following population order.
    county_by_year = (
        diabetes_fastfood_df
        .loc[diabetes_fastfood_df["County"] == county]
        .sort_values("Year")
    )
    ax.plot(
        county_by_year["Year"],
        county_by_year["Restaurant Count"],
        label=county,
        marker='o',
    )

ax.set_title("Fast Food Restaurant Count by Year (Most Populated Counties)")
ax.set_xlabel("Year")
ax.set_ylabel("Restaurant Count")
ax.legend()
plt.show()


In [None]:
# Selected counties (excluding Cook) to see how restaurant counts progress over the years
top_counties = check_county["County"].unique()

# sort=False walks the counties in order of first appearance, i.e. the same order as
# top_counties, so line colours and legend entries come out in that order.
for county, county_by_year in check_county.groupby("County", sort=False):
    plt.plot(
        county_by_year["Year"],
        county_by_year["Restaurant Count"],
        marker='o',
        label=county,
    )
plt.legend()
plt.show()

In [None]:
# Show the first rows of the working frame again as a reminder of its columns
diabetes_fastfood_df.head(n=5)

In [None]:
# Total the additive count columns per year across all counties.
# Only true counts are summed: adding up FIPS codes or percentages is meaningless,
# and on pandas 2.x summing the text columns (State, County) concatenates strings.
diabetes_fastfood_group_year = diabetes_fastfood_df.groupby("Year")
diabetes_fastfood_group_year_df = diabetes_fastfood_group_year[
    ["Population", "Diabetic Population", "Restaurant Count"]
].sum()

In [None]:
# Preview the per-year totals. The "Year" groupby is already built in the previous
# cell, so it is not rebuilt here.
diabetes_fastfood_group_year_df.head()

In [None]:
# Diabetics per 1,000 fast food restaurants, for each year.
# The denominator is the restaurant count in thousands. Dividing by "Population"
# would give diabetics per 1,000 residents, which matches neither the column name
# nor the chart that is built from this column.
diabetes_fastfood_group_year_df["Diabetics per 1000 Restaurants"] = (
    diabetes_fastfood_group_year_df["Diabetic Population"]
    / (diabetes_fastfood_group_year_df["Restaurant Count"] / 1000)
)
diabetes_fastfood_group_year_df.head()

In [None]:
# Pull the yearly totals out as plain Python lists for the 'by year' charts
yearly_totals = diabetes_fastfood_group_year_df

years = yearly_totals.index.to_list()
restaurant_count = yearly_totals["Restaurant Count"].to_list()
diabetic_population = yearly_totals["Diabetic Population"].to_list()
diabetics_per_1000_restaturants = yearly_totals["Diabetics per 1000 Restaurants"].to_list()

In [None]:
# Bar chart: total Illinois fast food restaurant count by year
fig, ax = plt.subplots()
# NOTE(review): width is expressed in x-axis (year) units, so bars overlap if the
# years are closer than 3 apart - confirm against the actual year spacing.
ax.bar(years, restaurant_count, color='blue', alpha=0.5, align="center", width=3)
ax.set_title("Total Illinois Fast Food Restaurant Count by Year")
ax.set_xlabel("Year")
ax.set_ylabel("Fast Food Restaurants")
plt.show()

print("This shows the drop in total fast food restaurant count in 2021")

In [None]:
# Bar chart: diabetic population per 1,000 fast food restaurants by year
fig, ax = plt.subplots()
# NOTE(review): width is expressed in x-axis (year) units, so bars overlap if the
# years are closer than 3 apart - confirm against the actual year spacing.
ax.bar(years, diabetics_per_1000_restaturants, color='blue', alpha=0.5, align="center", width=3)
ax.set_title("Diabetics per 1000 Fast Food Restaurants by Year")
ax.set_xlabel("Year")
ax.set_ylabel("Diabetic Population per 1000 Restaurants")
plt.show()

print("This plot shows a potential correlation between diabetic population and restaurant count \n but it could be due to the restaurant decrease in the 2021 data.")

In [None]:
# This shows a disconnect between the 2011-2016 data and the 2021 data, which could be explained by:
# 1 - different methodologies between www.ers.usda.gov and Yelp for defining fast food restaurants, and/or missing Yelp reviews
# 2 - the Yelp API rate limit
# 3 - COVID might have caused fast food restaurants to shut down or not open at the same rate

In [None]:
# TODO: add a bar chart of diabetes by year
# x-axis is Year, y-axis is % Diabetic

In [None]:
# Line chart of the total diabetic population for each year
fig, ax = plt.subplots()
ax.plot(years, diabetic_population)
ax.grid()

ax.set_title("Total Illinois Diabetic Population by Year")
ax.set_xlabel("Year")
ax.set_ylabel("Diabetic Populations")



In [None]:
# Add fast food restaurant density for every county/year row.
# NOTE: despite the column name, the value is restaurants per 1,000 residents
# (Population is divided by 1000). The name is kept because later cells select it.
# .assign() returns a new frame instead of writing a column into a subset of the
# raw data, which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
diabetes_fastfood_df = diabetes_fastfood_df.assign(
    **{
        "Restaurant per Capita": (
            diabetes_fastfood_df["Restaurant Count"]
            / (diabetes_fastfood_df["Population"] / 1000)
        )
    }
)
diabetes_fastfood_df.head()

In [None]:
# One scatter plot per year: fast food restaurants per 1,000 residents on x,
# % diabetic on y.
# The years are read from the data itself so this cell does not rely on the `years`
# list built for the bar charts (a name that other plotting cells also reassign).
for year in sorted(diabetes_fastfood_df["Year"].dropna().unique()):
    year_data = diabetes_fastfood_df.loc[diabetes_fastfood_df["Year"] == year]

    fig, ax = plt.subplots()
    ax.scatter(year_data["Restaurant per Capita"], year_data["% Diabetic"])
    ax.set_title(f"% Diabetic vs. Restaurant per Capita ({year})")
    ax.set_xlabel("Restaurant per Capita")
    ax.set_ylabel("% Diabetic")
    plt.show()
    # Release the figure once shown so repeated runs do not accumulate open figures
    plt.close(fig)


In [None]:
# Linear regression across all years: restaurants per 1,000 residents (x) against
# % diabetic (y).
# Rows missing either value are dropped first; a NaN input would otherwise
# propagate into the fitted slope, intercept and r-value.
regression_data = diabetes_fastfood_df[['Restaurant per Capita', '% Diabetic']].dropna()
x_values = regression_data['Restaurant per Capita']
y_values = regression_data['% Diabetic']

(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept

# Show a negative intercept as "- b" rather than "+ -b"
intercept_sign = "+" if intercept >= 0 else "-"
line_eq = f"y = {round(slope, 2)}x {intercept_sign} {abs(round(intercept, 2))}"

fig, ax = plt.subplots()
ax.scatter(x_values, y_values)
ax.plot(x_values, regress_values, "r-")
# Position the equation in axes-fraction coordinates so it stays inside the plot
# whatever the data range is (fixed data coordinates can fall outside the axes).
ax.annotate(line_eq, (0.05, 0.9), xycoords="axes fraction", fontsize=15, color="red")
ax.set_xlabel('Restaurant per Capita')
ax.set_ylabel('% Diabetic')
print(f"The r-squared is: {rvalue**2}")
plt.show()