In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Overview

**Question to answer:** 
Is there a relationship between the meat consumption and food borne diseases in the U.S.?

**Data analysis:**
The data will be cleaned and then analyzed with a correlation test; comparisons and visualizations will also be used to look for any other interesting results.

# Data Profiles

1. [Worldwide Meat Consumption](https://www.kaggle.com/vagifa/meatconsumption)
* Description: Worldwide consumption of different meats by country from 1991
* Data sourced from OECD.org
* To be extracted: U.S. meat consumption numbers.
* Usage license: Database: Open Database
* Dataset: data on locations and states will not be used. The analysis will focus on the source, type, and number of illnesses

2. [Foodborne Disease Outbreaks, 1998-2015](https://www.kaggle.com/cdc/foodborne-diseases/metadata)
* Description: What contaminant has caused the most hospitalizations and fatalities?
* Data sourced from CDC
* To be extracted: number of Foodborne disease outbreaks by year and type
* Usage license: CC0: Public Domain
* Data about non-U.S. countries will be excluded. Analysis will focus on using meat type and time (year).

# Analysis

In [None]:
import csv
import matplotlib.pyplot as plt

# Locations of the two raw CSV files inside the Kaggle input directory.
meat_csv_path = '../input/meatconsumption/meat_consumption_worldwide.csv'
illness_csv_path = '../input/foodborne-diseases/outbreaks.csv'

# Read each file into its own DataFrame; later cells filter and aggregate these.
meat_df = pd.read_csv(meat_csv_path)
illness_df = pd.read_csv(illness_csv_path)


### Overview of Meat Consumption

In [None]:
# Preview the first rows of the meat-consumption table as loaded from the CSV.
meat_df.head()

In [None]:
# Summary statistics for the meat-consumption table.
meat_df.describe()

### Overview of Food Borne Illnesses

In [None]:
# Preview the first rows of the outbreak table as loaded from the CSV.
illness_df.head()

In [None]:
# Summary statistics for the outbreak table.
illness_df.describe()

### Extract the relevant U.S. data from Meat Consumptions 

In [None]:
# Keep only the rows that are both for the USA and measured as 'THND_TONNE'.
is_usa = meat_df['LOCATION'] == 'USA'
is_thousand_tonnes = meat_df['MEASURE'] == 'THND_TONNE'
meat_df = meat_df.loc[is_usa & is_thousand_tonnes]

# Show the distinct meat subjects and years that remain, followed by the years
# present in the outbreak table, so the two year ranges can be compared.
for column_values in (meat_df["SUBJECT"], meat_df["TIME"], illness_df["Year"]):
    print(column_values.unique())


In [None]:
# Restrict the consumption rows to 1998-2015 (both ends included), the period
# named in the title of the outbreak dataset.
in_study_period = meat_df['TIME'].between(1998, 2015)
meat_df = meat_df.loc[in_study_period]

meat_df

#### Beef Consumption

In [None]:
# Beef rows of the filtered consumption table, ordered by year so that the values
# line up positionally with `year_label` regardless of the row order in the CSV.
beef_df = meat_df.loc[meat_df['SUBJECT'] == 'BEEF'].sort_values('TIME')

# String labels "1998" ... "2015"; later cells reuse this list as their x-axis / index.
year_label = [str(year) for year in range(1998, 2016)]
print(year_label)

# NOTE(review): assumes exactly one BEEF row per year in 1998-2015 — confirm against the data.
beef_consumption = beef_df["Value"].tolist()

plt.plot(year_label, beef_consumption)
plt.title("Beef consumption")
plt.xlabel("Year")
plt.ylabel("Consumption (thousand tonnes)")  # rows were filtered to MEASURE == 'THND_TONNE'
plt.xticks(rotation=45)  # keep the eighteen year labels from overlapping
plt.show()

#### Sheep Consumption

In [None]:
# Sheep rows of the filtered consumption table, ordered by year so that the values
# line up positionally with `year_label` regardless of the row order in the CSV.
sheep_df = meat_df.loc[meat_df['SUBJECT'] == 'SHEEP'].sort_values('TIME')

# NOTE(review): assumes exactly one SHEEP row per year in `year_label` — confirm against the data.
sheep_consumption = sheep_df["Value"].tolist()

plt.plot(year_label, sheep_consumption)
plt.title("Sheep consumption")
plt.xlabel("Year")
plt.ylabel("Consumption (thousand tonnes)")  # rows were filtered to MEASURE == 'THND_TONNE'
plt.xticks(rotation=45)  # keep the year labels from overlapping
plt.show()

#### Pig Consumption

In [None]:
# Pig rows of the filtered consumption table, ordered by year so that the values
# line up positionally with `year_label` regardless of the row order in the CSV.
pig_df = meat_df.loc[meat_df['SUBJECT'] == 'PIG'].sort_values('TIME')

# NOTE(review): assumes exactly one PIG row per year in `year_label` — confirm against the data.
pig_consumption = pig_df["Value"].tolist()

plt.plot(year_label, pig_consumption)
plt.title("Pig consumption")
plt.xlabel("Year")
plt.ylabel("Consumption (thousand tonnes)")  # rows were filtered to MEASURE == 'THND_TONNE'
plt.xticks(rotation=45)  # keep the year labels from overlapping
plt.show()

#### Poultry Consumption

In [None]:
# Poultry rows of the filtered consumption table, ordered by year so that the values
# line up positionally with `year_label` regardless of the row order in the CSV.
poultry_df = meat_df.loc[meat_df['SUBJECT'] == 'POULTRY'].sort_values('TIME')

# NOTE(review): assumes exactly one POULTRY row per year in `year_label` — confirm against the data.
poultry_consumption = poultry_df["Value"].tolist()

plt.plot(year_label, poultry_consumption)
plt.title("Poultry consumption")
plt.xlabel("Year")
plt.ylabel("Consumption (thousand tonnes)")  # rows were filtered to MEASURE == 'THND_TONNE'
plt.xticks(rotation=45)  # keep the year labels from overlapping
plt.show()

#### All Meat Consumption

In [None]:
# One column per meat type and one row per year label, drawn as a multi-line chart.
consumption_by_meat = {
    'beef': beef_consumption,
    'sheep': sheep_consumption,
    'pig': pig_consumption,
    'poultry': poultry_consumption,
}
all_meats_df = pd.DataFrame(data=consumption_by_meat, index=year_label)
lines = all_meats_df.plot(kind='line')

In [None]:
import numpy
# Stack the four per-meat lists as rows and add them column-wise, giving one
# combined consumption value per year label.
per_meat_values = (beef_consumption, sheep_consumption, pig_consumption, poultry_consumption)
all_meat_consumption = numpy.sum(numpy.array(per_meat_values), axis=0)
plt.plot(year_label, all_meat_consumption)

### Food Borne Illness Aggregate

In [None]:
# Count how many rows of the outbreak table fall in each year.
# The previous append-or-increment loop only produced correct counts when the
# years first appeared in ascending, gap-free order starting at 1998; a year
# before 1998 would also have produced a negative list index and silently
# corrupted another year's count. Counting by value removes both dependencies.
rows_per_year = illness_df["Year"].value_counts()

# One count per entry of `year_label`, in the same order; a year with no rows gets 0.
illness_cases = [int(rows_per_year.get(int(year), 0)) for year in year_label]

for year, case_count in zip(year_label, illness_cases):
    print(year + ": " + str(case_count))

In [None]:
# Yearly number of rows in the outbreak table, labelled so the figure stands on its own.
plt.plot(year_label, illness_cases)
plt.title("Food-borne illness records per year")
plt.xlabel("Year")
plt.ylabel("Number of records")
plt.xticks(rotation=45)  # keep the year labels from overlapping
plt.show()
    

# Results

### Comparison with Meat Consumption

In [None]:
# Yearly total meat consumption next to the yearly illness record count.
meat_and_illness = pd.DataFrame({'Meat Consumption': all_meat_consumption,
                                 'Illness Cases': illness_cases}, index=year_label)

# The two columns are in different units (thousand tonnes vs. record counts), so
# the counts are drawn against their own right-hand y-axis instead of sharing
# the left one with the consumption values.
lines = meat_and_illness.plot.line(secondary_y=['Illness Cases'])
lines.set_xlabel("Year")
lines.set_ylabel("Meat consumption (thousand tonnes)")
lines.right_ax.set_ylabel("Illness cases (records per year)")
plt.show()

In [None]:
# Scatter of yearly total meat consumption (x) against yearly illness record count (y).
plt.plot(all_meat_consumption, illness_cases, 'o')

# Least-squares straight line through the points: a degree-1 polynomial fit.
# Uses the `np` alias imported in the first cell, so this cell does not depend
# on a separate `import numpy` having been run earlier.
z = np.polyfit(all_meat_consumption, illness_cases, 1)
p = np.poly1d(z)
plt.plot(all_meat_consumption, p(all_meat_consumption), "r--")

plt.title("Illness records vs. total meat consumption")
plt.xlabel("Total meat consumption (thousand tonnes)")
plt.ylabel("Illness records per year")
plt.show()

# print "y=%.6fx+(%.6f)"%(z[0],z[1])

# plt.scatter(all_meat_consumption, illness_cases)
# linear_regressor = LinearRegression()
# linear_regressor.fit(all_meat_consumption.reshape(-1, 1), illness_cases.reshape(-1, 1))
# illness_cases_pred = linear_regressor.predict(all_meat_consumption.reshape(-1, 1))
# plt.plot(all_meat_consumption, illness_cases_pred, color='red')

### Correlation Test

In [None]:
# Convert both yearly series to NumPy arrays, then compute their correlation matrix.
all_meat_consumption_arr, illness_cases_arr = (
    np.array(yearly_values) for yearly_values in (all_meat_consumption, illness_cases)
)

# `r` is the 2x2 correlation matrix; the off-diagonal entry r[0, 1] is the
# Pearson coefficient between the two series.
r = np.corrcoef(x=all_meat_consumption_arr, y=illness_cases_arr)
r

# Conclusion

From the visualizations and the correlation test, we can see a moderate negative correlation between the amount of meat consumption and food-borne illnesses, with a Pearson's correlation coefficient of -0.45; that is, years with higher meat consumption tended to have fewer recorded cases. However, the food-borne illness data only provides a general number of how many people got a food-borne illness, and for most of the data the cause is unknown. Hence, it can only be said that there is a moderate correlation, without much reason to believe that it reflects causation.

### Future Work

Because of the lack of specificity in the causes of the food-borne illnesses, future work can look at other data sets to see whether these cases are linked primarily to meat consumption or to the consumption of other foods. Furthermore, it would be of interest to look at the per-state numbers of food-borne illness causes and compare them to the amount of centralized factory farming, which has been known to raise food safety concerns for both meat and vegetables.