In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### As the world’s population has expanded and gotten richer, the demand for food, energy and water has seen a rapid increase. Not only has demand for all three increased, but they are also strongly interlinked: food production requires water and energy; traditional energy production demands water resources; agriculture provides a potential energy source. This article focuses on the environmental impacts of food.

### Eutrophication – the pollution of water bodies and ecosystems with excess nutrients – is a major environmental problem. The runoff of nitrogen and other nutrients from agricultural production systems is a leading contributor.

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Capture seaborn's current colour palette in a notebook-level variable.
# NOTE(review): no cell visible in this notebook reads `color` — confirm it is still needed.
color = sns.color_palette()

In [None]:
# Load the food-production dataset from the Kaggle input directory.
df = pd.read_csv('../input/environment-impact-of-food-production/Food_Production.csv')
print(df.shape)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
df.info()

In [None]:
# Show the first rows of the raw frame to check the columns loaded as expected.
df.head()

### Exploratory Data Analysis

In [None]:
# Per-column summary: name, number of distinct values, number of missing
# values, and fraction of rows that are missing.
# The fraction is computed from the actual row count instead of a hard-coded
# 43, so it stays correct if the dataset gains or loses rows.
n_rows = len(df)
for col in df.columns:
    n_missing = df[col].isnull().sum()
    print(col, len(df[col].unique()), " ------ ", n_missing, " ------ ", n_missing / n_rows)

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

### Analysis

In [None]:
# Plot total emissions in ascending order; the x position is the rank of each
# value after sorting, not an identifier of a particular food product.
sorted_emissions = np.sort(df["Total_emissions"].to_numpy())
ranks = np.arange(len(sorted_emissions))

fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(ranks, sorted_emissions, s=50)
ax.set_xlabel('Food Product', fontsize=12)
ax.set_ylabel('Total Emissions', fontsize=12)
plt.show()

**The food products at sorted positions 30 to 40 (the x-axis is the rank after sorting by emissions, not a product ID) contribute the most to emissions**

In [None]:
# Sum total emissions per food product; the result is a Series indexed by
# product name (despite the `_df` suffix).
emissions_by_product = df['Total_emissions'].groupby(df["Food product"])
food_df = emissions_by_product.sum()
print(type(food_df))
food_df

### Total Emissions by Food Products

In [None]:
import plotly.graph_objects as go
import numpy as np

In [None]:
# List the column labels exactly as loaded (later cells strip the spaces from them).
df.columns

In [None]:
# Product names as a plain Python list, in the same row order as `df`.
product_names = df.loc[:, 'Food product']
food = product_names.tolist()
print(type(food))


In [None]:
# Bubble chart of total emissions: one marker per row of `df`, sized by emissions.

# Strip spaces from the column names so later cells can use compact labels
# such as 'Foodproduct'. Applying this to already-stripped names is a no-op.
df.columns = df.columns.str.replace(' ', '')

# Work with NumPy arrays rather than Python lists: `some_list * 5000` repeats
# the list 5000 times instead of scaling its values, so the original size and
# colour arguments were 5000-fold repeated lists rather than scaled numbers.
emissions = df['Total_emissions'].to_numpy()
positions = np.arange(len(df))          # one x slot per row, aligned with `emissions`
labels = df['Foodproduct'].to_list()    # tick labels in the same row order

# Scale marker area so the largest bubble is about MAX_MARKER_PX across
# (the sizeref formula recommended in Plotly's bubble-chart documentation).
MAX_MARKER_PX = 40
sizeref = 2.0 * np.nanmax(emissions) / (MAX_MARKER_PX ** 2)

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=positions,
    y=emissions,
    mode="markers",
    marker=go.scatter.Marker(
        size=emissions,
        sizemode="area",
        sizeref=sizeref,
        sizemin=3,          # keep the smallest emitters visible
        color=positions,
        opacity=0.6,
        colorscale="Viridis"
    )
))
fig.update_layout(
    xaxis=dict(title_text='Food Product', tickmode='array',
               tickvals=positions, ticktext=labels),
    yaxis=dict(title_text='Total Emissions'))

fig.show()

In [None]:
# Rows ordered by total emissions (ascending), restricted to the first eight
# columns and indexed by product name for the stacked bar chart.
em_df = (
    df.sort_values("Total_emissions")
      .iloc[:, :8]
      .set_index('Foodproduct')
)
print(em_df.shape)
em_df.head()

In [None]:
# Horizontal stacked bars: one bar per food product, one segment per column of em_df.
fig, ax = plt.subplots(figsize=(20, 25))
sns.set()
em_df.plot.barh(stacked=True, ax=ax)
ax.set_xlabel("Contributors to Greenhouse gas Emissions")
plt.show()

## Eutrophying Emissions

In [None]:
# Rows ordered by freshwater withdrawals per kilogram (ascending); keep the
# product label (column 0) plus columns 9-14, then index by product name.
EU_COLUMN_POSITIONS = [0, 9, 10, 11, 12, 13, 14]
eu_df = (
    df.sort_values("Freshwaterwithdrawalsperkilogram(litersperkilogram)")
      .iloc[:, EU_COLUMN_POSITIONS]
      .set_index('Foodproduct')
)
print(eu_df.shape)
eu_df.head()

In [None]:
# Horizontal stacked bars: one bar per food product, one segment per column of eu_df.
fig, ax = plt.subplots(figsize=(20, 25))
sns.set()
eu_df.plot.barh(stacked=True, ax=ax)
ax.set_xlabel("Contributors to Eutrophying Emissions")
plt.show()

#### Handling NAs in the dataset: filling missing values with each column's median

In [None]:
# Fill missing values with the column median.
# - Assign the result back instead of calling fillna(inplace=True) on df[col]:
#   the chained in-place form is deprecated and does not modify df when
#   pandas Copy-on-Write is enabled.
# - Only numeric columns are imputed; a median is undefined for text columns
#   and would raise if one of them contained nulls.
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

df.isnull().sum()

In [None]:
# Pairwise Pearson correlation between the numeric columns.
# The text column (Foodproduct) is excluded explicitly: since pandas 2.0
# DataFrame.corr() no longer drops non-numeric columns silently and raises.
cor = df.select_dtypes(include='number').corr()
cor

In [None]:
# Keep only the strict upper triangle of the correlation matrix (k=1 skips the
# diagonal) so each pair of columns is considered once; other cells become NaN.
# The builtin `bool` replaces `np.bool`, which was removed in NumPy 1.24.
upper_tri = cor.where(np.triu(np.ones(cor.shape), k=1).astype(bool))

##### Select the columns whose absolute correlation with another column exceeds 0.83 (the threshold used in the code below) and collect them in a list named `to_drop`.

In [None]:
# Columns that are highly correlated with an earlier column and therefore redundant.
# The magnitude of the correlation is compared so that strongly negative
# correlations are treated the same as strongly positive ones.
CORR_THRESHOLD = 0.83
to_drop = [
    column
    for column in upper_tri.columns
    if (upper_tri[column].abs() > CORR_THRESHOLD).any()
]
print(to_drop)

In [None]:
# Reduced frame: the original data without the highly correlated columns.
df1 = df.drop(columns=to_drop)
df1.head()

In [None]:
# (rows, columns) of the reduced frame after dropping the correlated columns.
df1.shape

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every column of df1 except the first (the Foodproduct label)
# to zero mean and unit variance.
# fit() only learns the statistics and returns the scaler itself, so the
# original assignment replaced df1 with a StandardScaler object;
# fit_transform() returns the scaled values.
# The result goes into a new frame so df1 is left intact and the cell can be
# re-run safely.
scaler = StandardScaler()
feature_cols = df1.columns[1:]
df1_scaled = pd.DataFrame(
    scaler.fit_transform(df1[feature_cols]),
    columns=feature_cols,
    index=df1.index,
)
df1_scaled.head()


In [None]:
# Display every column of df except the first one (the Foodproduct label).
df.iloc[:,1:]

In [None]:
# Rows ordered by total emissions (ascending); keep the product label
# (column 0) plus the columns at positions 8, 15 and 17, indexed by product name.
EMKC_COLUMN_POSITIONS = [0, 8, 15, 17]
emkc_df = (
    df.sort_values("Total_emissions")
      .iloc[:, EMKC_COLUMN_POSITIONS]
      .set_index('Foodproduct')
)
print(emkc_df.shape)
emkc_df.head()

In [None]:
# Horizontal stacked bars: one bar per food product, one segment per column of emkc_df.
fig, ax = plt.subplots(figsize=(20, 25))
sns.set()
emkc_df.plot.barh(stacked=True, ax=ax)
ax.set_xlabel("Carbon Footprint per 1000 kCal")
plt.show()

In [None]:
import plotly.express as px

# Pie chart with one slice per food product, sized by its
# scarcity-weighted water use per 1000 kcal.
water_use_col = "Scarcity-weightedwateruseper1000kcal(litersper1000kilocalories)"
fig = px.pie(
    data_frame=df,
    names='Foodproduct',
    values=water_use_col,
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.show()

## Eutrophication
**Eutrophication is the excessive richness of nutrients in a lake or other body of water, frequently due to run-off from the land, which causes a dense growth of plant life.**

In [None]:
# Rows ordered by total emissions (ascending); keep the product label
# (column 0) plus the columns at positions 14 and 18.
# Foodproduct stays a regular column (no set_index) because the scatter plot
# that consumes this frame uses it for the colour grouping.
LAND_WATER_COLUMN_POSITIONS = [0, 14, 18]
landvsWater_df = (
    df.sort_values("Total_emissions")
      .iloc[:, LAND_WATER_COLUMN_POSITIONS]
)
print(landvsWater_df.shape)
landvsWater_df.head()

In [None]:
import plotly.express as px

# Freshwater withdrawals (x) against land use (y) per kilogram of product;
# marker size follows the freshwater value and colour identifies the product.
water_col = "Freshwaterwithdrawalsperkilogram(litersperkilogram)"
land_col = "Landuseperkilogram(m²perkilogram)"
fig = px.scatter(
    landvsWater_df,
    x=water_col,
    y=land_col,
    color="Foodproduct",
    size=water_col,
    hover_data=[water_col, land_col],
)
fig.show()