In [1]:
# Plastic Deposit into World's Oceans
# Barbara Ulitsky - September 2nd, 2021

# In this notebook we explore what are the largest sources of plastic deposit into the world's oceans.
# We will show the top 10 rivers where the most plastic enters from into the oceans and the top countries.

In [4]:
# Set up
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

pp = pd.read_csv(os.path.join(dirname, filename))

NameError: name 'dirname' is not defined

In [None]:
chart_color = 'lightsteelblue'

pp.rename(columns = {'Entity': 'River'}, inplace = True)
pp.head()

In [None]:
NumRivers = len(pp['River'].unique())
NumCountries = len(pp['Country'].unique())

print('In our dataset we have', NumRivers, 'rivers in' , NumCountries, 'countries.')

max_rivers_in_countries = pp.groupby('River').count()['Country'].max()
print('Every river belongs only to', max_rivers_in_countries, 'country.\n')

print('The total share of plastic in our dataset adds up to about', '{:2.0f}%'.format(pp['sharePlastics'].sum()),
     ', which could mean that rivers are not the only source \n \
     of plastic pollution in the oceans or that we dont have all the countries, or something else.')

In [None]:
# Top rivers where the most plastic in the oceans originates from and the countries they are in :-(

# input here the number of top rivers you want to see, from 1 to 50
TopXRivers = 10

TopXRivers = min(max(TopXRivers, 1), NumRivers)
WorstRivers = pp.sort_values(by='sharePlastics', ascending=False)
WorstRivers = WorstRivers.head(TopXRivers)

WorstRivers_bar = WorstRivers.sort_values(by='sharePlastics', ascending=True)
fig = plt.figure(figsize=(15,10))
chart_title = 'Top ' + str(TopXRivers) + ' rivers where plastic comes from into the oceans'
plt.title(chart_title, fontsize=16)
plt.xlabel('% share of plastic')
plt.ylabel('River')

plt.barh(WorstRivers_bar['River'], WorstRivers_bar['sharePlastics'], color=chart_color)
plt.show()
print()
print()

WorstRivers['sharePlastics'] = WorstRivers['sharePlastics'].map('{:,.2f}%'.format)
WorstRivers.set_index(keys=['River', 'Country'], inplace=True)
WorstRivers

In [None]:
# Top countries that dump the most plastic into world's oceans :-(

# input here the number of top countries you want to see from 1 to 16
TopXCountries = 10

TopXCountries = min(max(TopXCountries, 1), NumCountries)
WorstCountries = pp.groupby('Country', as_index=False).sum().sort_values(by='sharePlastics', ascending=False)
WorstCountries = WorstCountries.head(TopXCountries)
WorstCountries_bar = WorstCountries.sort_values(by='sharePlastics', ascending=True)

fig = plt.figure(figsize=(15,10))
chart_title = 'Top ' + str(TopXCountries) + ' countries where plastic comes from into the oceans'
plt.title(chart_title, fontsize=16)
plt.xlabel('% share of plastic')
plt.ylabel('Country')

plt.barh(WorstCountries_bar['Country'], WorstCountries_bar['sharePlastics'], color=chart_color)
plt.show()
print()
print()

WorstCountries['sharePlastics'] = WorstCountries['sharePlastics'].map('{:,.2f}%'.format)
WorstCountries.set_index(keys='Country', inplace=True)
WorstCountries

In [None]:
# The number of rivers in each country varies widely.
# For example, the Philippines has 19 rivers, a lot more than other countries and it is the largest source of plastic in oceans.
# In order to represent the countries more fairly, we will calculate the countries share of plastic per river.

NumRivers_per_Country = pp.groupby('Country')['River'].count().sort_values(ascending=False)
bins = list(range(0,21))

fig = plt.figure(figsize=(15,5))
plt.xticks(bins)
plt.yticks(list(range(11)))
plt.title('Histogram of number of rivers flowing through a country')
plt.xlabel('Number of rivers in a country')
plt.ylabel('Number of countries')

plt.hist(NumRivers_per_Country, bins, align='mid', rwidth=0.9, color=chart_color)
plt.show()

In [None]:
# make sure to run this cell only once in a session

WorstCountries = pp.groupby('Country', as_index=False).sum().sort_values(by='sharePlastics', ascending=False)
WorstCountries_per_River = WorstCountries \
                        .join(NumRivers_per_Country, how='inner', on='Country')\
                        .rename(columns={'River': 'NumRivers_per_Country'})

In [None]:
WorstCountries_per_River['sharePlastics_per_River'] = \
WorstCountries_per_River['sharePlastics'] / WorstCountries_per_River['NumRivers_per_Country']
WorstCountries_per_River = WorstCountries_per_River.nlargest(n=TopXCountries, columns=['sharePlastics_per_River'], keep='all')

In [None]:
WorstCountries_per_River_bar = WorstCountries_per_River.sort_values(by='sharePlastics_per_River', ascending=False)
WorstCountries_per_River_bar = WorstCountries_per_River_bar.head(TopXCountries)

WorstCountries_per_River_bar = WorstCountries_per_River_bar.sort_values(by='sharePlastics_per_River', ascending=True)

fig = plt.figure(figsize=(15,10))
chart_title = 'Top ' + str(TopXCountries) + ' Countries where the most plastic comes from into the oceans adjusted per river'
plt.title(chart_title, fontsize=16)
plt.xlabel('% share of plastic per river')
plt.ylabel('Country')

plt.barh(WorstCountries_per_River_bar['Country'], WorstCountries_per_River_bar['sharePlastics_per_River'], color=chart_color)
plt.show()
print()
print()

WorstCountries_per_River.set_index(keys='Country', inplace=True)
WorstCountries_per_River['sharePlastics'] = WorstCountries_per_River['sharePlastics'].map('{:,.2f}%'.format)
WorstCountries_per_River['sharePlastics_per_River'] = WorstCountries_per_River['sharePlastics_per_River'].map('{:,.2f}%'.format)
WorstCountries_per_River

In [None]:
WorstCountries_sharePlastics = list(WorstCountries_bar.sort_values(by='sharePlastics', ascending=False)['Country'])

In [None]:
WorstCountries_sharePlastics_per_River = list(WorstCountries_per_River_bar.sort_values(by='sharePlastics_per_River', ascending=False)['Country'])

In [None]:
print('Now we can see how the countries rankings changed from when we considered an absolute share of countries plastic pollution')
print('vs share of plastic pollution per river.')
print('The worst 3 polluting countries stayed in the worst 3 places using both calculation methods.')
print('Most of the worst top 10 countries stayed in the top 10, though some changed their relative rankings within the top 10.')
print()
print()
Countries_Rankings = pd.DataFrame({'Worst Countries by total share of plastic pollution': WorstCountries_sharePlastics, \
                     'Worst Countries by share of plastic pollution per river': WorstCountries_sharePlastics_per_River})
Countries_Rankings