In [13]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from pathlib import Path

In [14]:
# Locations of the input CSV files, all relative to the shared "data" directory
data_dir = Path("data")
GDP_data_to_load = data_dir / "GDP.csv"
particle_data_to_load = data_dir / "part-con.csv"
lung_data_to_load = data_dir / "dataset-cumulative-risk-in-percentage-inc-males-and-females-age-0-74-lung.csv"
melanoma_data_to_load = data_dir / "dataset-cumulative-risk-in-percentage-inc-males-and-females-age-0-74-2000-2018-melanoma-of-skin.csv"


In [18]:
# Load the particle data, keep only the columns of interest, and give them readable names.
# Maps each kept source column to the label used in the rest of the notebook.
# NOTE(review): "Low Concetration" is misspelled, but it is kept as-is because it is
# the column label later cells would have to reference.
particle_column_names = {
    "Location": "Country",
    "Period": "Year",
    "Dim1": "Area Type",
    "FactValueNumeric": "Average Weighted Concentration",
    "FactValueNumericLow": "Low Concetration",
    "FactValueNumericHigh": "High Concentration",
}
particle_keep_columns = list(particle_column_names)

particle_data = (
    pd.read_csv(particle_data_to_load)[particle_keep_columns]
    .rename(columns=particle_column_names)
)

In [19]:
# Clean GDP_data
GDP_data = pd.read_csv(GDP_data_to_load)
# Drop empty columns and columns with repetitious, uninformative values
gdp_drop_columns = ["GDP", "Scale", "Base Year"]
GDP_data = GDP_data.drop(columns=gdp_drop_columns)

years = ["2017", "2018", "2019", "2020", "2021", "2022"]


def _gdp_to_number(column):
    """Convert one year column to floats; the "..." placeholder becomes NaN.

    Values are handled as text, so the column may already be numeric or may
    hold strings with thousands separators such as "1,234.5".
    """
    as_text = column.astype(str).str.replace(",", "", regex=False)
    return pd.to_numeric(as_text.mask(as_text == "..."))


# Average only the years that have data, rounded to 2 decimals.
# The result is assigned as a whole column instead of writing to the rows yielded
# by iterrows(): those rows can be copies, in which case the writes are silently lost.
# A country with no data in any year gets NaN rather than raising ZeroDivisionError.
# The year columns themselves are left exactly as they were read.
GDP_data["Average GDP"] = GDP_data[years].apply(_gdp_to_number).mean(axis=1).round(2)


In [61]:
# Make function for cleaning cancer data
def clean_cancer_data(df):
    """Return a tidied copy of a raw cancer data frame.

    The identifier/label columns "Cancer id", "Cancer label", "Population id"
    and "Type" are removed, "Country label" is renamed to "Country", the
    numeric Sex codes 1 and 2 become "Male" and "Female", and the country
    entry "Kuwait: Kuwaiti" is shortened to "Kuwait".

    Args:
        df: DataFrame containing the columns listed above plus "Sex".

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    unused_columns = ["Cancer id", "Cancer label", "Population id", "Type"]
    tidy = df.drop(columns=unused_columns).rename(columns={"Country label": "Country"})

    # Recode the numeric sex codes one at a time: 1 -> "Male", then 2 -> "Female"
    sex_labels = tidy["Sex"].replace(1, "Male")
    sex_labels = sex_labels.replace(2, "Female")
    tidy["Sex"] = sex_labels

    tidy["Country"] = tidy["Country"].replace("Kuwait: Kuwaiti", "Kuwait")
    return tidy

# Read the raw lung and melanoma CSVs and pass each one through clean_cancer_data
lung_data = clean_cancer_data(pd.read_csv(lung_data_to_load))
melanoma_data = clean_cancer_data(pd.read_csv(melanoma_data_to_load))

In [74]:
def match_country_names(particle_df, gdp_df, cancer_country, particle_country, gdp_country):
    """Rename one country in the particle and GDP frames to the cancer-data spelling.

    Both frames are updated in place (their "Country" column is reassigned);
    nothing is returned.

    Args:
        particle_df: particle DataFrame with a "Country" column.
        gdp_df: GDP DataFrame with a "Country" column.
        cancer_country: replacement name written into both frames.
        particle_country: name to look for in particle_df.
        gdp_country: name to look for in gdp_df.
    """
    for frame, current_name in ((particle_df, particle_country), (gdp_df, gdp_country)):
        frame["Country"] = frame["Country"].replace(current_name, cancer_country)

# Countries spelled differently across the sources. Each entry is
# (name to use, name in the particle data, name in the GDP data).
country_aliases = [
    ("USA", "United States of America", "United States"),
    ("Republic of Korea", "Republic of Korea", "Korea, Rep. of"),
    ("China", "China", "China, P.R.: Mainland"),
    ("Belarus", "Belarus", "Belarus, Rep. of"),
    ("Turkey", "Türkiye", "Türkiye, Rep of"),
]
for cancer_name, particle_name, gdp_name in country_aliases:
    match_country_names(particle_data, GDP_data, cancer_name, particle_name, gdp_name)

In [21]:
# Preview the first rows of the cleaned particle data.
# NOTE(review): this cell's execution count (21) is lower than the preceding cell's (74),
# so the saved output may predate the country-name changes; re-run top to bottom to confirm.
particle_data.head()

Unnamed: 0,Country,Year,Area Type,Average Weighted Concentration,Low Concetration,High Concentration
0,Kenya,2019,Cities,10.01,6.29,13.74
1,Trinidad and Tobago,2019,Rural,10.02,7.44,12.55
2,United Kingdom of Great Britain and Northern I...,2019,Cities,10.06,9.73,10.39
3,Grenada,2019,Total,10.08,7.07,13.2
4,Brazil,2019,Towns,10.09,8.23,12.46


In [22]:
# Preview the first rows of the GDP data, including the computed "Average GDP" column.
# NOTE(review): this cell's execution count (22) is lower than that of the country-name
# cell above (74), so the saved output may be stale; re-run top to bottom to confirm.
GDP_data.head()

Unnamed: 0,Country,2017,2018,2019,2020,2021,2022,Average GDP
0,"Afghanistan, Islamic Rep. of",1285460.00,1327690.0,1469600.0,1547290.0,1232858.3,...,1372579.66
1,Albania,1550645.49,1636731.32,1691903.43,1647431.07,1856172.26,2134463.55,1752891.19
2,Algeria,...,20393524.41,20500200.0,18477000.0,22079300.0,...,20362506.1
3,"Andorra, Principality of",2655.76,2725.27,2818.42,2531.09,2815.42,...,2709.19
4,Angola,20262300.00,25627742.12,30330429.4,33063020.24,46696955.73,...,31196089.5


In [41]:
# Preview the first rows of the cleaned lung cancer data.
# NOTE(review): this cell's execution count (41) is lower than that of the cell that
# builds lung_data (61), so the saved output may be stale; re-run top to bottom to confirm.
lung_data.head()

Unnamed: 0,Country,Sex,Year,ASR (World),Crude rate,Cumulative risk,Total
0,Belarus,Male,2000,59.877021,75.817569,8.360745,3484
1,Belarus,Male,2001,59.234048,74.673718,8.204052,3416
2,Belarus,Male,2002,57.087689,72.084418,7.935269,3278
3,Belarus,Male,2003,58.440589,73.686095,8.17829,3310
4,Belarus,Male,2004,58.032474,72.485494,8.145855,3230


In [42]:
# Preview the first rows of the cleaned melanoma data.
# NOTE(review): this cell's execution count (42) is lower than that of the cell that
# builds melanoma_data (61), so the saved output may be stale; re-run top to bottom to confirm.
melanoma_data.head()

Unnamed: 0,Country,Sex,Year,ASR (World),Crude rate,Cumulative risk,Total
0,Belarus,Male,2000,2.569562,3.1772,0.330698,146
1,Belarus,Male,2001,2.582797,3.300858,0.320393,151
2,Belarus,Male,2002,2.609352,3.320545,0.319802,151
3,Belarus,Male,2003,2.699419,3.316987,0.324824,149
4,Belarus,Male,2004,3.623803,4.510707,0.424321,201


In [78]:
# Merge all the dataframes

# Unused alternative kept for reference: the sorted years present in both the
# lung data and the particle data.
# particle_years = list(particle_data["Year"].unique())
# lung_years = list(lung_data["Year"].unique())
# overlap_years = list(set(lung_years) & set(particle_years))
# overlap_years.sort()

# Step 1: attach particle readings to each cancer record by country and year.
# dropna() then removes every row with a missing value, which includes cancer
# records that found no matching particle reading.
pollution_keys = ["Country", "Year"]
lung_pollution_data = pd.merge(lung_data, particle_data, how="left", on=pollution_keys)
lung_pollution_data = lung_pollution_data.dropna()
melanoma_pollution_data = pd.merge(melanoma_data, particle_data, how="left", on=pollution_keys)
melanoma_pollution_data = melanoma_pollution_data.dropna()

# Step 2: attach the per-country GDP columns and again drop incomplete rows.
lung_full_data = pd.merge(lung_pollution_data, GDP_data, how="left", on="Country")
lung_full_data = lung_full_data.dropna()
melanoma_full_data = pd.merge(melanoma_pollution_data, GDP_data, how="left", on="Country")
melanoma_full_data = melanoma_full_data.dropna()

In [None]:
# Find the countries that have data in all files (e.g., US, Germany, Norway) and rank them from highest to lowest GDP

#create a bar chart


In [None]:
# Find the countries with the highest pollution indices

#create a bar chart


In [None]:
# Create a scatter plot with GDP (Y) and pollution level (X)

# run a regression line

In [None]:
# Find the number of lung cancer victims for our Countries sample

# Create a bar chart that reflects the number of lung cancer victims per country and their GDP and pollution level

In [None]:
# Find the number of melanoma cancer victims for our Countries sample

# Create a bar chart

# Create a bar chart that reflects the number of melanoma victims per country and their GDP and pollution level
