# Project 1
---

### Analysis

- Here is where the analysis will be.

## Setup
--------

In [1]:
# Dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as st

In [2]:
#Import master data file
master_data_df = pd.read_csv("data.csv")
master_data_df.head(1)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,Unnamed: 60
0,Arab World,ARB,% of females ages 15-49 having comprehensive c...,SH.HIV.KNOW.FE.ZS,,,,,,,...,,,,,,,,,,


In [3]:
#Limit years for cleaned data to 2006-2015
cleaned_data_df = master_data_df [["Country Name", "Indicator Name", "2006", "2007", "2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015"]]

#Limit countries for cleaned data to 10 largest in the world
cleaned_data_df = cleaned_data_df .loc[(cleaned_data_df ["Country Name"] == "China") |
                     (cleaned_data_df ["Country Name"] == "India") |
                     (cleaned_data_df ["Country Name"] == "United States") |
                     (cleaned_data_df ["Country Name"] == "Indonesia") |
                     (cleaned_data_df ["Country Name"] == "Brazil") |
                     (cleaned_data_df ["Country Name"] == "Pakistan") |
                     (cleaned_data_df ["Country Name"] == "Nigeria") |
                     (cleaned_data_df ["Country Name"] == "Bangladesh") |
                     (cleaned_data_df ["Country Name"] == "Russia") |
                     (cleaned_data_df ["Country Name"] == "Mexico")
                     , :]

#Limit indicators to those relevant for planned analysis
cleaned_data_df = cleaned_data_df .loc[(cleaned_data_df ["Indicator Name"] == "Mortality rate, adult, male (per 1,000 male adults)") |
                     (cleaned_data_df ["Indicator Name"] == "Mortality rate, adult, female (per 1,000 female adults)") |
                     (cleaned_data_df ["Indicator Name"] == "Population, total") |
                     (cleaned_data_df ["Indicator Name"] == "Population, male") |
                     (cleaned_data_df ["Indicator Name"] == "Population, female") |
                     (cleaned_data_df ["Indicator Name"] == "Urban population") |
                     (cleaned_data_df ["Indicator Name"] == "Rural population") |
                     (cleaned_data_df ["Indicator Name"] == "Physicians (per 1,000 people)") |
                     (cleaned_data_df ["Indicator Name"] == "Prevalence of overweight (% of adults)") |
                     (cleaned_data_df ["Indicator Name"] == "Prevalence of overweight, male (% of male adults)") |
                     (cleaned_data_df ["Indicator Name"] == "Prevalence of overweight, female (% of female adults)")
                     , :]

#Display cleaned dataframe
cleaned_data_df.head(1)

Unnamed: 0,Country Name,Indicator Name,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
19505,Bangladesh,"Mortality rate, adult, female (per 1,000 femal...",135.856,131.575,127.94,124.306,120.672,117.038,113.403,110.296,107.188,


In [4]:
cleaned_data_df = cleaned_data_df.reset_index(drop=True)
cleaned_data_df.index.name = "Index"
cleaned_data_df.head(1)

Unnamed: 0_level_0,Country Name,Indicator Name,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,Bangladesh,"Mortality rate, adult, female (per 1,000 femal...",135.856,131.575,127.94,124.306,120.672,117.038,113.403,110.296,107.188,


In [5]:
column_averages = cleaned_data_df[["2006","2007","2008","2009","2010","2011","2012","2013","2014","2015"]].mean(axis = 1, skipna = True)
column_averages_df = pd.DataFrame(column_averages)
column_averages_df=column_averages_df.rename(columns = {0:"2006-2015 AVG"})
column_averages_df.index.name = "Index"
column_averages_df.head(1)

Unnamed: 0_level_0,2006-2015 AVG
Index,Unnamed: 1_level_1
0,120.919333


In [6]:
cleaned_data_df = pd.merge(cleaned_data_df,column_averages_df, on = "Index", how = "inner")
cleaned_data_df.head(1)

Unnamed: 0_level_0,Country Name,Indicator Name,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2006-2015 AVG
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,Bangladesh,"Mortality rate, adult, female (per 1,000 femal...",135.856,131.575,127.94,124.306,120.672,117.038,113.403,110.296,107.188,,120.919333


In [7]:
cleaned_data_df = cleaned_data_df [["Country Name","Indicator Name", "2006-2015 AVG"]]
cleaned_data_df.head(1)

Unnamed: 0_level_0,Country Name,Indicator Name,2006-2015 AVG
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Bangladesh,"Mortality rate, adult, female (per 1,000 femal...",120.919333


In [8]:
# Calculate Total Average Mortality Rates
countries = cleaned_data_df["Country Name"].unique().astype(str)
country_list = []
total_avg_mort = []
for country in countries:

    country_df = cleaned_data_df.loc[cleaned_data_df["Country Name"] == country, :]
    total_mort = country_df.loc[(country_df["Indicator Name"] == "Mortality rate, adult, male (per 1,000 male adults)") | (country_df["Indicator Name"] == "Mortality rate, adult, female (per 1,000 female adults)") , "2006-2015 AVG"].sum()
    
    country_list.append(country)
    total_avg_mort.append(total_mort)


In [9]:
# Create Dataframe for Total Average Mortality Rates
total_avg_mort_df = pd.DataFrame({
    "Country Name" : country_list,
    "Indicator Name" : "Mortality Rate, Total",
    "2006-2015 AVG" : total_avg_mort,
    })
total_avg_mort_df.head(1)

Unnamed: 0,Country Name,Indicator Name,2006-2015 AVG
0,Bangladesh,"Mortality Rate, Total",280.015889


In [10]:
# Combine cleaned_data_df and total_avg_mort_df ?
cleaned_data_df = cleaned_data_df.append(total_avg_mort_df,ignore_index=True)
cleaned_data_df.head(1)

Unnamed: 0,Country Name,Indicator Name,2006-2015 AVG
0,Bangladesh,"Mortality rate, adult, female (per 1,000 femal...",120.919333


In [11]:
# Sort data
cleaned_data_df = cleaned_data_df.sort_values(["Country Name", "Indicator Name"], ascending =True)
cleaned_data_df = cleaned_data_df.reset_index(drop=True)
cleaned_data_df

Unnamed: 0,Country Name,Indicator Name,2006-2015 AVG
0,Bangladesh,"Mortality Rate, Total",2.800159e+02
1,Bangladesh,"Mortality rate, adult, female (per 1,000 femal...",1.209193e+02
2,Bangladesh,"Mortality rate, adult, male (per 1,000 male ad...",1.590966e+02
3,Bangladesh,"Physicians (per 1,000 people)",3.137500e-01
4,Bangladesh,"Population, female",7.543719e+07
...,...,...,...
103,United States,Prevalence of overweight (% of adults),6.640000e+01
104,United States,"Prevalence of overweight, female (% of female ...",6.170000e+01
105,United States,"Prevalence of overweight, male (% of male adults)",7.120000e+01
106,United States,Rural population,5.935318e+07


In [12]:
# Export cleaned data to CSV
cleaned_data_df.to_csv("cleaned_data.csv", index=False)

In [13]:
# Read cleaned data csv
final_cleaned_df = pd.read_csv("cleaned_data.csv")
final_cleaned_df

Unnamed: 0,Country Name,Indicator Name,2006-2015 AVG
0,Bangladesh,"Mortality Rate, Total",2.800159e+02
1,Bangladesh,"Mortality rate, adult, female (per 1,000 femal...",1.209193e+02
2,Bangladesh,"Mortality rate, adult, male (per 1,000 male ad...",1.590966e+02
3,Bangladesh,"Physicians (per 1,000 people)",3.137500e-01
4,Bangladesh,"Population, female",7.543719e+07
...,...,...,...
103,United States,Prevalence of overweight (% of adults),6.640000e+01
104,United States,"Prevalence of overweight, female (% of female ...",6.170000e+01
105,United States,"Prevalence of overweight, male (% of male adults)",7.120000e+01
106,United States,Rural population,5.935318e+07


## Analysis
--------

#### Mortality Rate and Sex
--------

In [14]:
# Create bar chart with mortality rate as y-axis and each of the sexes as a bar across all nations

#### Mortality Rate and Urbanization
--------

In [15]:
# Create bar chart with mortality rate as y-axis and urban and rural populations as bars across all nations

In [22]:
# Create dataframe to run visualization on
urbanization_df = final_cleaned_df.loc[(final_cleaned_df["Indicator Name"] == "Mortality Rate, Total") |
(final_cleaned_df["Indicator Name"] == "Rural population") |
(final_cleaned_df["Indicator Name"] == "Urban population")
,:]
urbanization_df

Unnamed: 0,Country Name,Indicator Name,2006-2015 AVG
0,Bangladesh,"Mortality Rate, Total",280.0159
10,Bangladesh,Rural population,105464300.0
11,Bangladesh,Urban population,47245800.0
12,Brazil,"Mortality Rate, Total",308.0899
22,Brazil,Rural population,30965870.0
23,Brazil,Urban population,168501300.0
24,China,"Mortality Rate, Total",182.6766
34,China,Rural population,671983300.0
35,China,Urban population,669038700.0
36,India,"Mortality Rate, Total",386.0592


#### Mortality Rate and Access to Physicians
--------

In [17]:
# Create scatterplot with mortality rate as y-axis and pysicians (per 1000 people) as the x-axis across all nations

#### Mortality Rate and Obesity
--------

In [18]:
# Create scatterplot with mortality rate as y-axis and prevalence of overweight (% of adults) as the x-axis across all nations