In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the number-of-deaths-by-risk-factor CSV; the path is relative to the
# notebook's working directory.
csv_path = '../input/worldwide-deaths-by-risk-factors/number-of-deaths-by-risk-factor.csv'
data = pd.read_csv(csv_path)
data

In [None]:
data.shape  # (number of rows, number of columns) of the dataframe

In [None]:
# Preview the first 7 rows of the dataframe.
data.head(7)  # head(n) returns the first n rows

In [None]:
# Likewise, tail(n) returns the last n rows; preview the last 10.

data.tail(10)

In [None]:
# List the feature/column names available in the dataframe.

data.columns

In [None]:
# Inspect the data type stored in each feature/column.

data.info()  # Author's run reported 1 object, 1 int and 29 float columns.

In [None]:
# List the distinct values present in the Entity column
# (unique() returns the values themselves, not how many there are).

data['Entity'].unique()


In [None]:
data['Year'].nunique()  # Number of distinct values in the Year column

In [None]:
# Count the null values in each column.

data.isnull().sum()

In [None]:
# Replace the nulls in these two columns with the column's own mean.
# NOTE(review): the mean is taken over every row of `data`, whatever the Entity;
# confirm that a single global mean is the intended imputation.
for col in ('High total cholesterol', 'Outdoor air pollution'):
    data[col] = data[col].fillna(value=data[col].mean())

In [None]:
# Re-check the per-column null counts after the fill above.

data.isnull().sum()

In [None]:
# Per the author's note, the null check above shows no remaining null values.

# Keep an independent copy of the cleaned data under a new name.

clean_data = data.copy()

In [None]:
clean_data.head()  # preview the first rows of the cleaned copy

In [None]:
clean_data.shape  # (number of rows, number of columns) of the cleaned copy

In [None]:
# Summary statistics of the cleaned data via describe().

clean_data.describe()

In [None]:
# There are many columns, so transpose the summary to get one row per column.
clean_data.describe().transpose()

# Data Visualization

### INDIA - Deaths Analysis

In [None]:
# Restrict the cleaned data to the rows whose Entity is 'India'.

is_india = clean_data['Entity'].eq('India')
india = clean_data.loc[is_india]
india

In [None]:
# Every column except the two identifier columns is treated as a risk factor.

identifier_columns = ['Entity', 'Year']
risk_factors = [column for column in india.columns if column not in identifier_columns]
risk_factors

In [None]:
# Mean deaths over India's rows for each risk factor, in `risk_factors` order.
average_deaths = [india[rf].mean() for rf in risk_factors]
average_deaths

In [None]:
# Pair each risk factor with its average deaths, then rank from highest to lowest.
risk_death_pairs = list(zip(risk_factors, average_deaths))
df = pd.DataFrame(risk_death_pairs, columns=['Risk Factor', 'Avg. Deaths'])
df = df.sort_values(by='Avg. Deaths', ascending=False)

In [None]:
# Plot the average deaths attributed to each risk factor in India.

# The x-axis label promises lakhs (1 lakh = 100,000), but 'Avg. Deaths' holds
# plain death counts, so convert before plotting. `df` itself is left untouched.
df_in_lakhs = df.assign(**{'Avg. Deaths': df['Avg. Deaths'] / 100_000})

plt.figure(figsize=(8,8),dpi=300)
sns.barplot(y='Risk Factor',x='Avg. Deaths',data=df_in_lakhs)
plt.title('Avg. Deaths due to Each Risk Factor - INDIA')
plt.xlabel('Avg. Deaths (in. lakhs)')
plt.show()

'''
Observations : 


--> From the above chart, the top 5 risk factors for deaths in India are
Air pollution, High systolic blood pressure, Smoking, Unsafe water source and High fasting plasma glucose.

--> From the chart we can say least deaths are due to Discontinued breastfeeding

--> Highest due to Air pollution,High systolic blood pressure,Smoking

'''

In [None]:
# Yearly deaths in India attributed to air pollution.

# The y-axis label is in lakhs (1 lakh = 100,000) while the column stores plain
# counts, so plot a converted copy; `india` itself is not modified.
air_pollution_lakhs = india.assign(**{'Air pollution': india['Air pollution'] / 100_000})

plt.figure(figsize=(10,8))
sns.barplot(x='Year',y='Air pollution',data=air_pollution_lakhs,palette='viridis')
plt.xticks(rotation=90)
plt.title('Deaths due to Air pollution - INDIA')
# Same window as the former 800000-1250000 raw-count limits, expressed in lakhs.
# The axis does not start at zero, so bar heights exaggerate year-to-year change.
plt.ylim(8,12.5)
plt.ylabel('Deaths by Air pollution (in lakhs.)')
plt.show()

Observations:  

---> From the above chart, deaths due to air pollution in India were highest in 2017 and lowest in 2004.


In [None]:
# Pie chart of India's smoking deaths, one slice per row (year) of `india`.
plt.figure(figsize=(10,10),tight_layout=True)

# plt.pie needs exactly one explode offset per slice, so size the list from the
# data rather than hard-coding 28 entries. Only the final row's slice is pulled out.
explode = [0] * len(india)
if explode:
    explode[-1] = 1

plt.pie(india['Smoking'],labels=india.Year,autopct='%.2f%%',pctdistance=0.9,explode=explode)
plt.title('Percentage of Deaths by Smoking - INDIA')
plt.show()

Observations:

---> From the above chart, deaths due to smoking in India were highest in 2017 and lowest in 1990.


In [None]:
# Yearly deaths in India for three risk factors, drawn as lines on one chart.
plt.figure(figsize=(10,8))

# (column, legend label) pairs. Each label now names the column actually drawn;
# previously the Smoking and Air pollution labels were swapped.
series_to_plot = [
    ('Smoking', 'Smoking'),
    ('Air pollution', 'Air pollution'),
    ('High systolic blood pressure', 'HS-blood pressure'),
]
for column, legend_label in series_to_plot:
    # The y-axis label is in lakhs (1 lakh = 100,000); the columns hold plain counts.
    plt.plot(india.Year, india[column] / 100_000, label=legend_label)

plt.legend()
plt.xlabel('Year')
plt.ylabel('Deaths  (in. lakhs)')
plt.title('Deaths each Year in India')
plt.show()

# 'World' Death Analysis

In [None]:
# Restrict the cleaned data to the rows whose Entity is 'World'.

is_world = clean_data['Entity'].eq('World')
world = clean_data.loc[is_world]
world

In [None]:
# Draw one bar chart of yearly world deaths per risk factor.

# Take the column list from `world`, the frame being plotted in this cell.
risk_factors = [rf for rf in world.columns if rf not in ['Entity','Year']]

# Size the grid from the number of risk factors instead of hard-coding 10 rows,
# so plt.subplot never receives an index beyond the grid.
n_cols = 3
n_rows = max(1, -(-len(risk_factors) // n_cols))  # ceiling division, at least one row

# 5 inches of height per row; this is the former 18x50 figure when there are 10 rows.
plt.figure(figsize=(18, 5 * n_rows))

for index,rf in enumerate(risk_factors):
    plt.subplot(n_rows,n_cols,index+1)
    plt.bar(world['Year'],world[rf],label=rf)
    plt.title(rf)
    plt.ylabel('Deaths')
    plt.xlabel('Year')

plt.show()

In [None]:
# Comparison of average deaths, India vs. World: keep only those two entities.

entities_to_compare = ['India', 'World']
india_world = clean_data[clean_data['Entity'].isin(entities_to_compare)]
india_world

In [None]:
def _average_deaths(frame, entity, columns):
    """Return the mean of each column in `columns` over the rows of `frame`
    whose 'Entity' equals `entity`, as a list in the same order as `columns`."""
    entity_rows = frame[frame['Entity'] == entity]
    return [entity_rows[column].mean() for column in columns]


def _ranked_frame(factors, averages):
    """Pair risk factors with their averages and sort from highest to lowest."""
    paired = pd.DataFrame({'Risk Factor': factors, 'Avg. Deaths': averages})
    return paired.sort_values(by='Avg. Deaths', ascending=False)


# Average deaths per risk factor for each entity. This no longer rebinds the
# notebook-level name `df`, which an earlier cell uses for India's ranked
# averages and which the previous loop overwrote with a raw entity slice.
ad_india = _average_deaths(india_world, 'India', risk_factors)
ad_world = _average_deaths(india_world, 'World', risk_factors)

df1 = _ranked_frame(risk_factors, ad_india)  # India, highest average first
df2 = _ranked_frame(risk_factors, ad_world)  # World, highest average first

In [None]:
# Side-by-side horizontal bar charts of average deaths per risk factor,
# India on the left and World on the right, sharing one y-axis.

# df1 and df2 are each sorted by their own values, so their risk factors come in
# different orders. With a shared y-axis both panels must use the same category
# order, otherwise one panel's bars sit next to the other panel's tick labels.
# India's ranking is used for both.
factor_order = df1['Risk Factor'].tolist()

# Axis labels are in lakhs (1 lakh = 100,000) while 'Avg. Deaths' holds plain
# counts, so plot converted copies and leave df1/df2 untouched.
india_lakhs = df1.assign(**{'Avg. Deaths': df1['Avg. Deaths'] / 100_000})
world_lakhs = df2.assign(**{'Avg. Deaths': df2['Avg. Deaths'] / 100_000})

fig,axes = plt.subplots(1,2,sharey=True,figsize=(12,6),tight_layout=True,dpi=300)
fig.suptitle('Comparision of Average Deaths in INDIA v/s World')
sns.barplot(ax=axes[0],y='Risk Factor',x='Avg. Deaths',data=india_lakhs,order=factor_order,palette='viridis')
axes[0].set_title('Avg. Deaths in INDIA')
axes[0].set_xlabel('Deaths  (in lakhs.)')
sns.barplot(ax=axes[1],y='Risk Factor',x='Avg. Deaths',data=world_lakhs,order=factor_order,palette='viridis')
axes[1].set_title('Avg. Deaths in WORLD')
axes[1].set_xlabel('Deaths  (in lakhs.)')
axes[1].set_ylabel('')
plt.show()

In [None]:
# Pie charts built from the first 15 rows of df1 (India, top) and df2 (World, bottom).
# The percentages are shares within those 15 rows only.
fig,axes = plt.subplots(2,1,figsize=(15,15),tight_layout=True)

panels = [
    (df1, 'Average %Deaths in INDIA'),
    (df2, 'Average %Deaths in World'),
]
for ax, (ranked, panel_title) in zip(axes, panels):
    first_15 = ranked.iloc[:15]
    ax.pie(first_15['Avg. Deaths'], labels=first_15['Risk Factor'], autopct='%.1f%%', pctdistance=0.8)
    ax.set_title(panel_title)

plt.show()