In [None]:
!pip install matplotlib-venn
!apt-get -qq install -y libfluidsynth1
!pip install plotly_express


# Loading libraries

In [None]:
# Load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib.cm import rainbow
import plotly_express as px

%matplotlib inline

In [None]:
#to split the train test data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# Loading dataset


In [None]:
# Read the disease-burden CSV (its first column becomes the row index)
# and preview the first 20 rows.
csv_path = '../input/disease-burden-by-cause/output.csv'
dataset = pd.read_csv(csv_path, index_col=0)
dataset.head(n=20)

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame.
dataset.info()

In [None]:
# Author's note: the dataset has 27 columns and 8010 rows; 25 of the columns
# are numeric features.
# Show summary statistics for the frame.
dataset.describe()

In [None]:
# Check for missing values: draw the boolean null mask as a heatmap so that
# columns containing nulls stand out.
import seaborn as sns

null_mask = dataset.isnull()
sns.heatmap(null_mask, yticklabels=False, cbar=False, cmap='viridis')

In [None]:
# Count the missing values in each column.
dataset.isna().sum()

# Observation 
* Only the `Code` column has null values.

In [None]:
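#dataset.isna().any()
# NOTE(review): the original comment here said "all the nulls have been fixed",
# but no null handling is visible up to this point; rows with missing values are
# only dropped later, for the 1990/2019 clustering subset.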


# Data visualization

In [None]:
# Study the data: one histogram per column that DataFrame.hist can plot.
sns.set_context('poster', font_scale=0.5)

hist_options = dict(
    bins=25,
    grid=False,
    figsize=(25, 18),
    color='#86bf91',
    zorder=2,
    rwidth=0.9,
)
dataset.hist(**hist_options)
plt.show()


# 'Cardiovascular diseases' world map

In [None]:
# Animated world map (one frame per 'Year') coloured by the
# 'Cardiovascular diseases' column, located by the 'Code' column.
fig = px.choropleth(dataset,locations='Code',color='Cardiovascular diseases',scope='world',color_continuous_scale=px.colors.sequential.GnBu,
                    range_color=(10,50),title='Cardiovascular diseases',height=700,animation_frame ='Year')
fig.show()

# Box plot of the same column. The series is passed explicitly as `x=`:
# seaborn 0.11 treats a positional series as `x` (with a FutureWarning), while
# seaborn 0.12+ treats the first positional argument as `data`, so the bare
# positional call renders differently depending on the installed version.
sns.boxplot(x=dataset['Cardiovascular diseases'])

# 'Neoplasms diseases' world map

In [None]:
# Animated world map (one frame per 'Year') coloured by the 'Neoplasms' column.
neoplasms_map_options = dict(
    locations='Code',
    color='Neoplasms',
    scope='world',
    color_continuous_scale=px.colors.sequential.GnBu,
    range_color=(10, 32),
    title='Neoplasms',
    height=700,
    animation_frame='Year',
)
fig = px.choropleth(dataset, **neoplasms_map_options)
fig.show()


# 'Neonatal disorders' world map

In [None]:
# Box plot of the 'Neonatal disorders' column.
# The series is passed explicitly as `x=`: seaborn 0.11 treats a positional
# series as `x`, while seaborn 0.12+ treats the first positional argument as
# `data`, so the bare positional call renders differently across versions.
sns.boxplot(x=dataset['Neonatal disorders'])


In [None]:
 # Animated world map (one frame per 'Year') coloured by the
 # 'Neonatal disorders' column.
 neonatal_map_options = dict(
     locations='Code',
     color='Neonatal disorders',
     scope='world',
     color_continuous_scale=px.colors.sequential.GnBu,
     range_color=(10, 32),
     title='Neonatal disorders',
     height=700,
     animation_frame='Year',
 )
 fig = px.choropleth(dataset, **neonatal_map_options)
 fig.show()


In [None]:
# Animated world map (one frame per 'Year') coloured by the
# 'Respiratory infections and tuberculosis' column.
fig = px.choropleth(dataset,locations='Code',color='Respiratory infections and tuberculosis',scope='world',color_continuous_scale=px.colors.sequential.GnBu,
                    range_color=(10,32),title='Respiratory infections and tuberculosis',height=700,animation_frame ='Year')
fig.show()

# Box plot of the same column. The series is passed explicitly as `x=`:
# seaborn 0.11 treats a positional series as `x`, while seaborn 0.12+ treats
# the first positional argument as `data`, so the bare positional call renders
# differently depending on the installed version.
sns.boxplot(x=dataset['Respiratory infections and tuberculosis'])


In [None]:
# Animated world map (one frame per 'Year') coloured by the
# 'Neurological disorders' column.
fig = px.choropleth(dataset,locations='Code',color='Neurological disorders',scope='world',color_continuous_scale=px.colors.sequential.GnBu,
                    range_color=(10,32),title='Neurological disorders',height=700,animation_frame ='Year')
fig.show()

# Box plot of the same column, drawn once: the cell previously called
# sns.boxplot twice, which drew the same box twice on the same axes.
# The series is passed explicitly as `x=` because seaborn 0.11 treats a
# positional series as `x` while seaborn 0.12+ treats it as `data`.
sns.boxplot(x=dataset['Neurological disorders'])


In [None]:
# Box plot of the 'Maternal disorders' column. The series is passed explicitly
# as `x=`: seaborn 0.11 treats a positional series as `x`, while seaborn 0.12+
# treats the first positional argument as `data`, so the bare positional call
# renders differently depending on the installed version.
sns.boxplot(x=dataset['Maternal disorders'])

# Animated world map (one frame per 'Year') coloured by the same column.
fig = px.choropleth(dataset,locations='Code',color='Maternal disorders',scope='world',color_continuous_scale=px.colors.sequential.GnBu,
                    range_color=(10,32),title='Maternal disorders',height=700,animation_frame ='Year')
fig.show()


# World Cluster Map

In [None]:
# World cluster map: build the subset used for clustering.
# Steps, in order: keep only the 1990 and 2019 rows, drop every row that has a
# missing value, remove the rows whose 'Entity' is 'World', then discard the
# 'Entity' column itself.
dataset_1990_2019 = (
    dataset[dataset['Year'].isin([1990, 2019])]
    .dropna()
    .loc[lambda frame: frame['Entity'] != 'World']
    .drop(columns='Entity')
)



In [None]:
from sklearn.cluster import KMeans

# Feature matrix for clustering: everything except the identifier columns.
# A 'labels' column is also removed when present, because a later cell adds it
# to dataset_1990_2019; without this, re-running this cell after that one would
# feed the previous cluster assignment back into KMeans as a feature.
dataset_1990_2019_1 = (
    dataset_1990_2019
    .drop(['Code', 'Year'], axis=1)
    .drop(columns='labels', errors='ignore')
)


In [None]:
# Fit a 4-cluster k-means model on the feature matrix.
# n_init is pinned to 10 (the historical scikit-learn default) so the result
# does not change with the library version: newer releases default to
# n_init='auto', which runs fewer initialisations, and intermediate releases
# emit a FutureWarning when n_init is left unset.
# NOTE(review): the features are used unscaled — confirm that is intended.
kmeans_model = KMeans(n_clusters=4, n_init=10, random_state=10).fit(dataset_1990_2019_1)


In [None]:
# Cluster id assigned to each row of the fitted feature matrix.
labels = kmeans_model.labels_
# Attach the ids to the 1990/2019 frame (mutates dataset_1990_2019 in place);
# the feature matrix was derived from this frame, so rows line up by position.
dataset_1990_2019['labels']=labels


In [None]:
# Animated world map of the k-means cluster label, one frame per 'Year'.
# NOTE(review): the integer labels are coloured on a continuous scale with
# range (0, 4) — confirm this is preferred over discrete colours.
sns.set_context('poster', font_scale=0.6)
cluster_map_options = dict(
    locations='Code',
    color='labels',
    scope='world',
    color_continuous_scale=px.colors.sequential.GnBu,
    range_color=(0, 4),
    title='Clustering map',
    height=700,
    animation_frame='Year',
)
fig = px.choropleth(dataset_1990_2019, **cluster_map_options)
fig.show()


In [None]:
# Mean of every numeric column per cluster, transposed so that clusters are
# columns and features are rows.
# numeric_only=True is required because the frame still carries the string
# 'Code' column: pandas 2.x raises on it instead of silently dropping it.
dataset_1990_2019.groupby('labels').mean(numeric_only=True).T


In [None]:
# Increase and decrease of clusters: number of rows per (label, year) pair,
# drawn as horizontal bars with alternating blue/red colours.
# NOTE(review): the colours presumably distinguish the two years; this only
# holds if every label occurs in both years — confirm.
sns.set_context('poster', font_scale=0.6)
plt.figure(figsize=(22, 12))
bar_colors = ['blue', 'red'] * 4
cluster_counts = dataset_1990_2019.groupby(['labels', 'Year'])['labels'].count()
cluster_counts.plot.barh(color=bar_colors)


# Get the correlation


In [None]:
# Correlation matrix of the numeric columns, shown as an annotated heatmap.
sns.set_context('poster', font_scale=0.5)
plt.figure(figsize=(22,12))
# Restrict to numeric columns before correlating: the frame also holds string
# columns ('Entity', 'Code'), on which DataFrame.corr() raises in pandas 2.x
# (older versions silently ignored them).
cor = dataset.select_dtypes(include='number').corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()

In [None]:
# Absolute correlation of every column with the output variable
# ('Neurological disorders').
cor_target = cor["Neurological disorders"].abs()
# No threshold is applied yet, so all features are listed.
relevant_features = cor_target
relevant_features

In [None]:
# We see that Neglected tropical diseases and malaria, Substance use disorders,
# Skin and subcutaneous diseases, Enteric infections, and Respiratory infections
# and tuberculosis are directly correlated with the output variable.

In [None]:
# Keep only the features whose absolute correlation with the target exceeds 0.5.
is_strong = cor_target.gt(0.5)
relevant_features = cor_target.loc[is_strong]
relevant_features

# Most important correlations with the output variable

In [None]:
# Columns of the 1990/2019 subset kept for the detailed correlation and
# scatter-matrix views.
selected_columns = [
    'Enteric infections',
    'Respiratory infections and tuberculosis',
    'Neonatal disorders',
    'Neurological disorders',
    'Sense organ diseases',
    'Mental disorders',
    'Musculoskeletal disorders',
]
data1 = dataset_1990_2019[selected_columns]


In [None]:
# Annotated correlation heatmap of the selected columns (one decimal place).
plt.figure(figsize=(15, 8))
sns.set_context('poster', font_scale=0.8)
selected_corr = data1.corr()
sns.heatmap(selected_corr, annot=True, cbar=False, cmap='Blues', fmt='.1f')


# Scatter Plot

In [None]:
# Scatter matrix of the selected columns, each point coloured by its k-means
# cluster label, with KDE curves on the diagonal.
# No plt.figure() call is made here: scatter_matrix creates its own figure from
# `figsize`, so a preceding plt.figure() only produced an extra empty figure.
sns.set_context('poster', font_scale=1.0)
color_codes = {0:'red', 1:'blue', 2:'yellow',3:'black'}
# `labels` comes from the k-means cell and must line up row-for-row with data1;
# fail loudly if the kernel state is stale instead of mis-colouring points.
assert len(labels) == len(data1), "cluster labels and data1 rows are out of sync"
colors = [color_codes[label] for label in labels]
pd.plotting.scatter_matrix(data1, figsize=(50,50), color=colors, alpha=0.8, diagonal='kde')
plt.show()