In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
import plotly.graph_objs as go
import plotly.offline as py
import plotly.express as px

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Source: Kaggle Dataset Multi-Level causality relations of underlying of deaths with age, sex and country stratifications, Schicklin, C.

<h1 style="background-color:#008080; font-family:'Brush Script MT',cursive;color:white;font-size:200%; text-align:center;border-radius: 50% 20% / 10% 40%">The hazards of smoking and the benefits of cessation</h1>

"The hazards of smoking and the benefits of cessation: A critical summation of the epidemiological evidence in high-income countries"

Citation: Prabhat Jha - eLife 2020;9:e49979 DOI: 10.7554/eLife.49979

"In high-income countries, the biggest cause of premature death, defined as death before 70 years, is smoking of manufactured cigarettes. Smoking-related disease was responsible for about 41 million deaths in the United States, United Kingdom and Canada, cumulatively, from 1960 to 2020. Every million cigarettes smoked leads to one death in the US and Canada, but slightly more than one death in the UK. The 21st century hazards reveal that smokers who start smoking early in adult life and do not quit lose a decade of life expectancy versus non-smokers. Cessation, particularly before age 40 years, yields large reductions in mortality risk. Up to two-thirds of deaths among smokers are avoidable at non-smoking death rates, and former smokers have about only a quarter of the excess risk of death compared to current smokers. The gap between scientific and popular understanding of smoking hazards is surprisingly large."

https://elifesciences.org/articles/49979

![Illustration of the health hazards of smoking](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcS-WuJi1PmdyuobxqyQ0f_1Q3WWNtK6wBGwz3aKIxFEqDCpw6UWGWFM-e9mb-L1kCVALqQ&usqp=CAU)

Image source: regencyhealthcare.in

In [None]:
# Read the causality-of-deaths CSV from the Kaggle input mount and report its (rows, columns).
DATA_PATH = "/kaggle/input/multilevel-causality-relations-deaths-age-sex/multilevel_causality_deaths.csv"
df = pd.read_csv(DATA_PATH)
print(df.shape)

# Preview the first rows through a Styler; being the cell's last expression, it is what gets displayed.
preview_css = {'background-color': 'Aquamarine', 'color': 'purple'}
df.head().style.set_properties(**preview_css)

In [None]:
# Number of missing values in each column (isna is the same check as isnull).
df.isna().sum()

In [None]:
#Code by Lindley Coetzee https://www.kaggle.com/lindleylawrence/cardano-gods-stats-starter

# Boolean mask of the rows whose concept_id contains the substring "smoke".
# na=False makes rows with a missing concept_id count as "no match"; without it
# the mask would carry NaN for those rows and the boolean indexing below raises
# "Cannot mask with non-boolean array containing NA / NaN values".
smoke_causality = df["concept_id"].str.contains("smoke", na=False)

# Subset used by the smoking-specific plots further down the notebook.
smoke_causality_df = df[smoke_causality]
smoke_causality_df[:5]

In [None]:
#Code by Lindley Coetzee https://www.kaggle.com/lindleylawrence/cardano-gods-stats-starter

# Switch matplotlib to the "bmh" style sheet before drawing.
style.use("bmh")

# One bar per row of the smoking subset: French concept label on the x axis,
# causality_quality as the bar height.
bar_ax = smoke_causality_df.plot(
    kind="bar",
    x="concept_id_french",
    y="causality_quality",
    figsize=(18, 8),
    color="gold",
    edgecolor="black",
)
bar_ax.set_title("Deaths Caused by Smoking ")
bar_ax.set_xlabel("French Concept")
bar_ax.set_ylabel("Causality Quality")
bar_ax.legend(['Smoking French Concept'], fontsize=15, loc='upper center')
# Tilt the tick labels so long concept names stay readable; the trailing ';' hides the repr.
plt.xticks(rotation=45);

In [None]:
##Code by Taha07  https://www.kaggle.com/taha07/data-scientists-jobs-analysis-visualization/notebook

from wordcloud import WordCloud
from wordcloud import STOPWORDS

# Build the cloud text from every non-missing value of the column.
# str(series) would instead give pandas' console repr: index labels, a "..."
# truncation for long Series and the trailing "Name: ..., dtype: ..." footer,
# so most rows would be ignored and repr tokens would leak into the cloud.
concept_text = " ".join(df["concept_id_french"].dropna().astype(str))

# Pass the stop-word set explicitly (previously it was built but never used).
# NOTE(review): STOPWORDS is the wordcloud package's built-in list; confirm it
# is appropriate for the French labels in this column.
stopwords = set(STOPWORDS)
wordcloud = WordCloud(background_color='Green',
                      stopwords=stopwords,
                      height=2000,
                      width=2000
                     ).generate(concept_text)

# Kept as in the original cell: this changes the default figure size globally.
plt.rcParams['figure.figsize'] = (12,12)
plt.axis("off")
plt.imshow(wordcloud)
plt.title("Deaths Causalities")
plt.show()

In [None]:
# Pairwise correlations, with columns in sorted order.
# Non-numeric columns are dropped explicitly before calling corr(): pandas >= 2.0
# no longer skips them silently and raises on string columns instead.
numeric_df = df.select_dtypes(include=["number", "bool"])
corr = numeric_df[numeric_df.columns.sort_values()].corr()

# Mask the upper triangle (diagonal included) so each pair is shown only once.
# The builtin bool replaces the np.bool alias, which was removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

fig = go.Figure(data=go.Heatmap(z=corr.mask(mask),
                                x=corr.columns.values,
                                y=corr.columns.values,
                                xgap=1, ygap=1,
                                colorscale="Rainbow",
                                colorbar_thickness=20,
                                colorbar_ticklen=3,
                                zmid=0),
                layout = go.Layout(title_text='Correlation Matrix', template='plotly_dark',
                height=900,
                xaxis_showgrid=False,
                yaxis_showgrid=False,
                yaxis_autorange='reversed'))
fig.show()

In [None]:
##Code by Taha07  https://www.kaggle.com/taha07/data-scientists-jobs-analysis-visualization/notebook

from wordcloud import WordCloud
from wordcloud import STOPWORDS

# Build the cloud text from every non-missing French label of the smoking subset.
# str(series) would instead give pandas' console repr (index labels, possible
# "..." truncation and the "Name: ..., dtype: ..." footer), which pollutes the
# cloud with tokens that are not part of the data.
smoke_concept_text = " ".join(smoke_causality_df["concept_id_french"].dropna().astype(str))

# Pass the stop-word set explicitly (previously it was built but never used).
# NOTE(review): STOPWORDS is the wordcloud package's built-in list; confirm it
# is appropriate for the French labels in this column.
stopwords = set(STOPWORDS)
wordcloud = WordCloud(background_color='black',
                      colormap='Set3',
                      stopwords=stopwords,
                      height=2000,
                      width=2000
                     ).generate(smoke_concept_text)

# Kept as in the original cell: this changes the default figure size globally.
plt.rcParams['figure.figsize'] = (12,12)
plt.axis("off")
plt.imshow(wordcloud)
plt.title("Smoking French Concept")
plt.show()

In [None]:
# Correlation matrix of the smoking subset, computed once.
# The original cell evaluated corr() twice and discarded the first result.
# Non-numeric columns are dropped explicitly because pandas >= 2.0 raises on
# them instead of skipping them silently.
smoke_corr = smoke_causality_df.select_dtypes(include=["number", "bool"]).corr()

plt.figure(figsize=(10,4))
# annot=True writes each coefficient inside its cell.
sns.heatmap(smoke_corr, annot=True, cmap='YlOrRd_r')
plt.show()

**<span style="color:#008080;">When A and B are correlated, there are at least four possible explanations</span>**

A causes B

B causes A

A and B are both caused by a third variable, C.

Chance (the correlation is spurious).

"So how was the causal link between cigarettes and lung cancer established? In the 50s and 60s a large number of studies came out that confirmed the correlation. Furthermore, studies also showed that heavier smokers suffered more cancer than lighter smokers, and that pipe smokers developed more lip cancer while cigarette smokers developed more lung cancer."

https://towardsdatascience.com/correlation-does-not-imply-causation-92e4832a6713

In [None]:
#Codes by https://www.kaggle.com/muhammadismail99/data-analysis-of-guf-pub-dataset/notebook

# Row count per level_of_cause value across the whole dataset.
level_palette = ["teal", "Olive", "Chartreuse"]
level_ax = sns.countplot(x='level_of_cause', data=df, palette=level_palette)

level_ax.set_title('Level of Cause');

In [None]:
#Codes by https://www.kaggle.com/muhammadismail99/data-analysis-of-guf-pub-dataset/notebook

# Same count as the full-dataset plot, restricted to the smoking subset.
plt.figure(figsize=(10, 8))
smoke_level_ax = sns.countplot(
    x='level_of_cause',
    data=smoke_causality_df,
    palette=["magenta", "purple"],
)

smoke_level_ax.set_title('Smoking Level of Cause');

In [None]:
# total_yr_deaths_FRANCE for each French concept label in the smoking subset,
# with points coloured by level_of_cause.
plt.figure(figsize=(10, 8))
scatter_ax = sns.scatterplot(
    data=smoke_causality_df,
    x='concept_id_french',
    y='total_yr_deaths_FRANCE',
    hue='level_of_cause',
)
plt.xticks(rotation=45)
# Anchor the legend to the right of the axes so it does not cover the points.
scatter_ax.legend(loc='center left', bbox_to_anchor=(1.25, 0.5), ncol=1)
scatter_ax.set_title("Total yearly Deaths caused by Smoking");

In [None]:
# First ten entries of the predecessors_array frequency table, as horizontal bars.
top_predecessors = df['predecessors_array'].value_counts().iloc[:10]
top_predecessors.plot.barh(title='Predecessors Array', figsize=(16, 10), color='g');

In [None]:
# First twenty entries of the successor_array frequency table, as horizontal bars.
# The displayed title previously read "Sucessor Array" (misspelled).
df['successor_array'].value_counts()[:20].plot.barh(title='Successor Array', figsize=(8,6),color= 'r');

In [None]:
# First ten entries of the comments frequency table, as horizontal bars.
comment_counts = df['comments'].value_counts().iloc[:10]
ax = comment_counts.plot.barh(color='orange', figsize=(16, 8))

# Label the figure; the trailing ';' suppresses the text repr of the last call.
ax.set_title('Comments', size=18)
ax.set_xlabel('Counts', size=10)
ax.set_ylabel('Comments', size=10);

In [None]:
#https://www.kaggle.com/olgabelitskaya/parts-of-speech
# Outline-only bars: a fully transparent face with thick edges coloured from
# the "summer" palette, one bar per predecessors_array value in the smoking subset.
plt.figure(figsize=(10, 5))
summer_edges = sns.color_palette("summer")
predecessor_ax = sns.countplot(
    data=smoke_causality_df,
    y="predecessors_array",
    facecolor=(0, 0, 0, 0),
    linewidth=5,
    edgecolor=summer_edges,
)
predecessor_ax.set_title('Smoking Predecessor Array', fontsize=15);

In [None]:
#https://www.kaggle.com/olgabelitskaya/parts-of-speech
# Outline-only bars: a fully transparent face with thick edges coloured from
# the "winter" palette, one bar per successor_array value in the smoking subset.
plt.figure(figsize=(10, 5))
winter_edges = sns.color_palette("winter")
successor_ax = sns.countplot(
    data=smoke_causality_df,
    y="successor_array",
    facecolor=(0, 0, 0, 0),
    linewidth=5,
    edgecolor=winter_edges,
)
successor_ax.set_title('Smoking Successor Array', fontsize=15);

In [None]:
#https://www.kaggle.com/olgabelitskaya/parts-of-speech
# NOTE(review): this cell draws the same 'Smoking Predecessor Array' figure as
# an earlier cell in this notebook; consider removing one of the two.
plt.figure(figsize=(10, 5))
outline_style = dict(
    facecolor=(0, 0, 0, 0),
    linewidth=5,
    edgecolor=sns.color_palette("summer"),
)
repeat_ax = sns.countplot(y="predecessors_array", data=smoke_causality_df, **outline_style)
repeat_ax.set_title('Smoking Predecessor Array', fontsize=15);

"Scientific evidence for the causal relationship between smoking and lung cancer is based on studies of individuals and groups, studies in animals and humans, studies that are observational or experimental, studies in laboratories and communities, and studies in both underdeveloped and developed countries."

"The scientific evidence collected is applicable to both groups and individuals. The probability of causation, which is calculated based on the attributable fraction for the association between smoking and lung cancer, could be utilized as evidence to prove causality in individuals."

Citation: Khang, Young-Ho (2015). "Causality between Smoking and Lung Cancer among Groups and Individuals: Addressing Issues in Tobacco Litigation in South Korea". Epidemiology and Health, vol. 37, e2015026 (published 2015/05/31). DOI: 10.4178/epih/e2015026

https://www.researchgate.net/publication/277964448_Causality_between_Smoking_and_Lung_Cancer_among_Groups_and_Individuals_Addressing_Issues_in_Tobacco_Litigation_in_South_Korea