In [1]:
# Importing the Library
import pandas as pd
import numpy as  np
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
import statsmodels.formula.api as smf
import statsmodels.stats.multicomp as multi
warnings.filterwarnings('ignore')
import scipy

In [2]:
# Load the Gapminder extract and keep only the columns used in this analysis:
# country, alcconsumption, co2emissions and lifeexpectancy.
data = pd.read_csv("./data/gapmiderData.csv")
# Take an explicit copy: assigning columns on a plain selection of `data` is a
# chained assignment that raises SettingWithCopyWarning (invisible here only
# because warnings are globally silenced in the import cell).
dataInterest = data[["country", "alcconsumption", "co2emissions", "lifeexpectancy"]].copy()
# Coerce the three measurement columns to numeric; entries that cannot be
# parsed become NaN (errors="coerce").
for column in ("alcconsumption", "co2emissions", "lifeexpectancy"):
    dataInterest[column] = pd.to_numeric(dataInterest[column], errors="coerce")
dataInterest.shape

(213, 4)

In [3]:
# Drop every row that has a missing value in any of the selected columns.
dataInterest = dataInterest.dropna()
print(dataInterest.shape)
# Keep only the countries whose alcohol consumption is greater than 1 litre;
# the original analysis excludes the near-zero consumers on purpose.
# .copy() makes the filtered frame independent, so later cells can add columns
# to it without a chained assignment on a selection of the previous frame.
dataInterest = dataInterest[dataInterest["alcconsumption"] > 1].copy()

(172, 4)


In [4]:
# Report the smallest and largest alcohol consumption values left after filtering.
alcohol_litres = dataInterest["alcconsumption"]
print("Minimum value for Alcohol Consumption variable is ", alcohol_litres.min())
print("Maximum value for Alcohol Consumption variable is ", alcohol_litres.max())

Minimum value for Alcohol Consumption variable is  1.03
Maximum value for Alcohol Consumption variable is  23.01


In [5]:
# Convert the quantitative alcohol consumption into a categorical variable with
# three right-closed bins:
#   (0, 5]   litres -> low
#   (5, 15]  litres -> medium
#   (15, 24] litres -> high
# The binned column is used afterwards to check whether low, medium or high
# alcohol consumption has any effect on life expectancy.
alcohol_bin_edges = [0, 5, 15, 24]
dataInterest["AlcoholConsumption"] = pd.cut(
    dataInterest["alcconsumption"], alcohol_bin_edges
).astype("category")

In [6]:
# Count how many rows fall into each alcohol-consumption bin.
dataInterest["AlcoholConsumption"].value_counts()

(5, 15]     84
(0, 5]      50
(15, 24]    11
Name: AlcoholConsumption, dtype: int64

In [7]:
# Report the smallest and largest life expectancy values in the filtered data.
life_years = dataInterest["lifeexpectancy"]
print("Minimum value for Life Expectancy variable is ", life_years.min())
print("Maximum value for Life Expectancy variable is ", life_years.max())

Minimum value for Life Expectancy variable is  47.794
Maximum value for Life Expectancy variable is  83.394


In [21]:
# Convert life expectancy into a categorical variable with two right-closed bins:
#   (47, 65] years -> low life expectancy
#   (65, 84] years -> high life expectancy
life_bin_edges = [47, 65, 84]
dataInterest["LifeExpectancy"] = pd.cut(
    dataInterest["lifeexpectancy"], life_bin_edges
).astype("category")

In [22]:
# Count how many rows fall into each life-expectancy bin.
dataInterest["LifeExpectancy"].value_counts()

(65, 84]    105
(47, 65]     40
Name: LifeExpectancy, dtype: int64

#### Performing Chi-square Test

In [23]:
# Contingency table of observed counts: life-expectancy bins as rows,
# alcohol-consumption bins as columns.
ct = pd.crosstab(
    index=dataInterest["LifeExpectancy"],
    columns=dataInterest["AlcoholConsumption"],
)

In [24]:
# Total count in each column of the contingency table (one total per alcohol-consumption bin).
colsum = ct.sum(axis="index")

In [25]:
# Divide every column by its total, giving the proportion (a fraction, not a
# percentage) of each life-expectancy bin within each alcohol-consumption bin.
colpct = ct.div(colsum, axis="columns")

In [26]:
# Show the column-wise proportions; each column of this table sums to 1.
print(colpct)

AlcoholConsumption  (0, 5]   (5, 15]  (15, 24]
LifeExpectancy                                
(47, 65]              0.42  0.214286  0.090909
(65, 84]              0.58  0.785714  0.909091


In [27]:
# Chi-square test of independence on the observed contingency table; the result
# carries the test statistic, the p-value, the degrees of freedom and the
# table of expected counts.
results = scipy.stats.chi2_contingency(observed=ct)

In [29]:
# Display the chi-square test result. It holds four items, in order: the
# chi-square statistic, the p-value, the degrees of freedom and the expected counts.
# NOTE(review): the printed label below does not mention the degrees of freedom.
print("Chi-squre value, p-value,expected counts")
results

Chi-squre value, p-value,expected counts


(8.678164811379096,
 0.013048495937467864,
 2,
 array([[13.79310345, 23.17241379,  3.03448276],
        [36.20689655, 60.82758621,  7.96551724]]))

In [30]:
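# The p-value (about 0.013) is less than 0.05, so we reject the null hypothesis of independence
# and conclude that alcohol consumption and life expectancy are associated.
# Because the explanatory variable has three categories, a post hoc test is still required:
# pairwise chi-square comparisons with a Bonferroni-adjusted significance level (0.05/3, about 0.017)
# to find out which alcohol consumption groups differ from each other.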