### Given the Drug dataset, determine whether Blood Pressure (BP) and Drug are related, using a significance level of 0.05.

In [69]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import chi2, chisquare
from scipy.stats import chi2_contingency
import warnings
warnings.filterwarnings('ignore')

In [70]:
# Load the drug dataset from a CSV file in the working directory.
dataset_path = 'drug200.csv'
df = pd.read_csv(dataset_path)

In [71]:
# Preview the first five rows of the dataset.
df.head(5)

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY


In [72]:
df.shape # Dimensions of the data as (number of rows, number of columns)

(200, 6)

In [73]:
df.info() # Data information: column names, non-null counts and dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB


In [74]:
# Number of patients in each blood-pressure (BP) category.
bp_counts = df['BP'].value_counts()
bp_counts

BP
HIGH      77
LOW       64
NORMAL    59
Name: count, dtype: int64

In [75]:
# Number of patients prescribed each drug.
drug_counts = df['Drug'].value_counts()
drug_counts

Drug
drugY    91
drugX    54
drugA    23
drugC    16
drugB    16
Name: count, dtype: int64

In [76]:
# Contingency table of BP (rows) vs Drug (columns) including row and column
# totals (margins=True adds the 'All' row and column).
# This table is for display only. It is stored under its own name so that
# running this cell never puts a table containing totals into `observed_freq`;
# feeding totals to the chi-square test would distort both the statistic and
# the degrees of freedom.
observed_freq_with_totals = pd.crosstab(df['BP'], df['Drug'], margins=True)
observed_freq_with_totals

Drug,drugA,drugB,drugC,drugX,drugY,All
BP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HIGH,23,16,0,0,38,77
LOW,0,0,16,18,30,64
NORMAL,0,0,0,36,23,59
All,23,16,16,54,91,200


In [77]:
# Contingency table of observed counts only (no row total, no column total).
# This is the table the chi-square test is run on.
observed_freq = pd.crosstab(index=df['BP'], columns=df['Drug'])
observed_freq

Drug,drugA,drugB,drugC,drugX,drugY
BP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
HIGH,23,16,0,0,38
LOW,0,0,16,18,30
NORMAL,0,0,0,36,23


### Frame Hypothesis

#### H0 : BP and Drug are independent (BP does not influence which drug is prescribed)

#### H1 : There is a relationship between BP and Drug (BP influences drug choice)

In [78]:
# Chi-square test of independence on the observed counts.
# The returned result bundles four values: the chi-square statistic, the
# p-value, the degrees of freedom and the table of expected frequencies.
result = chi2_contingency(observed=observed_freq)
result

Chi2ContingencyResult(statistic=np.float64(143.23287184969993), pvalue=np.float64(5.041733414466518e-27), dof=8, expected_freq=array([[ 8.855,  6.16 ,  6.16 , 20.79 , 35.035],
       [ 7.36 ,  5.12 ,  5.12 , 17.28 , 29.12 ],
       [ 6.785,  4.72 ,  4.72 , 15.93 , 26.845]]))

### Extract the individual values

#### Chi-square test statistic

In [79]:
# Read the test statistic from the result by attribute name.
Chi_square_statistic = result.statistic
print('Chi-square statistic:', Chi_square_statistic)

Chi-square statistic: 143.23287184969993


#### p-value

In [80]:
# Read the p-value from the result by attribute name.
p_value = result.pvalue
print('P-Value:', p_value)

P-Value: 5.041733414466518e-27


#### Degrees of freedom = (rows − 1) × (columns − 1)

In [81]:
# Read the degrees of freedom from the result by attribute name.
dof = result.dof
print('Degrees of freedom:', dof)

Degrees of freedom: 8


#### Expected frequency

In [82]:
# Read the table of expected frequencies from the result by attribute name.
expected_freq = result.expected_freq
print('expected frequency: ', expected_freq)

expected frequency:  [[ 8.855  6.16   6.16  20.79  35.035]
 [ 7.36   5.12   5.12  17.28  29.12 ]
 [ 6.785  4.72   4.72  15.93  26.845]]


### Calculate the critical chi-square statistic

In [83]:
# Critical chi-square statistic: the quantile of the chi-square distribution
# at probability (1 - alpha) for the degrees of freedom of the test.
alpha = 0.05
confidence_level = 1 - alpha
critical_chi_square = chi2.ppf(confidence_level, df=dof)
critical_chi_square

np.float64(15.50731305586545)

### Conclusion

In [84]:
# Decision rule: reject H0 when the test statistic exceeds the critical value.
reject_null = Chi_square_statistic > critical_chi_square
print('Reject Null Hypothesis' if reject_null else 'Fail to reject Null Hypothesis')

Reject Null Hypothesis


In [85]:
# State the conclusion that matches the outcome of the test, using the same
# criterion as the decision cell, instead of printing a fixed message that
# could contradict the decision if the data changed.
if Chi_square_statistic > critical_chi_square:
    print('There exists a relationship between BP and Drug')
else:
    print('There is no evidence of a relationship between BP and Drug')

There exsist a relationship between BP and Drug


### Final Conclusion

#### There exists a statistically significant relationship between Blood Pressure and Drug type. This means the type of drug prescribed depends on the patient's BP category (HIGH, LOW, NORMAL).

In [86]:
# End