# Pengujian Hipotesis

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
from scipy.stats import t

In [2]:
insurance = pd.read_csv('data/insurance.csv')
insurance

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


## Hipotesis #1 : Tagihan kesehatan perokok lebih tinggi daripada tagihan kesehatan non perokok

- H0 : Tagihan perokok sama dengan tagihan non perokok
- H1 : Tagihan perokok lebih besar dari tagihan non perokok

In [3]:
alpha = 0.05

In [4]:
perokok = insurance[insurance['smoker'] == 'yes']
perokok

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
11,62,female,26.290,0,yes,southeast,27808.72510
14,27,male,42.130,0,yes,southeast,39611.75770
19,30,male,35.300,0,yes,southwest,36837.46700
23,34,female,31.920,1,yes,northeast,37701.87680
...,...,...,...,...,...,...,...
1313,19,female,34.700,2,yes,southwest,36397.57600
1314,30,female,23.655,3,yes,northwest,18765.87545
1321,62,male,26.695,0,yes,northeast,28101.33305
1323,42,female,40.370,2,yes,southeast,43896.37630


In [5]:
non_perokok = insurance[insurance['smoker'] == 'no']
non_perokok

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
5,31,female,25.740,0,no,southeast,3756.62160
...,...,...,...,...,...,...,...
1332,52,female,44.700,3,no,southwest,11411.68500
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350


Karena jumlah data non perokok lebih besar daripada jumlah perokok, maka sample data non perokok perlu dikurangi sehingga jumlah data kedua data sama

In [16]:
tagihan_perokok = perokok['charges']
tagihan_non_perokok = non_perokok['charges'].head(len(tagihan_perokok))

In [17]:
stat, p_value = ttest_ind(
    a=np.array(tagihan_perokok), 
    b=np.array(tagihan_non_perokok),
    alternative='greater',
    equal_var=False)

In [18]:
print(f'Statistics = {stat}, p-value = {p_value}') 

Statistics = 29.96693824854113, p-value = 6.943480474910414e-106


In [19]:
if p_value > alpha:
    print('Gagal tolak H0') 
else:
    print('Tolak H0')

Tolak H0


## Hipotesis #2 : Tagihan  kesehatan  dengan  BMI  diatas  25  lebih  tinggi  daripada  tagihan  kesehatan dengan BMI dibawah 25

In [82]:
under_25 = insurance[insurance['bmi'] <= 25]
under_25

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
3,33,male,22.705,0,no,northwest,21984.47061
15,19,male,24.600,1,no,southwest,1837.23700
17,23,male,23.845,0,no,northeast,2395.17155
26,63,female,23.085,0,no,northeast,14451.83515
28,23,male,17.385,1,no,northwest,2775.19215
...,...,...,...,...,...,...,...
1304,42,male,24.605,2,yes,northeast,21259.37795
1306,29,female,21.850,0,yes,northeast,16115.30450
1314,30,female,23.655,3,yes,northwest,18765.87545
1316,19,female,20.600,0,no,southwest,1731.67700


In [83]:
above_25 = insurance[insurance['bmi'] > 25]
above_25

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.90,0,yes,southwest,16884.9240
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.00,3,no,southeast,4449.4620
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
...,...,...,...,...,...,...,...
1333,50,male,30.97,3,no,northwest,10600.5483
1334,18,female,31.92,0,no,northeast,2205.9808
1335,18,female,36.85,0,no,southeast,1629.8335
1336,21,female,25.80,0,no,southwest,2007.9450


Karena jumlah data pada di atas 25 lebih banyak, maka perlu dikurangi agar datanya sama.

In [85]:
tagihan_under_25bmi = under_25['charges']
tagihan_above_25bmi = above_25['charges'].head(len(tagihan_under_25bmi))

- H0 : Tagihan kesehatan yang mempunyai BMI >= 25 sama dengan tagihan kesehatan yang mempunyai BMI < 25
- H1 : Tagihan kesehatan yang mempunyai BMI >= 25 lebih besar dari tagihan kesehatan yang mempunyai mempunyai BMI < 25

In [86]:
stat, p_value = ttest_ind(
    a=np.array(tagihan_above_25bmi), 
    b=np.array(tagihan_under_25bmi),
    alternative='greater',
    equal_var=False)

In [87]:
print('Statistics = %.4f, p-value = %.4f' % (stat, p_value)) 

Statistics = 3.8734, p-value = 0.0001


In [88]:
if p_value > alpha:
    print('Gagal tolak H0') 
else:
    print('Tolak H0')

Tolak H0


# Hipotesis #3 : Tagihan kesehatan laki-laki lebih besar dari perempuan

In [90]:
perempuan = insurance[insurance['sex'] == 'female']
perempuan

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.90,0,yes,southwest,16884.92400
5,31,female,25.74,0,no,southeast,3756.62160
6,46,female,33.44,1,no,southeast,8240.58960
7,37,female,27.74,3,no,northwest,7281.50560
9,60,female,25.84,0,no,northwest,28923.13692
...,...,...,...,...,...,...,...
1332,52,female,44.70,3,no,southwest,11411.68500
1334,18,female,31.92,0,no,northeast,2205.98080
1335,18,female,36.85,0,no,southeast,1629.83350
1336,21,female,25.80,0,no,southwest,2007.94500


In [91]:
laki = insurance[insurance['sex'] == 'male']
laki

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
8,37,male,29.830,2,no,northeast,6406.41070
...,...,...,...,...,...,...,...
1324,31,male,25.935,1,no,northwest,4239.89265
1325,61,male,33.535,0,no,northeast,13143.33665
1327,51,male,30.030,1,no,southeast,9377.90470
1329,52,male,38.600,2,no,southwest,10325.20600


In [92]:
tagihan_perempuan = perempuan['charges']
tagihan_laki = laki['charges'].head()

- H0 : Tagihan kesehatan laki-laki sama dengan tagihan kesehatan perempuan
- H1 : Tagihan kesehatan laki-laki lebih besar dari tagihan kesehatan perempuan

In [94]:
stat, p_value = ttest_ind(
    a=np.array(tagihan_laki), 
    b=np.array(tagihan_perempuan),
    alternative='greater',
    equal_var=False)

In [95]:
print('Statistics = %.4f, p-value = %.4f' % (stat, p_value)) 

Statistics = -1.3280, p-value = 0.8735


In [96]:
if p_value > alpha:
    print('Gagal tolak H0') 
else:
    print('Tolak H0')

Gagal tolak H0
