# Hypothesis testing in titanic dataset using p-values in context of single sample t-test and Independent 2 sample t-test

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
from scipy.stats import t

In [2]:
test=pd.read_csv("C:\\Users\\dell\\Downloads\\test.csv")
train=pd.read_csv("C:\\Users\\dell\\Downloads\\train.csv")

In [3]:
final_df = pd.concat([train,test])

In [4]:
final_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
final_df.shape

(1309, 12)

In [6]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


# Single Sample t-test

In [7]:
pop=final_df['Age'].dropna()

In [8]:
pop_mean=35

In [9]:
pop_mean

35

In [10]:
sample_size=25

In [11]:
sample_age=pop.sample(sample_size).values #sample_size=25

In [12]:
sample_age

array([37. ,  9. , 27. , 30.5, 26. , 36. , 27. , 27. , 22. , 18.5, 24. ,
       28. , 31. , 19. , 28. , 27. , 28. , 48. , 32. ,  4. , 21. , 18. ,
        6. , 24. , 24. ])

In [13]:
sample_mean=sample_age.mean()

In [14]:
sample_mean

24.88

In [15]:
sample_std=sample_age.std()

In [16]:
sample_std

9.308361832245241

# step 1:-Formulate Null and Alternate hypothesis
- H0 -> The mean age is 35
- H1 -> The mean is less than 35

# step 2:-select significant level
  - alpha=0.05

# step 3:-check assumption

In [17]:
from scipy.stats import shapiro

statistics,pvalue = shapiro(sample_age)

if pvalue>0.05:
    print("Normal Distibution")
else:
    print("Another Distribution")

Normal Distibution


# step 4:-Decide test
- single sample t-test

# step 5:-state the relavant test statistics

In [18]:
tvalue=(sample_mean-pop_mean)/(sample_std/np.sqrt(sample_size))

In [19]:
tvalue

-5.435972613861631

In [20]:
import scipy.stats as stats

t_statistic, p_value = stats.ttest_1samp(sample_age, pop_mean)

print("t-statistic:", t_statistic)
print("p-value:", p_value/2)

t-statistic: -5.32614366388173
p-value: 9.119286647515515e-06


# step:-6 Reject or not reject null hypothesis

In [21]:
if p_value<0.05:
    print("Reject null hyphotesis")
else:
    print("Not reject null hyphothesis")

Reject null hyphotesis


# Independent 2 sample t-test

In [22]:
pop_male = final_df[final_df['Sex'] == 'male']['Age'].dropna()
pop_female = final_df[final_df['Sex'] == 'female']['Age'].dropna()

# step 1:-Formulate Null and Alternate hypothesis
- H0 - Mean age of male and female are similar
- H1 - Mean age of male is higher than female


# step 2:-select significant level
- alpha=0.05

In [23]:
sample_male=pop_male.sample(25)
sample_female=pop_female.sample(25)

# step 3:-check assumption

In [24]:
from scipy.stats import shapiro

# Perform the Shapiro-Wilk test for both desktop and mobile users
shapiro_male = shapiro(sample_male)
shapiro_female = shapiro(sample_female)

print("Shapiro-Wilk test for desktop users:", shapiro_male)
print("Shapiro-Wilk test for mobile users:", shapiro_female)

Shapiro-Wilk test for desktop users: ShapiroResult(statistic=0.9439946413040161, pvalue=0.18297752737998962)
Shapiro-Wilk test for mobile users: ShapiroResult(statistic=0.9490419030189514, pvalue=0.23853962123394012)


# step 4:-Decide test
- Independent 2 sample t-test

# Step 5:Check variance of both random sampling

In [25]:
from scipy.stats import levene

# Perform Levene's test
test_statistic,pvalue = levene(sample_male, sample_female)
if pvalue>0.05:
    print("Variance are same")
else:
    print("Variance are not same")

Variance are not same


# step 6:-state the relavant test statistics

In [26]:
import scipy.stats as stats

t_statistic, p_value = stats.ttest_ind(sample_male, sample_female)

print("t-statistic:", t_statistic)
print("p-value:", p_value/2)

t-statistic: 0.09716427470314684
p-value: 0.46150026060016125


In [27]:
if p_value<0.05:
    print("Reject null hyphothesis")
else:
    print("Not Reject null hyphotesis")

Not Reject null hyphotesis
