In [1]:
import pandas as pd
from scipy.stats import chi2_contingency

In [3]:
# Read the lung-capacity dataset; the path is relative to the working directory.
data = pd.read_csv(filepath_or_buffer="lungcapacity.csv")

# Preview the first five rows to check the columns.
data.head(n=5)


Unnamed: 0,LungCap(cc),Age( years),Height(inches),Smoke,Gender,Caesarean,No of children,Weight (kg)
0,6.475,6.0,62.1,,male,no,3,85.7
1,10.125,18.0,74.7,yes,female,no,0,98.75
2,9.55,16.0,69.7,no,female,yes,0,11.01
3,11.125,14.0,71.0,no,male,no,1,29.78
4,4.8,5.0,56.9,no,male,no,4,72.84


In [6]:
# Tally each 'Smoke' category; dropna=True (the pandas default) leaves missing entries out.
data['Smoke'].value_counts(dropna=True)

Smoke
no     660
yes     79
Name: count, dtype: int64

In [5]:
# Keep only rows whose 'Smoke' value is an explicit 'yes' or 'no'; rows where it is
# missing (such as the first row of the preview) are dropped.
# .copy() makes the result an independent frame, so adding columns to it later does
# not trigger pandas' SettingWithCopyWarning.
data = data[data['Smoke'].isin(['yes', 'no'])].copy()

In [7]:
# Label each row's lung function from 'LungCap(cc)': 'Normal' at 8 or above,
# 'Abnormal' below.
# na_action='ignore' keeps a missing capacity as NaN; without it the comparison
# `NaN >= 8` is False and the row would be labelled 'Abnormal'. Rows with a NaN
# label are excluded by pd.crosstab by default.
# assign() returns a new frame instead of writing a column into a filtered slice.
data = data.assign(**{
    'Lung Function': data['LungCap(cc)'].map(
        lambda cap: 'Normal' if cap >= 8 else 'Abnormal',
        na_action='ignore',
    )
})


In [8]:
# Preview the first five rows, now including the 'Lung Function' column.
data.head(n=5)

Unnamed: 0,LungCap(cc),Age( years),Height(inches),Smoke,Gender,Caesarean,No of children,Weight (kg),Lung Function
1,10.125,18.0,74.7,yes,female,no,0,98.75,Normal
2,9.55,16.0,69.7,no,female,yes,0,11.01,Normal
3,11.125,14.0,71.0,no,male,no,1,29.78,Normal
4,4.8,5.0,56.9,no,male,no,4,72.84,Abnormal
5,6.225,11.0,58.7,no,female,no,1,24.76,Abnormal


In [9]:
# Cross-tabulate smoking status (rows) against the lung-function label (columns).
contingency_table = pd.crosstab(
    index=data['Smoke'],
    columns=data['Lung Function'],
)
print(contingency_table)

Lung Function  Abnormal  Normal
Smoke                          
no                  344     316
yes                  29      50


In [10]:
# Chi-square test of independence. correction=True (SciPy's default) applies Yates'
# continuity correction when the table has one degree of freedom.
chi2_contingency(observed=contingency_table, correction=True)

Chi2ContingencyResult(statistic=np.float64(6.102078276188827), pvalue=np.float64(0.013502299509155546), dof=1, expected_freq=array([[333.12584574, 326.87415426],
       [ 39.87415426,  39.12584574]]))

In [11]:
# Run the chi-square test of independence and unpack its four results.
independence_result = chi2_contingency(contingency_table)
chi2, p, dof, expected = independence_result

# Report the statistic, p-value and degrees of freedom, then the expected counts.
for report_line in (
    f"Chi-square statistic: {chi2:.4f}",
    f"P-value: {p:.4f}",
    f"Degrees of freedom: {dof}",
    "\nExpected frequencies:",
):
    print(report_line)
print(expected)

Chi-square statistic: 6.1021
P-value: 0.0135
Degrees of freedom: 1

Expected frequencies:
[[333.12584574 326.87415426]
 [ 39.87415426  39.12584574]]


In [12]:
from scipy.stats import chisquare

# Observed counts per smoking category, forced into ['no', 'yes'] order
# (a category that never occurs is reported as 0).
smoke_counts = data['Smoke'].value_counts()
observed = smoke_counts.reindex(index=['no', 'yes'], fill_value=0)

observed

Smoke
no     660
yes     79
Name: count, dtype: int64

In [13]:
# Expected counts under the hypothesised split of 70% non-smokers and 30% smokers,
# listed in the same ['no', 'yes'] order as `observed`.
# The total comes from `observed` itself so the expected counts always sum to the
# observed counts, which scipy.stats.chisquare requires. len(data) would also count
# rows whose 'Smoke' value is missing if they have not been filtered out.
# int() keeps the list entries plain Python floats.
total_observed = int(observed.sum())
expected = [0.7 * total_observed, 0.3 * total_observed]
expected

[517.3, 221.7]

In [14]:
# Goodness-of-fit test of the observed counts against the expected counts.
chisquare(observed, f_exp=expected)

Power_divergenceResult(statistic=np.float64(131.21522005283848), pvalue=np.float64(2.221651695700653e-30))

In [15]:
# Chi-square goodness-of-fit test of the observed smoker counts against the
# expected counts.
chi2, p = chisquare(f_obs=observed, f_exp=expected)

# Report the results. The p-value uses four significant digits (.4g) so that a
# very small value is shown in scientific notation instead of rounding to 0.0000.
print(f"Chi-square statistic: {chi2:.4f}")
print(f"P-value: {p:.4g}")

# Interpret at the 5% significance level.
if p <= 0.05:
    print("The observed distribution of smokers differs significantly from the expected distribution.")
else:
    print("There is not enough evidence to conclude a difference between the observed and expected distributions.")

Chi-square statistic: 131.2152
P-value: 0.0000
The observed distribution of smokers differs significantly from the expected distribution.


In [None]:
from scipy.stats import chisquare

# Single-cell version of the goodness-of-fit test on smoking status.

# Observed counts per smoking category, forced into ['no', 'yes'] order.
observed = data['Smoke'].value_counts().reindex(['no', 'yes'], fill_value=0)

# Expected counts under the hypothesised split of 70% non-smokers and 30% smokers,
# in the same ['no', 'yes'] order. The total comes from `observed` so both sets of
# counts always sum to the same value, which scipy.stats.chisquare requires;
# len(data) would also count rows whose 'Smoke' value is missing.
total_observed = int(observed.sum())
expected = [0.7 * total_observed, 0.3 * total_observed]

# Perform the chi-square goodness-of-fit test.
chi2, p = chisquare(f_obs=observed, f_exp=expected)

# Report the results. The p-value uses four significant digits (.4g) so that a
# very small value is shown in scientific notation instead of rounding to 0.0000.
print(f"Chi-square statistic: {chi2:.4f}")
print(f"P-value: {p:.4g}")

# Interpret at the 5% significance level.
if p <= 0.05:
    print("The observed distribution of smokers differs significantly from the expected distribution.")
else:
    print("There is not enough evidence to conclude a difference between the observed and expected distributions.")

In [16]:
import numpy as np
from scipy.stats import ttest_1samp

In [17]:
# Heights of the sampled students in the school (ten observations).
sample_heights = np.array([
    65, 68, 70, 62, 66,
    72, 67, 69, 71, 64,
])

# National average height, used as the population mean under the null hypothesis.
population_mean = 68

In [18]:
# One-sample t-test of the sample heights against the population mean.
ttest_1samp(sample_heights, population_mean)

TtestResult(statistic=np.float64(-0.5921565254637865), pvalue=np.float64(0.5683217589713219), df=np.int64(9))

In [19]:
# One-sample t-test: does the sample mean differ from the population mean?
one_sample_result = ttest_1samp(sample_heights, population_mean)
t_statistic, p_value = one_sample_result

# Report the statistic and p-value.
print(f"T-statistic: {t_statistic:.4f}")
print(f"P-value: {p_value:.4f}")

# Interpret at the 5% significance level.
print(
    "Reject the null hypothesis: There's a significant difference in average height."
    if p_value <= 0.05
    else "Fail to reject the null hypothesis: No significant difference found."
)


T-statistic: -0.5922
P-value: 0.5683
Fail to reject the null hypothesis: No significant difference found.


In [20]:
from scipy.stats import ttest_ind


# Plant heights for the two treatments: fertilizer A first, fertilizer B second.
heights_group_a, heights_group_b = (
    np.array(group_heights)
    for group_heights in (
        [25, 28, 30, 27, 26],
        [31, 33, 29, 32, 30],
    )
)

In [21]:
# Independent two-sample t-test comparing the two fertilizer groups.
ttest_ind(heights_group_a, heights_group_b)

TtestResult(statistic=np.float64(-3.4125007385087236), pvalue=np.float64(0.009189133869241969), df=np.float64(8.0))

In [22]:

# Independent two-sample t-test on the plant heights of the two fertilizer groups.
fertilizer_result = ttest_ind(heights_group_a, heights_group_b)
t_statistic, p_value = fertilizer_result

# Report the statistic and p-value.
print(f"T-statistic: {t_statistic:.4f}")
print(f"P-value: {p_value:.4f}")

# Interpret at the 5% significance level.
print(
    "Reject the null hypothesis: The fertilizers have a significantly different effect."
    if p_value <= 0.05
    else "Fail to reject the null hypothesis: No significant difference found."
)

T-statistic: -3.4125
P-value: 0.0092
Reject the null hypothesis: The fertilizers have a significantly different effect.


In [23]:
from scipy.stats import ttest_rel

# Paired test scores, one position per student: before the new teaching method,
# then after it.
scores_before, scores_after = (
    np.array(scores)
    for scores in (
        [70, 65, 75, 80, 72],
        [75, 70, 80, 85, 78],
    )
)

In [24]:
# Paired t-test on the before/after scores.
ttest_rel(scores_before, scores_after)

TtestResult(statistic=np.float64(-26.000000000000004), pvalue=np.float64(1.3001305766721395e-05), df=np.int64(4))

In [25]:

# Paired t-test on the scores before and after the new teaching method.
t_statistic, p_value = ttest_rel(a=scores_before, b=scores_after)

# Report the results. The p-value uses four significant digits (.4g) so that a
# very small value is shown in scientific notation instead of rounding to 0.0000.
print(f"T-statistic: {t_statistic:.4f}")
print(f"P-value: {p_value:.4g}")

# Interpret at the 5% significance level.
if p_value <= 0.05:
    print("Reject the null hypothesis: The new teaching method has a significant effect.")
else:
    print("Fail to reject the null hypothesis: No significant difference found.")

T-statistic: -26.0000
P-value: 0.0000
Reject the null hypothesis: The new teaching method has a significant effect.


In [26]:
import pandas as pd
from scipy.stats import ttest_ind

# Reload the raw CSV so this comparison starts from the file rather than from the
# frame modified by earlier cells.
lung_raw = pd.read_csv("lungcapacity.csv")

# Keep rows with a recorded 'Smoke' value and a numeric lung capacity. Values that
# cannot be parsed as numbers are coerced to NaN and then dropped together with the
# missing ones. Every step returns a new frame, so no column is written into a
# filtered slice (which raises SettingWithCopyWarning) and nothing is mutated in place.
data = (
    lung_raw
    .dropna(subset=['Smoke'])
    .assign(**{
        'LungCap(cc)': lambda frame: pd.to_numeric(frame['LungCap(cc)'], errors='coerce')
    })
    .dropna(subset=['LungCap(cc)'])
)

# Lung capacities of smokers and of non-smokers, each selected in a single .loc step.
smokers = data.loc[data['Smoke'] == 'yes', 'LungCap(cc)']
non_smokers = data.loc[data['Smoke'] == 'no', 'LungCap(cc)']

In [33]:
# Largest lung capacity among smokers (skipna=True is the pandas default).
smokers.max(skipna=True)

np.float64(13.325)

In [35]:
# Largest lung capacity among non-smokers (skipna=True is the pandas default).
non_smokers.max(skipna=True)

np.float64(14.675)

In [29]:
# Student's two-sample t-test with pooled variance: equal_var=True is SciPy's
# default, written out here to distinguish it from a Welch test (equal_var=False).
ttest_ind(smokers, non_smokers, equal_var=True)

TtestResult(statistic=np.float64(2.7903581055455104), pvalue=np.float64(0.005402936077181694), df=np.float64(727.0))

In [30]:


# Welch's two-sample t-test: equal_var=False drops the equal-variance assumption.
welch_result = ttest_ind(smokers, non_smokers, equal_var=False)
t_statistic, p_value = welch_result

# Report the statistic and p-value.
print(f"T-statistic: {t_statistic:.4f}")
print(f"P-value: {p_value:.4f}")

# Interpret at the 5% significance level.
print(
    "Reject the null hypothesis: There's a significant difference in lung capacity between smokers and non-smokers."
    if p_value <= 0.05
    else "Fail to reject the null hypothesis: No significant difference found."
)

T-statistic: 3.7472
P-value: 0.0003
Reject the null hypothesis: There's a significant difference in lung capacity between smokers and non-smokers.
