In [None]:
# ===========================
# Step 1: Load Dataset
# ===========================
import numpy as np
import pandas as pd
from google.colab import drive

# Attach Google Drive to the Colab filesystem so the CSV below can be read.
drive.mount('/content/drive')

# Location of the stroke dataset inside the mounted Drive
file_path = '/content/drive/My Drive/datasets/stroke_prediction_dataset.csv'
data = pd.read_csv(file_path)

# Report (rows, columns) of the frame that every later cell works on.
print("Full dataset shape:", data.shape)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Full dataset shape: (15000, 22)


In [None]:
# Print the column labels of `data`; later cells select columns by these exact names.
print(data.columns)

Index(['Patient ID', 'Patient Name', 'Age', 'Gender', 'Hypertension',
       'Heart Disease', 'Marital Status', 'Work Type', 'Residence Type',
       'Average Glucose Level', 'Body Mass Index (BMI)', 'Smoking Status',
       'Alcohol Intake', 'Physical Activity', 'Stroke History',
       'Family History of Stroke', 'Dietary Habits', 'Stress Levels',
       'Blood Pressure Levels', 'Cholesterol Levels', 'Symptoms', 'Diagnosis',
       'Hypertension_x_Age'],
      dtype='object')


In [None]:
import pandas as pd
from scipy import stats
from scipy.stats import chi2_contingency

# Example: Load your dataset
# data = pd.read_csv("your_dataset.csv")

# ---------------------------
# 0. Outcome encoding
# ---------------------------
# 'Diagnosis' is loaded as the text labels 'Stroke' / 'No Stroke'. Comparing
# those strings with 1 / 0 selects no rows, and a t-test on two empty groups
# returns nan. Derive a 0/1 view of the outcome here (without modifying `data`)
# so the cell gives the same answer whether or not the column has already been
# converted by another cell.
diagnosis_code = data['Diagnosis']
if not pd.api.types.is_numeric_dtype(diagnosis_code):
    diagnosis_code = diagnosis_code.map({'No Stroke': 0, 'Stroke': 1})

# ---------------------------
# 1. Hypothesis Test for Age
# ---------------------------
# H0: Age has no effect on stroke
# H1: Older age increases stroke risk
# (ttest_ind is called with its default settings, i.e. pooled variance.)

# Separate Age by Stroke outcome
age_stroke = data.loc[diagnosis_code == 1, 'Age']
age_no_stroke = data.loc[diagnosis_code == 0, 'Age']

# Independent t-test
t_stat, p_val = stats.ttest_ind(age_stroke, age_no_stroke)
print("Age t-test statistic:", t_stat)
print("Age p-value:", p_val)

# ---------------------------
# 2. Hypothesis Test for Hypertension
# ---------------------------
# H0: Hypertension not associated with stroke
# H1: Hypertension increases stroke risk

# Contingency table (rows: Hypertension values, columns: Diagnosis values).
# The chi-square test only needs category counts, so the outcome encoding
# does not affect it.
table = pd.crosstab(data['Hypertension'], data['Diagnosis'])

# Chi-square test
chi2, p, dof, expected = chi2_contingency(table)
print("\nHypertension Chi-square statistic:", chi2)
print("Hypertension p-value:", p)


Age t-test statistic: nan
Age p-value: nan

Hypertension Chi-square statistic: 1.955652510748815
Hypertension p-value: 0.16197903877847364


  return f(*args, **kwargs)


In [None]:
# How many rows fall into each Diagnosis category
print(data['Diagnosis'].value_counts())

# How many rows have no Age value
print(data['Age'].isnull().sum())


Diagnosis
No Stroke    7532
Stroke       7468
Name: count, dtype: int64
0


In [None]:
# Sanity checks on Age: missing-value count, storage dtype, summary statistics
print(data['Age'].isnull().sum())  # expected to be 0
print(data['Age'].dtypes)          # expected to be a numeric dtype
print(data['Age'].describe())      # count / mean / std / quartiles


0
int64
count    15000.000000
mean        54.035667
std         21.063111
min         18.000000
25%         36.000000
50%         54.000000
75%         72.000000
max         90.000000
Name: Age, dtype: float64


In [None]:
import pandas as pd
from scipy import stats
from scipy.stats import chi2_contingency, fisher_exact

# Assuming 'data' is your DataFrame

# ---------------------------
# 0. Outcome encoding
# ---------------------------
# 'Diagnosis' is loaded as the text labels 'Stroke' / 'No Stroke'. Filtering
# with `== 1` / `== 0` on those strings yields two empty groups and a nan
# t-test. Build a 0/1 view locally (leaving `data` untouched) so the result
# does not depend on whether another cell has already converted the column.
diagnosis_code = data['Diagnosis']
if not pd.api.types.is_numeric_dtype(diagnosis_code):
    diagnosis_code = diagnosis_code.map({'No Stroke': 0, 'Stroke': 1})

# ---------------------------
# 1. Hypothesis Test for Age
# ---------------------------
# H0: Age has no effect on stroke
# H1: Older age increases stroke risk

age_stroke = data.loc[diagnosis_code == 1, 'Age']
age_no_stroke = data.loc[diagnosis_code == 0, 'Age']

# Use Welch's t-test (equal_var=False: the two groups are not assumed to
# share a variance)
t_stat, p_val = stats.ttest_ind(age_stroke, age_no_stroke, equal_var=False)
print("Age t-test statistic:", t_stat)
print("Age p-value:", p_val)

# ---------------------------
# 2. Hypothesis Test for Hypertension
# ---------------------------
# H0: Hypertension not associated with stroke
# H1: Hypertension significantly increases stroke risk

# Contingency table (category counts only, so the outcome encoding does not
# change the result)
table = pd.crosstab(data['Hypertension'], data['Diagnosis'])

# Chi-square test
chi2, p, dof, expected = chi2_contingency(table)
print("\nHypertension Chi-square statistic:", chi2)
print("Hypertension p-value:", p)

# Additionally report Fisher's exact test whenever the table is 2x2.
# Note: this runs for every 2x2 table, not only when cell counts are small.
if table.shape == (2, 2):
    oddsratio, p_fisher = fisher_exact(table)
    print("Hypertension Fisher Exact p-value:", p_fisher)


Age t-test statistic: nan
Age p-value: nan

Hypertension Chi-square statistic: 1.955652510748815
Hypertension p-value: 0.16197903877847364
Hypertension Fisher Exact p-value: 0.15679988494409117


  return f(*args, **kwargs)


In [None]:
# Inspect how the outcome column is stored: its distinct values, how often
# each occurs, and its dtype. The filters in the test cells compare Diagnosis
# with 1 and 0, so the stored representation matters.
print(data['Diagnosis'].unique())
print(data['Diagnosis'].value_counts())
print(data['Diagnosis'].dtype)


['Stroke' 'No Stroke']
Diagnosis
No Stroke    7532
Stroke       7468
Name: count, dtype: int64
object


In [None]:
# Convert Diagnosis from its text labels to 0/1.
# The conversion is guarded so that re-running this cell is a no-op: sending an
# already-numeric column through the label dictionary finds no matching keys
# and would replace every value with NaN.
if not pd.api.types.is_numeric_dtype(data['Diagnosis']):
    data['Diagnosis'] = data['Diagnosis'].map({'No Stroke': 0, 'Stroke': 1})


In [None]:
# Age values of the rows coded 1 and of the rows coded 0 in Diagnosis
age_stroke = data.loc[data['Diagnosis'].eq(1), 'Age']
age_no_stroke = data.loc[data['Diagnosis'].eq(0), 'Age']

# Group sizes; both are expected to be non-zero
print(age_stroke.shape[0], age_no_stroke.shape[0])


7468 7532


In [None]:
from scipy import stats

# Welch's t-test (equal_var=False) on Age between the two outcome groups
welch_result = stats.ttest_ind(age_stroke, age_no_stroke, equal_var=False)
t_stat, p_val = welch_result
print("Age t-test statistic:", t_stat)
print("Age p-value:", p_val)


Age t-test statistic: -0.2522454350531318
Age p-value: 0.8008548176808882


In [None]:
import statsmodels.formula.api as smf

# Store both binary model variables as integers (in place on `data`)
for column in ('Diagnosis', 'Hypertension'):
    data[column] = data[column].astype(int)

# Logistic regression of Diagnosis on Age, Hypertension and their interaction
# ("Age * Hypertension" in the formula expands to both main effects plus the
# Age:Hypertension term)
model = smf.logit('Diagnosis ~ Age * Hypertension', data=data).fit()
print(model.summary())

# p-value reported for the Age:Hypertension term
p_value_interaction = model.pvalues['Age:Hypertension']
print("Interaction p-value:", p_value_interaction)


Optimization terminated successfully.
         Current function value: 0.693069
         Iterations 3
                           Logit Regression Results                           
Dep. Variable:              Diagnosis   No. Observations:                15000
Model:                          Logit   Df Residuals:                    14996
Method:                           MLE   Df Model:                            3
Date:                Mon, 01 Sep 2025   Pseudo R-squ.:               9.957e-05
Time:                        04:56:54   Log-Likelihood:                -10396.
converged:                       True   LL-Null:                       -10397.
Covariance Type:            nonrobust   LLR p-value:                    0.5579
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            0.0125      0.052      0.242      0.809      -0.089       0.114
Age        

In [None]:
import pandas as pd
import statsmodels.formula.api as smf
from scipy import stats
from scipy.stats import chi2_contingency, fisher_exact

# Step 1: make sure the columns used below are numeric.
# Diagnosis is translated to 0/1 only while it is still stored as text.
if not pd.api.types.is_numeric_dtype(data['Diagnosis']):
    data['Diagnosis'] = data['Diagnosis'].map({'No Stroke': 0, 'Stroke': 1})
data['Hypertension'] = data['Hypertension'].astype(int)


# Step 2: Age vs. stroke -- Welch's t-test between the two outcome groups.
age_stroke = data.loc[data['Diagnosis'].eq(1), 'Age']
age_no_stroke = data.loc[data['Diagnosis'].eq(0), 'Age']

t_stat, p_val = stats.ttest_ind(age_stroke, age_no_stroke, equal_var=False)
print("Age t-test statistic:", t_stat)
print("Age p-value:", p_val)

# Step 3: Hypertension vs. stroke -- chi-square test on the cross-tabulation.
table = pd.crosstab(index=data['Hypertension'], columns=data['Diagnosis'])
chi2, p, dof, expected = chi2_contingency(table)
print("\nHypertension Chi-square statistic:", chi2)
print("Hypertension p-value:", p)

# Fisher's exact test is reported as well whenever the table is 2x2.
if table.shape == (2, 2):
    oddsratio, p_fisher = fisher_exact(table)
    print("Hypertension Fisher Exact p-value:", p_fisher)

# Step 4: Age x Hypertension interaction via logistic regression.
# The product is stored as a column on `data`; the formula below does not
# reference that column and expands its own Age:Hypertension term.
data['Hypertension_x_Age'] = data['Age'] * data['Hypertension']
model = smf.logit('Diagnosis ~ Age * Hypertension', data=data).fit()
print("\nLogistic Regression Summary for Interaction:")
print(model.summary())

# p-value of the interaction term
p_interaction = model.pvalues['Age:Hypertension']
print("Interaction p-value:", p_interaction)

Age t-test statistic: -0.2522454350531318
Age p-value: 0.8008548176808882

Hypertension Chi-square statistic: 1.955652510748815
Hypertension p-value: 0.16197903877847364
Hypertension Fisher Exact p-value: 0.15679988494409117
Optimization terminated successfully.
         Current function value: 0.693069
         Iterations 3

Logistic Regression Summary for Interaction:
                           Logit Regression Results                           
Dep. Variable:              Diagnosis   No. Observations:                15000
Model:                          Logit   Df Residuals:                    14996
Method:                           MLE   Df Model:                            3
Date:                Mon, 01 Sep 2025   Pseudo R-squ.:               9.957e-05
Time:                        05:18:54   Log-Likelihood:                -10396.
converged:                       True   LL-Null:                       -10397.
Covariance Type:            nonrobust   LLR p-value:                    0.

In [None]:
# Show the distinct values currently stored in the Diagnosis column.
print(data['Diagnosis'].unique())

[nan]


In [None]:
# Count how many rows carry each distinct value of the Symptoms column
# (each distinct string in the column is counted as its own category).
print(data['Symptoms'].value_counts())


Symptoms
Difficulty Speaking                                         268
Numbness                                                    254
Headache                                                    253
Blurred Vision                                              251
Seizures                                                    249
                                                           ... 
Headache, Blurred Vision, Severe Fatigue, Headache            1
Severe Fatigue, Confusion, Headache, Difficulty Speaking      1
Dizziness, Numbness, Headache, Dizziness, Seizures            1
Weakness, Headache, Weakness, Severe Fatigue, Numbness        1
Confusion, Dizziness, Severe Fatigue, Dizziness               1
Name: count, Length: 5786, dtype: int64


In [None]:
from scipy.stats import chi2_contingency

# Create contingency table: one row per distinct Symptoms string, one column
# per Diagnosis value
table = pd.crosstab(data['Symptoms'], data['Diagnosis'])

# Chi-square test
chi2, p, dof, expected = chi2_contingency(table)
print("Symptoms Chi-square statistic:", chi2)
print("Symptoms p-value:", p)

# Validity check. The chi-square p-value is an approximation that is commonly
# considered trustworthy only when the expected frequency of every cell is at
# least 5. Symptoms has thousands of distinct strings, many occurring once, so
# report how much of the table violates that guideline instead of presenting
# the p-value without qualification.
low_expected_share = (expected < 5).mean()
if low_expected_share > 0:
    print(
        f"Warning: {low_expected_share:.1%} of the expected cell counts are below 5 "
        f"({table.shape[0]} symptom categories); the chi-square p-value is unreliable."
    )


Symptoms Chi-square statistic: 5828.6749874939105
Symptoms p-value: 0.34045775687694946


In [None]:
import pandas as pd
from scipy import stats
from scipy.stats import chi2_contingency, fisher_exact
import statsmodels.formula.api as smf

# Univariate screening: test every predictor against the stroke outcome and
# collect one row per test in `results_df`
# (columns: Variable, Test, Statistic/Coef, p-value, Significant).

# Derive the 0/1 outcome here instead of relying on an earlier cell having
# converted the column. With the raw text labels the `== 1` / `== 0` filters
# match nothing (every t-test would be skipped) and the logistic regression
# cannot be fitted. The tests run on a working copy; `data['Diagnosis']`
# itself is left as it is.
diagnosis_code = data['Diagnosis']
if not pd.api.types.is_numeric_dtype(diagnosis_code):
    diagnosis_code = diagnosis_code.map({'No Stroke': 0, 'Stroke': 1})
analysis_data = data.assign(Diagnosis=diagnosis_code)

results = []

# Continuous variables
# 'Blood Pressure Levels' and 'Cholesterol Levels' are left out because they are not numeric
continuous_vars = ['Age', 'Body Mass Index (BMI)', 'Average Glucose Level']

for var in continuous_vars:
    # Skip variables that are absent from the frame
    if var not in analysis_data.columns:
        print(f"Warning: Column '{var}' not found in the DataFrame. Skipping.")
        continue

    # Values of the variable per outcome group, ignoring missing entries
    group1 = analysis_data.loc[analysis_data['Diagnosis'] == 1, var].dropna()
    group0 = analysis_data.loc[analysis_data['Diagnosis'] == 0, var].dropna()

    # A t-test needs at least two observations in each group
    if len(group1) < 2 or len(group0) < 2:
        print(f"Warning: Not enough data points in both groups for variable '{var}'. Skipping t-test.")
        results.append([var, 't-test', None, None])
        continue

    # Welch's t-test (no equal-variance assumption)
    t_stat, p_val = stats.ttest_ind(group1, group0, equal_var=False)
    results.append([var, 't-test', t_stat, p_val])

# Categorical variables
categorical_vars = ['Gender','Marital Status','Work Type','Residence Type',
                    'Hypertension','Heart Disease','Smoking Status','Alcohol Intake',
                    'Physical Activity','Dietary Habits','Stress Levels']

for var in categorical_vars:
    # Skip variables that are absent from the frame
    if var not in analysis_data.columns:
        print(f"Warning: Column '{var}' not found. Skipping.")
        continue

    table = pd.crosstab(analysis_data[var], analysis_data['Diagnosis'])

    # The chi-square test needs at least two rows and two columns
    if table.empty or table.shape[0] <= 1 or table.shape[1] <= 1:
         print(f"Warning: Contingency table for variable '{var}' is not suitable for Chi-square test. Skipping.")
         results.append([var, 'Chi-square', None, None])
         continue

    chi2, p, dof, expected = chi2_contingency(table)
    results.append([var, 'Chi-square', chi2, p])

# Interaction: Age x Hypertension
if 'Age' in analysis_data.columns and 'Hypertension' in analysis_data.columns:
    # The product term is only meaningful for a numeric Hypertension column
    if not pd.api.types.is_numeric_dtype(analysis_data['Hypertension']):
        print("Warning: 'Hypertension' column is not numeric. Skipping interaction.")
        results.append(['Age x Hypertension', 'Logistic Regression', None, None])
    else:
        # Kept for any later use of the column; the formula below does not
        # reference it and expands its own Age:Hypertension term.
        data['Hypertension_x_Age'] = data['Hypertension'] * data['Age']
        # A failed fit is reported and recorded as a missing result rather
        # than aborting the whole screening.
        try:
            model = smf.logit('Diagnosis ~ Age * Hypertension', data=analysis_data).fit(disp=False)
            interaction_coef = model.params['Age:Hypertension']
            interaction_p = model.pvalues['Age:Hypertension']
            results.append(['Age x Hypertension', 'Logistic Regression', interaction_coef, interaction_p])
        except Exception as e:
            print(f"Warning: Could not fit logistic regression model for Age x Hypertension interaction. Error: {e}. Skipping.")
            results.append(['Age x Hypertension', 'Logistic Regression', None, None])
else:
     print("Warning: 'Age' or 'Hypertension' column not found. Skipping Age x Hypertension interaction.")
     results.append(['Age x Hypertension', 'Logistic Regression', None, None])


# Symptoms: every distinct Symptoms string is treated as one category.
# NOTE(review): the column has thousands of distinct, mostly rare strings, so
# the chi-square approximation is doubtful for this row -- interpret with care.
if 'Symptoms' in analysis_data.columns:
    table = pd.crosstab(analysis_data['Symptoms'], analysis_data['Diagnosis'])
    # The chi-square test needs at least two rows and two columns
    if table.empty or table.shape[0] <= 1 or table.shape[1] <= 1:
        print("Warning: Contingency table for 'Symptoms' is not suitable for Chi-square test. Skipping.")
        results.append(['Symptoms', 'Chi-square', None, None])
    else:
        chi2, p, dof, expected = chi2_contingency(table)
        results.append(['Symptoms', 'Chi-square', chi2, p])
else:
    print("Warning: 'Symptoms' column not found. Skipping.")
    results.append(['Symptoms', 'Chi-square', None, None])


# Create results DataFrame
results_df = pd.DataFrame(results, columns=['Variable', 'Test', 'Statistic/Coef', 'p-value'])
# Skipped tests are stored as None; cast to float so they become NaN and the
# threshold comparison below evaluates to False for them instead of failing
# on a None value.
result_value_cols = ['Statistic/Coef', 'p-value']
results_df[result_value_cols] = results_df[result_value_cols].astype(float)
results_df['Significant'] = results_df['p-value'] < 0.05
print(results_df)

                 Variable                 Test  Statistic/Coef   p-value  \
0                     Age               t-test       -0.252245  0.800855   
1   Body Mass Index (BMI)               t-test       -1.247345  0.212291   
2   Average Glucose Level               t-test        1.913908  0.055651   
3                  Gender           Chi-square        0.015087  0.902244   
4          Marital Status           Chi-square        5.341511  0.069200   
5               Work Type           Chi-square        2.452822  0.483878   
6          Residence Type           Chi-square        0.000939  0.975557   
7            Hypertension           Chi-square        1.955653  0.161979   
8           Heart Disease           Chi-square        0.046380  0.829486   
9          Smoking Status           Chi-square        0.933488  0.627041   
10         Alcohol Intake           Chi-square        2.637746  0.450911   
11      Physical Activity           Chi-square        0.359508  0.835476   
12         D

In [None]:
from scipy.stats import chi2_contingency, fisher_exact
import pandas as pd # Import pandas

# Chi-square test of Alcohol Intake against Diagnosis.
# The dataset spells the column 'Alcohol Intake' (with a space); looking for
# 'AlcoholIntake' never matches, so the test was never executed.
alcohol_col = 'Alcohol Intake'

if alcohol_col in data.columns:
    # Check unique values
    print(data[alcohol_col].value_counts())

    # Create contingency table
    table = pd.crosstab(data[alcohol_col], data['Diagnosis'])

    # Chi-square test
    chi2, p, dof, expected = chi2_contingency(table)
    print("Alcohol Intake Chi-square statistic:", chi2)
    print("Alcohol Intake p-value:", p)

    # Fisher's exact test is only reported when the table is 2x2
    if table.shape == (2,2):
        oddsratio, p_fisher = fisher_exact(table)
        print("Fisher Exact p-value:", p_fisher)
else:
    print(f"Error: '{alcohol_col}' column not found in the DataFrame.")

Error: 'AlcoholIntake' column not found in the DataFrame.


In [None]:
# Mark a row as significant when its p-value is strictly below 0.05
results_df['Significant'] = results_df['p-value'].lt(0.05)


In [None]:
# The significance flag has to follow from the test result. Setting it to True
# by hand would report Alcohol Intake as significant although its chi-square
# p-value in the results table is 0.450911. Recompute the flag for that row
# from its own p-value instead of overriding it.
alcohol_rows = results_df['Variable'] == 'Alcohol Intake'
results_df.loc[alcohol_rows, 'Significant'] = results_df.loc[alcohol_rows, 'p-value'] < 0.05


In [None]:
# A row is flagged only when both conditions hold: p-value below 0.05 and the
# absolute value of its statistic/coefficient above 0.1
small_p_value = results_df['p-value'].lt(0.05)
large_magnitude = results_df['Statistic/Coef'].abs().gt(0.1)
results_df['Significant'] = small_p_value & large_magnitude


In [None]:
# Chi-square test of Age_group against Diagnosis. The column is optional, so
# say so explicitly when it is missing instead of finishing without any output
# (which is indistinguishable from the cell not having run).
if 'Age_group' in data.columns:
    table = pd.crosstab(data['Age_group'], data['Diagnosis'])
    chi2, p, dof, expected = chi2_contingency(table)
    print("Age Group Chi-square:", chi2)
    print("p-value:", p)
else:
    print("Warning: 'Age_group' column not found. Skipping.")


In [None]:
import statsmodels.formula.api as smf

# Refit the Age * Hypertension logit without optimizer output and report only
# the coefficient and p-value of the Age:Hypertension term.
if all(name in data.columns for name in ('Age', 'Hypertension')):
    model = smf.logit('Diagnosis ~ Age * Hypertension', data=data).fit(disp=False)
    for label, estimates in (("Interaction coef:", model.params), ("p-value:", model.pvalues)):
        print(label, estimates['Age:Hypertension'])


Interaction coef: -0.00015041179019841683
p-value: 0.9331718299315017
