In [None]:
import pandas as pd

# Read Strokes.csv from the working directory into `df`, then show the
# first five rows as the cell's rich output.
df = pd.read_csv('Strokes.csv')
df.head(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,avg_glucose_level_2
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1,233.00842
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1,205.245454
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1,107.460932
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1,173.376126
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1,175.038492


In [None]:
# Multiple linear regression with average glucose level as the dependent
# variable and age, gender, marital status and hypertension as the
# independent variables.

import statsmodels.api as sm
import pandas as pd

# Relies on the dataframe `df` loaded by an earlier cell.

# Integer-code the two text columns so they can enter the design matrix.
# The new columns are written onto `df` itself because later cells look
# for them there.
# NOTE(review): integer codes behave like a dummy variable only when the
# column has exactly two levels; with more levels (or missing values) they
# impose an arbitrary ordering — confirm the levels of `gender` and
# `ever_married`, and switch to one-hot encoding if there are more than two.
for source_col, coded_col in (
    ('gender', 'gender_numeric'),
    ('ever_married', 'marital_status_numeric'),  # 'ever_married' is taken as the marital-status column
):
    df[coded_col] = df[source_col].astype('category').cat.codes

# Response and predictors.
predictor_cols = ['age', 'gender_numeric', 'marital_status_numeric', 'hypertension']
y = df['avg_glucose_level']

# statsmodels does not add an intercept on its own, so prepend a constant column.
X = sm.add_constant(df[predictor_cols])

# Fit ordinary least squares and print the full regression table.
model = sm.OLS(endog=y, exog=X).fit()
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:      avg_glucose_level   R-squared:                       0.073
Model:                            OLS   Adj. R-squared:                  0.072
Method:                 Least Squares   F-statistic:                     100.4
Date:                Mon, 07 Jul 2025   Prob (F-statistic):           1.99e-82
Time:                        00:27:00   Log-Likelihood:                -26541.
No. Observations:                5110   AIC:                         5.309e+04
Df Residuals:                    5105   BIC:                         5.312e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                     84

In [None]:
# Test for multicollinearity among the regressors using variance inflation
# factors (VIF).

import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

# `X` is the design matrix built in the regression cell; it includes the
# `const` column, so the table below has a row for the intercept as well.
design_matrix = X.values

# One VIF per column of the design matrix, labelled with the column name.
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [
        variance_inflation_factor(design_matrix, col_idx)
        for col_idx in range(design_matrix.shape[1])
    ],
})

print("\nVIF Results:")
vif_data


VIF Results:


Unnamed: 0,feature,VIF
0,const,5.524865
1,age,1.958004
2,gender_numeric,1.001926
3,marital_status_numeric,1.858497
4,hypertension,1.084855


In [None]:
# Split into training and test sets, as preparation for choosing the best
# model among different sets of independent variables.

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import itertools

# Relies on the dataframe `df` loaded by an earlier cell.

# Make sure the integer-coded categorical columns exist. If the regression
# cell has already created them they are left as they are; otherwise they
# are derived here from the text columns.
for source_col, coded_col in {
    'gender': 'gender_numeric',
    'ever_married': 'marital_status_numeric',  # 'ever_married' is taken as the marital-status column
}.items():
    if coded_col not in df.columns:
        df[coded_col] = df[source_col].astype('category').cat.codes

# Dependent variable.
y = df['avg_glucose_level']

# Every predictor that may take part in the subset search.
all_independent_vars = ['age', 'gender_numeric', 'marital_status_numeric', 'hypertension']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train_all, X_test_all, y_train, y_test = train_test_split(
    df[all_independent_vars],
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Best-subset search over the candidate predictors.
#
# Every non-empty combination of `all_independent_vars` is tried. The
# subset is chosen by mean 5-fold cross-validated R^2 computed on the
# TRAINING split only. The test split takes no part in the choice, so the
# test R^2 reported for the winner is not inflated by selecting on the
# test data.
#
# Names left behind for later cells:
#   best_features - predictor subset with the highest cross-validated R^2
#   best_model    - LinearRegression fitted on the training split with best_features
#   best_r2       - test-set R^2 of best_model
#   best_cv_r2    - the cross-validated R^2 that won the selection
#   current_features, model, r2 - values from the last subset tried, which
#                   is the full predictor set (combinations are generated
#                   in order of increasing size)
from sklearn.model_selection import cross_val_score

best_model = None
best_r2 = -float('inf')
best_features = None
best_cv_r2 = -float('inf')

# Iterate through all possible combinations of independent variables
for i in range(1, len(all_independent_vars) + 1):
    for combo in itertools.combinations(all_independent_vars, i):
        current_features = list(combo)
        # Select the current set of features for training and testing
        X_train_current = X_train_all[current_features]
        X_test_current = X_test_all[current_features]

        # Selection criterion: cross-validated R^2 on the training data.
        # An integer `cv` gives unshuffled K-fold splits for a regressor,
        # so the score is deterministic.
        cv_r2 = cross_val_score(
            LinearRegression(), X_train_current, y_train, cv=5, scoring='r2'
        ).mean()

        # Fit on the whole training split and score on the test split.
        # This score is kept for reporting only and is never used to
        # choose the subset.
        model = LinearRegression()
        model.fit(X_train_current, y_train)
        y_pred = model.predict(X_test_current)
        r2 = r2_score(y_test, y_pred)

        # Keep this subset if it cross-validates better than the current best
        if cv_r2 > best_cv_r2:
            best_cv_r2 = cv_r2
            best_r2 = r2
            best_model = model
            best_features = current_features

In [None]:
# Check R square for the full model (all candidate predictors).
#
# The full model is fitted and scored explicitly here. Reading loop
# variables left over from the subset search would only report the full
# model if the full set happened to be the last combination tried, and
# would break if that cell were edited, re-ordered or skipped.
full_features = list(all_independent_vars)
full_model = LinearRegression().fit(X_train_all[full_features], y_train)
full_r2 = r2_score(y_test, full_model.predict(X_test_all[full_features]))

# Print the test-set R^2 of the full model
print(f"Features: {full_features}, R^2: {full_r2:.4f}")

Features: ['age', 'gender_numeric', 'marital_status_numeric', 'hypertension'], R^2: 0.0381


In [None]:
# Report the outcome of the subset search: the winning R^2 score and the
# predictors that produced it.
print(
    f"Best R^2 score: {best_r2}",
    f"Best model features: {best_features}",
    sep="\n",
)

# `best_model` can be reused with `best_features` for further analysis or
# predictions, for example on the test set:
# best_y_pred = best_model.predict(X_test_all[best_features])

Best R^2 score: 0.04503822219811693
Best model features: ['age', 'marital_status_numeric']
