In [8]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle

# Load your dataset
df = pd.read_csv('healthcare-dataset-stroke-data.csv')

# Handle missing values and other data cleaning steps
df['bmi'].fillna(df['bmi'].mean(), inplace=True)
df.drop_duplicates(inplace=True)

# Convert categorical features to numerical
categorical_features = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
label_encoders = {}
for feature in categorical_features:
    le = LabelEncoder()
    df[feature] = le.fit_transform(df[feature])
    label_encoders[feature] = le

# Define features and target
features = ['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 
            'work_type', 'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status']
target = 'stroke'
X = df[features]
y = df[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Save the model using pickle
model_filename = 's_prediction_model.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(model, file)

# Load the model from the file
with open(model_filename, 'rb') as file:
    loaded_model = pickle.load(file)


Accuracy: 0.9383561643835616
Confusion Matrix:
 [[959   1]
 [ 62   0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97       960
           1       0.00      0.00      0.00        62

    accuracy                           0.94      1022
   macro avg       0.47      0.50      0.48      1022
weighted avg       0.88      0.94      0.91      1022



In [9]:
# Example input values for prediction (these should be numerical values)
input_data = {
    'gender': 1,  # Male
    'age': 81, 
    'hypertension': 0, 
    'heart_disease': 0, 
    'ever_married': 1,  # Yes
    'work_type': 3,  # Private
    'Residence_type': 1,  # Urban
    'avg_glucose_level': 125, 
    'bmi': 40, 
    'smoking_status': 2  # formerly smoked
}

# Convert input data to DataFrame
input_df = pd.DataFrame([input_data])

# Ensure the feature order matches the training data
input_df = input_df[features]

# Predict using the loaded model
prediction = loaded_model.predict(input_df)
prediction_proba = loaded_model.predict_proba(input_df)

print(f"Predicted Stroke: {prediction[0]}")
print(f"Prediction Probability: {prediction_proba[0]}")


Predicted Stroke: 0
Prediction Probability: [0.77 0.23]
