In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
dataset = pd.read_csv('/content/drive/MyDrive/major project/data set 1.csv')
dataset.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
# Step 2: Data Preprocessing
# Fill missing values in the 'bmi' column with the mean
dataset['bmi'].fillna(dataset['bmi'].mean(), inplace=True)

# Convert categorical columns into numerical formats using one-hot encoding
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
processed_dataset = pd.get_dummies(dataset, columns=categorical_columns, drop_first=True)

# Step 3: Define Features and Target
X = processed_dataset.drop(columns=['id', 'stroke'])  # Drop irrelevant and target columns
y = processed_dataset['stroke']

# Step 4: Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Step 5: Train the Naive Bayes Classifier
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Step 6: Make Predictions
y_pred = nb_model.predict(X_test)

# Step 7: Evaluate the Model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Step 8: Display Results
print("Model Accuracy:", accuracy)
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", conf_matrix)


Model Accuracy: 0.44129158512720157

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.41      0.59       972
           1       0.08      0.96      0.14        50

    accuracy                           0.44      1022
   macro avg       0.54      0.69      0.36      1022
weighted avg       0.95      0.44      0.56      1022


Confusion Matrix:
 [[403 569]
 [  2  48]]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset['bmi'].fillna(dataset['bmi'].mean(), inplace=True)
