<a href="https://colab.research.google.com/github/rishisg/ChatGPT/blob/main/KNN_Classification_Obesity_EmailSpam_Heart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# K-Nearest Neighbors (KNN) for Obesity Classification Dataset, Email Spam Classification Dataset and Heart Attack Analysis & Prediction Dataset
# 1. Setup and Data Preparation
# We’ll begin by importing the necessary libraries and loading the datasets.

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.impute import SimpleImputer


In [5]:
# 2. Load the Datasets
# We'll load each dataset for further analysis and preprocessing.

# Load Obesity Classification dataset
obesity_df = pd.read_csv('Obesity Classification.csv')

# Display first few rows of the dataset
obesity_df.head()


Unnamed: 0,ID,Age,Gender,Height,Weight,BMI,Label
0,1,25,Male,175,80,25.3,Normal Weight
1,2,30,Female,160,60,22.5,Normal Weight
2,3,35,Male,180,90,27.3,Overweight
3,4,40,Female,150,50,20.0,Underweight
4,5,45,Male,190,100,31.2,Obese


In [7]:
# Load Email Spam Classification dataset
spam_df = pd.read_csv('emails.csv')

# Display first few rows of the dataset
spam_df.head()


Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [8]:
# Load Heart Attack Prediction dataset
heart_df = pd.read_csv('heart.csv')

# Display first few rows of the dataset
heart_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [9]:
# 3. Preprocessing and Feature Engineering
# Handling Missing Values
# We'll use SimpleImputer for handling missing values in each dataset.

# Handle missing values for each dataset
obesity_imputer = SimpleImputer(strategy="mean")
obesity_df_imputed = pd.DataFrame(obesity_imputer.fit_transform(obesity_df.select_dtypes(include=[np.number])))

spam_imputer = SimpleImputer(strategy="mean")
spam_df_imputed = pd.DataFrame(spam_imputer.fit_transform(spam_df.select_dtypes(include=[np.number])))

heart_imputer = SimpleImputer(strategy="mean")
heart_df_imputed = pd.DataFrame(heart_imputer.fit_transform(heart_df.select_dtypes(include=[np.number])))


In [10]:
# Encode Categorical Variables
# For KNN, categorical variables should be encoded into numerical values.

# For Obesity Dataset
obesity_df['Gender'] = LabelEncoder().fit_transform(obesity_df['Gender'])
obesity_df['Label'] = LabelEncoder().fit_transform(obesity_df['Label'])

# For Spam Dataset
spam_df_encoded = pd.get_dummies(spam_df, drop_first=True)

# For Heart Attack Dataset
heart_df['Sex'] = LabelEncoder().fit_transform(heart_df['Sex'])
heart_df['ChestPainType'] = LabelEncoder().fit_transform(heart_df['ChestPainType'])
heart_df['RestingECG'] = LabelEncoder().fit_transform(heart_df['RestingECG'])
heart_df['ExerciseAngina'] = LabelEncoder().fit_transform(heart_df['ExerciseAngina'])
heart_df['ST_Slope'] = LabelEncoder().fit_transform(heart_df['ST_Slope'])
heart_df['HeartDisease'] = LabelEncoder().fit_transform(heart_df['HeartDisease'])


In [11]:
# Feature Scaling
# Since KNN is sensitive to the scale of the data, we’ll apply StandardScaler to all numeric features.

# For Obesity Dataset
scaler = StandardScaler()
obesity_df_scaled = scaler.fit_transform(obesity_df.select_dtypes(include=[np.number]))

# For Spam Dataset
spam_df_scaled = scaler.fit_transform(spam_df_encoded)

# For Heart Attack Dataset
heart_df_scaled = scaler.fit_transform(heart_df.select_dtypes(include=[np.number]))


In [13]:
# Splitting the Data into Training and Testing Sets

# For Obesity Dataset
X_obesity = obesity_df_scaled
y_obesity = obesity_df['Label']
X_train_obesity, X_test_obesity, y_train_obesity, y_test_obesity = train_test_split(X_obesity, y_obesity, test_size=0.3, random_state=42)

# For Spam Dataset
# Check the columns in the spam DataFrame
print(spam_df.columns)

# Assuming the target column is 'target' (replace with correct column name)
X_spam = spam_df_scaled
y_spam = spam_df['target']  # Correct the column name if needed
X_train_spam, X_test_spam, y_train_spam, y_test_spam = train_test_split(X_spam, y_spam, test_size=0.3, random_state=42)

# For Heart Attack Dataset
X_heart = heart_df_scaled
y_heart = heart_df['HeartDisease']
X_train_heart, X_test_heart, y_train_heart, y_test_heart = train_test_split(X_heart, y_heart, test_size=0.3, random_state=42)


Index(['Email No.', 'the', 'to', 'ect', 'and', 'for', 'of', 'a', 'you', 'hou',
       ...
       'connevey', 'jay', 'valued', 'lay', 'infrastructure', 'military',
       'allowing', 'ff', 'dry', 'Prediction'],
      dtype='object', length=3002)


In [14]:
# 4. KNN Model Construction
# Training the KNN Model
# For this, we’ll choose K=5 as a starting point, but this will be adjusted later based on hyperparameter tuning.

# For Obesity Dataset
knn_obesity = KNeighborsClassifier(n_neighbors=5)
knn_obesity.fit(X_train_obesity, y_train_obesity)

# For Spam Dataset
knn_spam = KNeighborsClassifier(n_neighbors=5)
knn_spam.fit(X_train_spam, y_train_spam)

# For Heart Attack Dataset
knn_heart = KNeighborsClassifier(n_neighbors=5)
knn_heart.fit(X_train_heart, y_train_heart)


In [15]:
# 5. Model Evaluation
# We will evaluate the models using Accuracy, Classification Report, Confusion Matrix,
# and ROC-AUC for binary classification.

# For Obesity Dataset
y_pred_obesity = knn_obesity.predict(X_test_obesity)
print("Obesity Model Accuracy:", accuracy_score(y_test_obesity, y_pred_obesity))
print("Classification Report:\n", classification_report(y_test_obesity, y_pred_obesity))
print("Confusion Matrix:\n", confusion_matrix(y_test_obesity, y_pred_obesity))

# For Spam Dataset
y_pred_spam = knn_spam.predict(X_test_spam)
print("Spam Model Accuracy:", accuracy_score(y_test_spam, y_pred_spam))
print("Classification Report:\n", classification_report(y_test_spam, y_pred_spam))
print("Confusion Matrix:\n", confusion_matrix(y_test_spam, y_pred_spam))

# For Heart Attack Dataset
y_pred_heart = knn_heart.predict(X_test_heart)
print("Heart Attack Model Accuracy:", accuracy_score(y_test_heart, y_pred_heart))
print("Classification Report:\n", classification_report(y_test_heart, y_pred_heart))
print("Confusion Matrix:\n", confusion_matrix(y_test_heart, y_pred_heart))


Obesity Model Accuracy: 0.9696969696969697
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       0.83      1.00      0.91         5
           2       1.00      0.89      0.94         9
           3       1.00      1.00      1.00        13

    accuracy                           0.97        33
   macro avg       0.96      0.97      0.96        33
weighted avg       0.97      0.97      0.97        33

Confusion Matrix:
 [[ 6  0  0  0]
 [ 0  5  0  0]
 [ 0  1  8  0]
 [ 0  0  0 13]]
Spam Model Accuracy: 0.9864690721649485
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      1530
           1       0.67      0.13      0.22        15
           2       0.00      0.00      0.00         3
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         3

    accuracy                

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
# 6. Hyperparameter Tuning and Model Optimization
# We will perform Grid Search for K to find the best hyperparameter.

from sklearn.model_selection import GridSearchCV

# Grid search for Obesity Dataset
param_grid = {'n_neighbors': np.arange(1, 21)}
grid_search_obesity = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid_search_obesity.fit(X_train_obesity, y_train_obesity)
print(f"Best K for Obesity Dataset: {grid_search_obesity.best_params_}")

# Grid search for Spam Dataset
grid_search_spam = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid_search_spam.fit(X_train_spam, y_train_spam)
print(f"Best K for Spam Dataset: {grid_search_spam.best_params_}")

# Grid search for Heart Attack Dataset
grid_search_heart = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid_search_heart.fit(X_train_heart, y_train_heart)
print(f"Best K for Heart Attack Dataset: {grid_search_heart.best_params_}")


Best K for Obesity Dataset: {'n_neighbors': np.int64(1)}




Best K for Spam Dataset: {'n_neighbors': np.int64(1)}
Best K for Heart Attack Dataset: {'n_neighbors': np.int64(1)}
