In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Read the diabetes CSV into a DataFrame and preview its first five rows.
# NOTE(review): the path is absolute (presumably a Colab upload location) —
# adjust it when running the notebook elsewhere.
df = pd.read_csv(filepath_or_buffer="/content/diabetes.csv")
df.head(n=5)

In [None]:
# Count the NaN entries in every column.
df.isna().sum()

In [None]:
# A zero in these measurement columns is treated as "not recorded" rather than
# a real reading, so it is converted to NaN here and imputed in a later cell.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[zero_as_missing_cols] = df[zero_as_missing_cols].replace(to_replace=0, value=np.nan)

In [None]:
# Preview the first five rows now that invalid zeros have become NaN.
df.head(n=5)

In [None]:
# Impute the NaN placeholders column by column: mean for Glucose,
# BloodPressure and BMI; median for SkinThickness and Insulin.
#
# The fill values are collected in one mapping and applied with a single
# DataFrame.fillna call that is assigned back to `df`. The previous form,
# `df[col].fillna(..., inplace=True)`, mutates an intermediate Series selected
# from the frame; pandas warns about that pattern and, with Copy-on-Write
# enabled, it no longer updates `df` at all.
#
# NOTE(review): these statistics are computed on the full dataset, i.e. before
# the train/test split performed further down, so test rows influence the
# imputed values.
imputation_values = {
    'Glucose': df['Glucose'].mean(),
    'BloodPressure': df['BloodPressure'].mean(),
    'SkinThickness': df['SkinThickness'].median(),
    'Insulin': df['Insulin'].median(),
    'BMI': df['BMI'].mean(),
}
df = df.fillna(value=imputation_values)

In [None]:
# Re-count NaN entries per column to confirm the imputation left none behind.
df.isna().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
summary_stats = df.describe()
summary_stats

In [None]:
# Print a concise summary of the DataFrame: the index range, each column's
# name, non-null count and dtype, plus memory usage.
df.info()

In [None]:
# Apply the seaborn theme with a white-grid background to the plots that follow.
sns.set_theme(style="whitegrid")

# Draw one 20-bin histogram per column, then title the figure they share.
df.hist(bins=20, figsize=(14, 10), color='teal')
hist_fig = plt.gcf()
hist_fig.suptitle('Distribution of Features')
plt.show()

In [None]:
# Pairwise correlations between all columns.
corr_matrix = df.corr()

# Render the matrix as an annotated heatmap on an explicitly created axes.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Matrix of Features')
plt.show()

In [None]:
# Bar chart of how many rows fall into each 'Outcome' class.
outcome_ax = sns.countplot(data=df, x='Outcome', palette='coolwarm')
outcome_ax.set_title('Outcome Distribution')
plt.show()

In [None]:
# Scatter-matrix of every pair of variables, coloured by 'Outcome'.
sns.pairplot(data=df, hue="Outcome", palette='coolwarm')
# Place the overall title slightly above the grid so it does not overlap it.
plt.gcf().suptitle('Pairplot of Features', y=1.02)
plt.show()

In [None]:
# One boxplot per feature, split by 'Outcome': Glucose first, then BMI.
boxplot_specs = (
    ('Glucose', 'Glucose Levels by Outcome'),
    ('BMI', 'BMI Levels by Outcome'),
)
for feature_name, plot_title in boxplot_specs:
    plt.figure(figsize=(8, 6))
    sns.boxplot(x='Outcome', y=feature_name, data=df, palette='coolwarm')
    plt.title(plot_title)
    plt.show()

In [None]:
# Histogram with a KDE overlay for Glucose (teal) and then BMI (coral).
distribution_specs = (
    ('Glucose', 'teal', 'Distribution of Glucose Levels'),
    ('BMI', 'coral', 'Distribution of BMI Levels'),
)
for feature_name, bar_color, plot_title in distribution_specs:
    plt.figure(figsize=(8, 6))
    sns.histplot(df[feature_name], kde=True, color=bar_color)
    plt.title(plot_title)
    plt.show()

In [None]:
# Step 1: separate the target column (y) from the remaining feature columns (X)
y = df['Outcome']
X = df.drop(columns='Outcome')

In [None]:
# Step 2: hold out 20% of the rows for testing; the fixed random_state makes
# the shuffle reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Step 3: standardise the features. The scaler is fitted on the training
# split only, then applied to both the training and the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Step 4: fit a linear-kernel support vector classifier on the scaled
# training data; the fitted estimator is shown as the cell output.
svm_model = SVC(kernel='linear').fit(X_train_scaled, y_train)
svm_model

In [None]:
# Step 5: predict a class label for every row of the scaled test set
y_pred = svm_model.predict(X=X_test_scaled)

In [None]:
# Step 6: Evaluate the model on the held-out test set
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
# The confusion matrix and the classification report are multi-line values.
# Starting each on its own line keeps the matrix rows and the report's
# column headers aligned instead of being shifted by the label text.
print(f"conf_matrix:\n{conf_matrix}")
print(f"class_report:\n{class_report}")