In [None]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Step 1: Read the diabetes CSV into a DataFrame (edit the path for your environment)
data = pd.read_csv('/content/diabetes.csv')

# Step 2: Preview the first rows
print(data.head())

# Step 3: Preprocessing
# Report the number of missing values per column
print(data.isna().sum())

# Should any column report missing values, impute them before modelling, e.g.:
# data.fillna(data.mean(), inplace=True)

# Step 4: Separate features from target.
# The last column is taken as the target; every other column is a feature.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# Step 5: Hold out 20% of the rows for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 6: Standardise features.
# The scaler learns its statistics from the training split only and is then
# applied to both splits, so nothing from the test rows leaks into the fit.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Step 7: Fit a linear-kernel SVM (other kernels such as 'rbf' or 'poly' can be swapped in)
model = SVC(kernel='linear').fit(X_train, y_train)

# Step 8: Predict labels for the held-out rows
y_pred = model.predict(X_test)

# Step 9: Report metrics. The confusion matrix is computed once and reused
# for both the printed summary and the heatmap below.
cm = confusion_matrix(y_test, y_pred)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", cm)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Step 10: Visualization (optional but useful)
# Confusion matrix heatmap with readable class names on both axes
class_labels = ['No Diabetes', 'Diabetes']
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()



In [None]:
# Importing additional libraries
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Everything this cell needs beyond the imports above, gathered in one place.
# pandas and numpy are imported here too so the cell does not silently depend
# on an earlier cell having been executed first.
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Step 1: Load the dataset (adjust path if needed)
data = pd.read_csv('/content/diabetes.csv')

# Step 2: Check the first few rows of the dataset
print(data.head())

# Step 3: Preprocessing
# Check for missing values and handle them if necessary
print(data.isnull().sum())

# If there are any missing values, fill them with mean or median (as needed)
# Example:
# data.fillna(data.mean(), inplace=True)

# Step 4: Feature selection (assuming the last column is the target variable)
X = data.iloc[:, :-1]  # Features (all columns except the last one)
y = data.iloc[:, -1]   # Target (last column, which is the 'Outcome')

# Step 5: Train-Test Split (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Feature scaling
# The scaler is fitted on the training split only; from here on X_train and
# X_test are standardised NumPy arrays, not DataFrames.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 7: Build the SVM Model
model = SVC(kernel='linear')  # You can change the kernel if needed, e.g., 'rbf', 'poly', etc.
model.fit(X_train, y_train)

# Step 8: Make Predictions
y_pred = model.predict(X_test)

# Step 9: Evaluate the Model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Step 10: Additional Visualizations

## 1. Distribution of the target variable ('Outcome')
plt.figure(figsize=(6, 4))
sns.countplot(x='Outcome', data=data, palette='coolwarm')
plt.title('Distribution of Diabetes (Outcome)')
plt.xlabel('Outcome')
plt.ylabel('Count')
plt.show()

## 2. Feature distribution (for each feature)
features = X.columns
# Size the subplot grid from the number of features instead of assuming it
# fits in 3x3. The grid never shrinks below the 3x3 layout, so the figure is
# unchanged for up to nine features and simply grows taller beyond that.
n_cols = 3
n_rows = max(3, (len(features) + n_cols - 1) // n_cols)
plt.figure(figsize=(15, 10 * n_rows / 3))
for i, feature in enumerate(features):
    plt.subplot(n_rows, n_cols, i + 1)
    sns.histplot(data[feature], kde=True, color='blue')
    plt.title(f'Distribution of {feature}')
# Lay the figure out once, after every subplot exists, rather than on each iteration.
plt.tight_layout()
plt.show()

## 3. Correlation heatmap between features
correlation_matrix = data.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()

## 4. If there are exactly two features, draw the SVM decision boundary directly in the
##    (scaled) feature space -- no dimensionality reduction is needed
if X.shape[1] == 2:
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=X_test[:, 0], y=X_test[:, 1], hue=y_test, palette='coolwarm', marker='o')

    # Decision Boundary: classify every point of a 100x100 grid spanning the test data
    xx, yy = np.meshgrid(np.linspace(X_test[:, 0].min(), X_test[:, 0].max(), 100),
                         np.linspace(X_test[:, 1].min(), X_test[:, 1].max(), 100))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, alpha=0.3)
    plt.title('SVM Decision Boundary')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.show()

## 5. If the data has more than 2 features, use PCA to reduce the dimensions to 2 for decision boundary visualization
elif X.shape[1] > 2:
    # PCA must be fitted on the *scaled* training matrix: that is the space the
    # SVM was trained in. Fitting it on the raw DataFrame and then projecting
    # the standardised arrays would mix two different feature scales, and
    # inverse_transform() would hand the model raw-scale inputs it never saw.
    pca = PCA(n_components=2)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    # Projection of the full dataset (scaled with the same scaler), kept under
    # its original name for any later cell that reads it.
    X_pca = pca.transform(scaler.transform(X))

    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=X_test_pca[:, 0], y=X_test_pca[:, 1], hue=y_test, palette='coolwarm', marker='o')

    # SVM decision boundary in 2D PCA space.
    # Each grid point is mapped back to the scaled feature space via
    # inverse_transform() and classified there, so the shaded regions show the
    # model's predictions on the plane spanned by the two principal components
    # (a 2D slice of the full decision boundary, not the boundary itself).
    xx, yy = np.meshgrid(np.linspace(X_test_pca[:, 0].min(), X_test_pca[:, 0].max(), 100),
                         np.linspace(X_test_pca[:, 1].min(), X_test_pca[:, 1].max(), 100))
    Z = model.predict(pca.inverse_transform(np.c_[xx.ravel(), yy.ravel()]))
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, alpha=0.3)
    plt.title('SVM Decision Boundary (PCA-reduced)')
    plt.xlabel('PCA 1')
    plt.ylabel('PCA 2')
    plt.show()

else:
    # A 2-component PCA cannot be fitted on fewer than two features.
    print("Skipping decision boundary plot: at least 2 features are required.")
