In [None]:
# Imbalanced dataset demo: SMOTE oversampling with a Gaussian Naive Bayes classifier
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.naive_bayes import GaussianNB  # Importing Naive Bayes
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns

# Generate a synthetic, imbalanced binary dataset (class weights 0.9 / 0.1).
X, y = make_classification(n_samples=1000, n_features=10, n_classes=2,
                           class_sep=2, weights=[0.9, 0.1], random_state=42)

# Display class distribution (count once, reuse for the print and the plot)
original_counts = Counter(y)
print("Original class distribution:", original_counts)

# Plot the class distribution
plt.figure(figsize=(6,4))
plt.bar(['Class 0', 'Class 1'], [original_counts[0], original_counts[1]], color=['blue', 'orange'])
plt.title('Original Class Distribution')
plt.ylabel('Frequency')
plt.show()

# Hold out the test set BEFORE any resampling.
# SMOTE synthesises minority samples by interpolating between existing ones, so
# resampling the full dataset first would (a) leak information about test points
# into the training data and (b) put synthetic points into the test set.
# `stratify=y` keeps the minority-class ratio the same in both folds.
X_train_imbalanced, X_test_imbalanced, y_train_imbalanced, y_test_imbalanced = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

# Apply SMOTE (Synthetic Minority Over-sampling Technique) to the training fold only
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train_imbalanced, y_train_imbalanced)

# Display new class distribution of the training fold
resampled_counts = Counter(y_res)
print("Resampled training class distribution:", resampled_counts)

# Plot the resampled class distribution
plt.figure(figsize=(6,4))
plt.bar(['Class 0', 'Class 1'], [resampled_counts[0], resampled_counts[1]], color=['blue', 'orange'])
plt.title('Resampled Training Class Distribution (SMOTE)')
plt.ylabel('Frequency')
plt.show()

# Both models are evaluated on the SAME untouched, real test fold so that their
# metrics are directly comparable.
X_train, y_train = X_res, y_res
X_test, y_test = X_test_imbalanced, y_test_imbalanced

# Naive Bayes trained on the SMOTE-resampled training fold
clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred_resampled = clf.predict(X_test)

# Evaluate the SMOTE-trained model on the held-out data
print("Classification report (trained on SMOTE-resampled data):")
print(classification_report(y_test, y_pred_resampled))

# Confusion matrix for the SMOTE-trained model
cm_resampled = confusion_matrix(y_test, y_pred_resampled)
disp = ConfusionMatrixDisplay(confusion_matrix=cm_resampled, display_labels=['Class 0', 'Class 1'])
disp.plot(cmap='Blues')
plt.title("Confusion Matrix - Trained on Resampled Data")
plt.show()

# Baseline: a separate Naive Bayes model trained on the original imbalanced training fold.
# A fresh estimator is used so `clf` above is not silently overwritten.
clf_imbalanced = GaussianNB()
clf_imbalanced.fit(X_train_imbalanced, y_train_imbalanced)
y_pred_imbalanced = clf_imbalanced.predict(X_test_imbalanced)

# Evaluate the baseline model on the same held-out data
print("Classification report (trained on imbalanced data):")
print(classification_report(y_test_imbalanced, y_pred_imbalanced))

# Confusion matrix for the baseline model
cm_imbalanced = confusion_matrix(y_test_imbalanced, y_pred_imbalanced)
disp_imbalanced = ConfusionMatrixDisplay(confusion_matrix=cm_imbalanced, display_labels=['Class 0', 'Class 1'])
disp_imbalanced.plot(cmap='Blues')
plt.title("Confusion Matrix - Trained on Imbalanced Data")
plt.show()

# Compare minority-class (Class 1) precision, recall, and F1-score between the two
# reports to see how much improvement SMOTE brings on unseen, real data.


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Noise-free ground truth: y = 2X + 1 sampled at 100 evenly spaced points in [0, 10].
np.random.seed(42)
X = np.linspace(0, 10, num=100)
y = 2 * X + 1

# Observed targets are the ground truth plus Gaussian noise (mean 0, std 2).
noise = np.random.normal(loc=0, scale=2, size=X.shape)
y_noisy = y + noise

# Four hand-picked points lying well above the true line act as outliers.
X_outliers = np.array([2, 4, 6, 8])
y_outliers = np.array([25, 30, 28, 35])
X_combined = np.hstack((X, X_outliers))
y_combined = np.hstack((y_noisy, y_outliers))

# 80/20 train/test split of the contaminated data.
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y_combined, test_size=0.2, random_state=42
)

# scikit-learn expects 2-D feature arrays: turn each 1-D vector into a single column.
X_train, X_test = (part.reshape(-1, 1) for part in (X_train, X_test))

# Ordinary least-squares fit on the training fold, then predictions for both folds.
model = LinearRegression().fit(X_train, y_train)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Overlay the data, the fitted line, and the true relationship on one set of axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X_train, y_train, color='blue', label='Training data')
ax.scatter(X_test, y_test, color='green', label='Test data')
ax.plot(X_train, y_train_pred, color='red', label='Fitted line (with outliers)')
ax.plot(X, y, color='black', label='True line (y = 2X + 1)', linestyle='--')
ax.legend()
ax.set_xlabel('X')
ax.set_ylabel('y')
ax.set_title('Linear Regression with Outliers')
plt.show()


The blue dots represent the training data, which includes both the normal data and the outliers.
The green dots represent the test data.
The red line is the fitted line from the linear regression model. Because all four outliers lie well above the true line, those that land in the training set pull the fit upward, away from the true underlying relationship (the black dashed line).
The black dashed line is the true relationship y = 2X + 1.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Step 1: Synthetic data - 100 samples, 2 standard-normal features, binary target.
np.random.seed(42)
X = np.random.standard_normal((100, 2))

# A sample belongs to class 1 when its two (clean) features sum to more than zero.
feature_sum = X[:, 0] + X[:, 1]
y = (feature_sum > 0).astype(int)

# Step 2: Corrupt the features with Gaussian noise (mean 0, std 1); labels stay clean.
X_noisy = X + np.random.normal(loc=0, scale=1, size=X.shape)

# Step 3: 70/30 train/test split of the noisy features.
X_train, X_test, y_train, y_test = train_test_split(
    X_noisy, y, test_size=0.3, random_state=42
)

# Steps 4-6: Fit Gaussian Naive Bayes, predict the test fold, and score accuracy.
model = GaussianNB().fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Step 7: Plot the noisy samples (circles = train, crosses = test).
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap='coolwarm', marker='o', label='Train data')
ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap='coolwarm', marker='x', label='Test data')

# Decision regions: classify every node of a 0.1-spaced grid that extends one unit
# beyond the observed range of each feature.
feature_min = X_noisy.min(axis=0) - 1
feature_max = X_noisy.max(axis=0) + 1
x_min, y_min = feature_min
x_max, y_max = feature_max
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = model.predict(grid_points).reshape(xx.shape)

ax.contourf(xx, yy, Z, alpha=0.3)
ax.set_title(f'Naive Bayes Classifier with Noisy Data\nAccuracy: {accuracy:.2f}')
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.legend()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from scipy.stats import entropy

# Step 1: Reference dataset - two independent standard-normal features.
np.random.seed(42)
n_samples = 500
X1 = np.random.normal(loc=0, scale=1, size=n_samples)  # Feature 1
X2 = np.random.normal(loc=0, scale=1, size=n_samples)  # Feature 2
X = np.stack([X1, X2], axis=1)
y = (X1 + X2 > 0).astype(int)  # Binary target: 1 when the features sum above zero

# Step 2: Logistic regression trained on the reference data only.
model = LogisticRegression().fit(X, y)

# Step 3: Drifted dataset - same spread, but both feature means move from 0 to 1.
X1_drifted = np.random.normal(loc=1, scale=1, size=n_samples)
X2_drifted = np.random.normal(loc=1, scale=1, size=n_samples)
X_drifted = np.stack([X1_drifted, X2_drifted], axis=1)
# Labels follow the same rule as before, applied to the drifted features.
y_drifted = (X1_drifted + X2_drifted > 0).astype(int)

# Step 4: Detect drift using KL divergence
def kl_divergence(p, q, bins=10):
    """Estimate the KL divergence KL(P || Q) between two 1-D samples.

    Both samples are histogrammed over the SAME bin edges, spanning the pooled
    range of ``p`` and ``q``. Binning each sample over its own min/max would
    compare bins that cover different value ranges, so a pure shift of the
    distribution would be reported as (almost) no divergence.

    Parameters
    ----------
    p, q : array-like
        Samples drawn from the reference (P) and comparison (Q) distributions.
        Inputs are flattened to 1-D.
    bins : int, str, or sequence, default 10
        Bin specification forwarded to ``numpy.histogram_bin_edges``.

    Returns
    -------
    float
        Histogram-based estimate of KL(P || Q) in nats (natural logarithm).

    Raises
    ------
    ValueError
        If either sample is empty.
    """
    p = np.asarray(p, dtype=float).ravel()
    q = np.asarray(q, dtype=float).ravel()
    if p.size == 0 or q.size == 0:
        raise ValueError("kl_divergence requires two non-empty samples")

    # Shared edges so bin i means the same value interval for both samples.
    edges = np.histogram_bin_edges(np.concatenate((p, q)), bins=bins)
    p_hist, _ = np.histogram(p, bins=edges)
    q_hist, _ = np.histogram(q, bins=edges)

    # Convert counts to probabilities, then smooth so empty bins do not produce
    # log(0) or a division by zero. `entropy` re-normalises both inputs.
    p_prob = p_hist / p_hist.sum() + 1e-10
    q_prob = q_hist / q_hist.sum() + 1e-10
    return entropy(p_prob, q_prob)

# Per-feature drift score: KL divergence of the original sample against the drifted one.
kl_X1 = kl_divergence(X1, X1_drifted)
kl_X2 = kl_divergence(X2, X2_drifted)

# Step 5: Score the model (trained on the original data) against the drifted data.
y_pred_drifted = model.predict(X_drifted)
drifted_probabilities = model.predict_proba(X_drifted)
accuracy_drifted = accuracy_score(y_drifted, y_pred_drifted)
log_loss_drifted = log_loss(y_drifted, drifted_probabilities)

# Step 6: Side-by-side histograms, one panel per feature, original vs. drifted.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
panels = [
    (1, X1, X1_drifted, kl_X1),
    (2, X2, X2_drifted, kl_X2),
]
for ax, (feature_no, original_values, drifted_values, kl_value) in zip(axes, panels):
    ax.hist(original_values, bins=20, alpha=0.6,
            label=f"Feature {feature_no} (original)", color="blue")
    ax.hist(drifted_values, bins=20, alpha=0.6,
            label=f"Feature {feature_no} (drifted)", color="orange")
    ax.set_title(f"KL Divergence for Feature {feature_no}: {kl_value:.4f}")
    ax.legend()

plt.show()

# Step 7: Report how the model performs on the drifted data.
print(f"Model accuracy on drifted data: {accuracy_drifted:.4f}")
print(f"Log loss on drifted data: {log_loss_drifted:.4f}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# StandardScaler is needed below; imported here so this cell stays self-contained.
from sklearn.preprocessing import StandardScaler

# Simulated dataset (6 rows - far too small for real conclusions; illustration only)
data = pd.DataFrame({
    'Income': [30000, 45000, 60000, 80000, 20000, 50000],
    'CreditScore': [600, 650, 700, 750, 550, 680],
    'Gender': [0, 1, 0, 1, 0, 1],  # 0: Male, 1: Female
    'Approved': [0, 1, 1, 1, 0, 1]
})

X = data[['Income', 'CreditScore', 'Gender']]
y = data['Approved']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the features using statistics from the training fold only.
# Income is in the tens of thousands while Gender is 0/1; without a common scale
# the coefficients cannot be compared with each other, and the solver may
# struggle to converge.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Display results. zero_division=0 avoids undefined-metric warnings when a class
# is never predicted in the very small test fold.
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

# Identifying bias in coefficients: each value is the change in log-odds per one
# standard deviation of the feature, so magnitudes are comparable across features.
print("Model Coefficients:", model.coef_)
for feature_name, coefficient in zip(X.columns, model.coef_[0]):
    print(f"  {feature_name}: {coefficient:+.4f}")


In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import cv2
import matplotlib.pyplot as plt

# Function to add noise to an image
def add_noise(image, noise_std=0.1):
    """Return a noisy copy of ``image`` with values clipped to [0, 1].

    Zero-mean Gaussian noise is drawn from NumPy's global random state, so
    results are reproducible after ``np.random.seed(...)``.

    Parameters
    ----------
    image : array-like
        Pixel intensities; converted to a float array. The input is not modified.
    noise_std : float, default 0.1
        Standard deviation of the added noise. ``0`` adds no noise (the result
        is then just the input clipped to [0, 1]).

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``image``.
    """
    image = np.asarray(image, dtype=float)
    noise = np.random.normal(0, noise_std, image.shape)
    noisy_image = image + noise
    # Clip so the result remains a valid normalized intensity image.
    return np.clip(noisy_image, 0, 1)

# Generate a simple dataset of images
def generate_dataset(image_path, num_samples=100):
    """Build a two-class dataset of clean vs. noisy copies of a single image.

    The image is loaded as grayscale, resized to 64x64, and scaled to [0, 1].
    The dataset alternates between the clean image (label 0) and a freshly
    noised copy produced by ``add_noise`` (label 1).

    Parameters
    ----------
    image_path : str
        Path of the image file to load.
    num_samples : int, default 100
        Requested dataset size. ``num_samples // 2`` clean/noisy pairs are
        generated, so an odd value yields one sample fewer.

    Returns
    -------
    tuple of numpy.ndarray
        ``(images, labels)`` where each row of ``images`` is a flattened
        64x64 image and ``labels`` holds the matching 0/1 class.

    Raises
    ------
    OSError
        If the image cannot be loaded.
    """
    # Load and preprocess the original image
    original_image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if original_image is None:
        # cv2.imread reports failure by returning None rather than raising;
        # fail here with a clear message instead of a cryptic error in resize.
        raise OSError(
            f"Could not load image {image_path!r} (missing file or unsupported format)"
        )
    original_image = cv2.resize(original_image, (64, 64))  # Resize for simplicity
    original_image = original_image / 255.0  # Normalize to [0, 1]

    # The clean sample is identical on every iteration, so flatten it once.
    clean_vector = original_image.flatten()

    # Create dataset
    images = []
    labels = []  # 0 for original, 1 for noisy

    for _ in range(num_samples // 2):
        # Add original image (class 0)
        images.append(clean_vector)
        labels.append(0)

        # Add noisy version (class 1); new noise is drawn for each sample
        images.append(add_noise(original_image).flatten())
        labels.append(1)

    return np.array(images), np.array(labels)

# Define a simple deep learning model
class SimpleNN(nn.Module):
    """Three-layer fully connected classifier: input_size -> 128 -> 64 -> 2.

    ReLU follows each of the first two linear layers; the last layer returns
    raw, unnormalized scores for the two classes (Original, Noisy).
    """

    def __init__(self, input_size):
        """Create the layers for inputs with ``input_size`` features."""
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=2)  # Two classes: Original and Noisy
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map a batch of flattened inputs to per-class scores."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)

# Seed NumPy (used for the image noise) and PyTorch (used for weight initialization)
# so that a fresh Restart & Run All reproduces the same losses and accuracy.
np.random.seed(42)
torch.manual_seed(42)

# Generate dataset
image_path = 'image.jpg'  # Replace with your image file
X, y = generate_dataset(image_path, num_samples=200)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert data to PyTorch tensors (float32 features, int64 class labels for CrossEntropyLoss)
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Define model, loss function, and optimizer
input_size = X_train.shape[1]
model = SimpleNN(input_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model (full-batch gradient descent: one update per epoch)
epochs = 20
model.train()
for epoch in range(epochs):
    # Forward pass
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}")

# Evaluate the model (inference mode, no gradient tracking)
model.eval()
with torch.no_grad():
    y_pred = model(X_test_tensor)
    y_pred_classes = torch.argmax(y_pred, dim=1).numpy()
    accuracy = accuracy_score(y_test, y_pred_classes)
    print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Visualize a test image with prediction
test_image = X_test[0].reshape(64, 64)
prediction = y_pred_classes[0]

plt.imshow(test_image, cmap='gray')
plt.title(f"Predicted: {'Noisy' if prediction == 1 else 'Original'}")
plt.axis('off')
plt.show()
