In [None]:
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.naive_bayes import GaussianNB  # Importing Naive Bayes
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns


In [None]:
# Compare a Gaussian Naive Bayes classifier trained on the raw imbalanced
# training data with one trained on SMOTE-oversampled training data.
#
# The held-out test set is created BEFORE any resampling and is shared by both
# models. Oversampling first and splitting afterwards would (a) let synthetic
# points interpolated from test rows leak into training and (b) score the model
# on synthetic samples instead of real ones.


def plot_class_distribution(labels, title):
    """Draw a two-bar chart with the number of class-0 and class-1 entries in `labels`."""
    counts = Counter(labels)  # Counter tallies how often each label occurs
    plt.figure(figsize=(6, 4))
    plt.bar(['Class 0', 'Class 1'], [counts[0], counts[1]], color=['cyan', 'black'])
    plt.title(title)
    plt.ylabel('Frequency')
    plt.show()


def report_and_plot(y_true, y_pred, heading, plot_title):
    """Print a classification report and show the confusion matrix.

    Returns the confusion matrix array and the ConfusionMatrixDisplay object.
    """
    print(heading)
    print(classification_report(y_true, y_pred))
    cm = confusion_matrix(y_true, y_pred)
    display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Class 0', 'Class 1'])
    display.plot(cmap='Blues')
    plt.title(plot_title)
    plt.show()
    return cm, display


# Synthetic binary problem; `weights` requests roughly 90% class 0 / 10% class 1
X, y = make_classification(n_samples=1000, n_features=10, n_classes=2,
                           class_sep=2, weights=[0.9, 0.1], random_state=42)

print("Original class distribution:", Counter(y))
plot_class_distribution(y, 'Original Class Distribution')

# Hold out the test set first. `stratify=y` keeps the class ratio the same in
# the training and test portions, which matters when one class is rare.
X_train_imbalanced, X_test_imbalanced, y_train_imbalanced, y_test_imbalanced = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)

# Apply SMOTE (Synthetic Minority Over-sampling Technique) to the TRAINING data only
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train_imbalanced, y_train_imbalanced)

print("Resampled class distribution:", Counter(y_res))
plot_class_distribution(y_res, 'Resampled Class Distribution (SMOTE)')

# The SMOTE model trains on the resampled training data and is evaluated on the
# untouched test set (the same one used for the baseline below).
X_train, y_train = X_res, y_res
X_test, y_test = X_test_imbalanced, y_test_imbalanced

clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred_resampled = clf.predict(X_test)

cm_resampled, disp = report_and_plot(
    y_test, y_pred_resampled,
    "Classification report - trained on SMOTE-resampled data:",
    "Confusion Matrix - Trained on Resampled Data")

# Baseline: a separate classifier trained on the original imbalanced training data,
# so `clf` above still holds the SMOTE-trained model afterwards.
clf_imbalanced = GaussianNB()
clf_imbalanced.fit(X_train_imbalanced, y_train_imbalanced)
y_pred_imbalanced = clf_imbalanced.predict(X_test_imbalanced)

cm_imbalanced, disp_imbalanced = report_and_plot(
    y_test_imbalanced, y_pred_imbalanced,
    "Classification report - trained on imbalanced data:",
    "Confusion Matrix - Trained on Imbalanced Data")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Fit an ordinary least-squares line to noisy linear data that contains a few
# high-valued outliers, and compare it visually with the true line.

# Generate data with a linear relationship (y = 2X + 1)
np.random.seed(42)
X = np.linspace(0, 10, 100)  # 100 evenly spaced points in the range 0 to 10
y = 2 * X + 1  # True underlying function

# Add Gaussian noise (mean 0, standard deviation 2), one draw per point in X
noise = np.random.normal(0, 2, X.shape)
y_noisy = y + noise

# Introduce outliers: four points placed well above the true line
X_outliers = np.array([2, 4, 6, 8])
y_outliers = np.array([25, 30, 28, 35])
X_combined = np.concatenate((X, X_outliers))
y_combined = np.concatenate((y_noisy, y_outliers))

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_combined, test_size=0.2, random_state=42)

# LinearRegression expects a 2-D feature array. In reshape(-1, 1) the -1 tells
# NumPy to infer the number of rows from the total number of elements, and the
# 1 fixes the number of columns.
X_train = X_train.reshape(-1, 1)
X_test = X_test.reshape(-1, 1)

# Fit the linear regression model (the training split includes outliers)
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions on both splits (kept for inspection; not used in the plot below)
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Only one model was fitted, so there is exactly one fitted line. Evaluate it
# on the sorted grid X so it is drawn once, left to right, instead of plotting
# the train and test predictions as two identically placed "lines".
y_fitted_line = model.predict(X.reshape(-1, 1))

# Plot the results
plt.figure(figsize=(10, 6))
plt.scatter(X_train, y_train, color='blue', label='Training data')
plt.scatter(X_test, y_test, color='green', label='Test data')
plt.plot(X, y_fitted_line, color='red', label='Fitted line (with outliers)', linestyle='-.')
plt.plot(X, y, color='black', label='True line (y = 2X + 1)', linestyle='--')
plt.legend()
plt.xlabel('X')
plt.ylabel('y')
plt.title('Linear Regression with Outliers')
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from scipy.stats import entropy

# Step 1: Build the reference dataset - two standard-normal features and a
# binary label that is 1 whenever their sum is positive.
np.random.seed(42)
n_samples = 500
X1 = np.random.normal(loc=0, scale=1, size=n_samples)  # Feature 1
X2 = np.random.normal(loc=0, scale=1, size=n_samples)  # Feature 2
X = np.column_stack((X1, X2))
y = np.greater(X1 + X2, 0).astype(int)

# Step 2: Train the classifier on the reference data (fit returns the estimator)
model = LogisticRegression().fit(X, y)

# Step 3: Simulate drift - draw both features again with the mean moved from 0 to 1.
# Feature 1 is drawn before feature 2, matching the order of the random stream.
X1_drifted, X2_drifted = (
    np.random.normal(loc=1, scale=1, size=n_samples) for _ in range(2)
)
X_drifted = np.column_stack((X1_drifted, X2_drifted))
# The labelling rule is unchanged; only the feature distribution has moved
y_drifted = np.greater(X1_drifted + X2_drifted, 0).astype(int)

# Step 4: Detect drift using KL divergence
def kl_divergence(p, q, bins=10):
    """Estimate the KL divergence KL(p || q) between two 1-D samples.

    Both samples are histogrammed on the SAME bin edges, computed from their
    combined range. With separate edges (one range per sample) bin i would
    cover a different interval for each sample, and a pure shift of the
    distribution would produce near-identical histograms and a divergence
    close to zero.

    Parameters
    ----------
    p, q : array-like
        Samples from the reference and the comparison distribution.
    bins : int, str or sequence, default 10
        Bin specification passed to ``np.histogram_bin_edges``.

    Returns
    -------
    float
        Histogram-based estimate of KL(p || q); 0 when the histograms match.
    """
    p = np.asarray(p, dtype=float).ravel()
    q = np.asarray(q, dtype=float).ravel()

    # One common grid so that each bin refers to the same interval for p and q
    edges = np.histogram_bin_edges(np.concatenate((p, q)), bins=bins)
    p_hist, _ = np.histogram(p, bins=edges)
    q_hist, _ = np.histogram(q, bins=edges)

    # Small constant keeps empty bins from causing log(0) / division by zero;
    # scipy's entropy() normalises both vectors to sum to 1 before comparing.
    p_prob = p_hist.astype(float) + 1e-10
    q_prob = q_hist.astype(float) + 1e-10
    return entropy(p_prob, q_prob)

# Per-feature KL divergence between the original and the drifted sample
kl_X1, kl_X2 = kl_divergence(X1, X1_drifted), kl_divergence(X2, X2_drifted)

# Step 5: Score the model (trained on the original data) on the drifted data
y_pred_drifted = model.predict(X_drifted)
proba_drifted = model.predict_proba(X_drifted)
accuracy_drifted = accuracy_score(y_drifted, y_pred_drifted)
log_loss_drifted = log_loss(y_drifted, proba_drifted)

# Step 6: Visualization - one panel per feature, original vs. drifted histogram
panels = (
    (X1, X1_drifted, "Feature 1 (original)", "Feature 1 (drifted)",
     f"KL Divergence for Feature 1: {kl_X1:.4f}"),
    (X2, X2_drifted, "Feature 2 (original)", "Feature 2 (drifted)",
     f"KL Divergence for Feature 2: {kl_X2:.4f}"),
)

plt.figure(figsize=(12, 5))
for position, (before, after, before_label, after_label, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.hist(before, bins=20, alpha=0.6, label=before_label, color="blue")
    plt.hist(after, bins=20, alpha=0.6, label=after_label, color="orange")
    plt.title(panel_title)
    plt.legend()

plt.show()

# Step 7: Report the metrics computed on the drifted data
print(f"Model accuracy on drifted data: {accuracy_drifted:.4f}")
print(f"Log loss on drifted data: {log_loss_drifted:.4f}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Toy loan-approval example: fit a logistic regression and inspect how much
# weight it places on each feature, including the sensitive `Gender` column.
# NOTE: six rows are far too few to draw conclusions; this only illustrates the mechanics.

# Simulated dataset
data = pd.DataFrame({
    'Income': [30000, 45000, 60000, 80000, 20000, 50000],
    'CreditScore': [600, 650, 700, 750, 550, 680],
    'Gender': [0, 1, 0, 1, 0, 1],  # 0: Male, 1: Female
    'Approved': [0, 1, 1, 1, 0, 1]
})

X = data[['Income', 'CreditScore', 'Gender']]
y = data['Approved']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardise every feature using statistics from the training rows only.
# The raw columns live on very different scales (Income in the tens of
# thousands, Gender in {0, 1}), so coefficients fitted on raw values cannot be
# compared with each other to judge which feature drives the decision.
train_mean = X_train.mean()
train_std = X_train.std(ddof=0).replace(0, 1)  # constant column: avoid division by zero
X_train_scaled = (X_train - train_mean) / train_std
X_test_scaled = (X_test - train_mean) / train_std

# Logistic Regression model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Display results. zero_division=0 reports 0 (without a warning) for a class
# that never gets predicted, which is likely with a test set this small.
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

# Identifying bias in coefficients: one weight per standardised feature, so the
# magnitude attached to `Gender` can be compared with the other features.
coefficients = pd.Series(model.coef_[0], index=X.columns)
print("Model Coefficients (standardised features):\n" + coefficients.to_string())


In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
import numpy as np

# Demonstrate overfitting: a comparatively large network trained for many
# epochs on only 80 noisy samples of y = x^2.

# Seed NumPy and TensorFlow so the data draw and the weight initialisation can
# be repeated from run to run (as the other cells do with seed 42).
np.random.seed(42)
tf.random.set_seed(42)

# Small synthetic dataset
X = np.random.rand(100, 1)
y = X**2 + np.random.normal(0, 0.05, (100, 1))

# Train-test split: first 80 rows for training, last 20 for testing
X_train, X_test = X[:80], X[80:]
y_train, y_test = y[:80], y[80:]

# Overfitted Neural Network. The input shape is declared with an explicit
# Input layer rather than the legacy `input_dim` argument on the first Dense layer.
model = Sequential([
    tf.keras.Input(shape=(1,)),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(1)
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.fit(X_train, y_train, epochs=200, verbose=0)

# Evaluation. Because the model was compiled with metrics=['mae'], evaluate()
# returns [loss, mae]; unpack both so each number is labelled correctly.
train_loss, train_mae = model.evaluate(X_train, y_train, verbose=0)
test_loss, test_mae = model.evaluate(X_test, y_test, verbose=0)
print(f"Train Loss (MSE): {train_loss:.4f} | Train MAE: {train_mae:.4f}")
print(f"Test Loss (MSE): {test_loss:.4f} | Test MAE: {test_mae:.4f}")
