<a href="https://colab.research.google.com/github/samanthayeep/ADS_assignment/blob/main/BreastCancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Description

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package).
# NOTE(review): the dataset is later read from the relative path 'data.csv',
# not from the mounted drive — confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

### Dataset
https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

In [None]:
# Create a folder intended for storing images (no error if it already exists).
# NOTE(review): no later cell visible in this notebook references output_dir —
# confirm whether figures are meant to be saved here.
import os
output_dir = 'histograms'
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Load the dataset from 'data.csv' in the current working directory.
# NOTE(review): presumably the Kaggle Breast Cancer Wisconsin CSV linked above — confirm the file location.
df = pd.read_csv('data.csv')

In [None]:
# Preview the first five rows of the dataset
df.head()

In [None]:
# Summary of the dataset: column names, non-null counts and dtypes
df.info()

In [None]:
# Descriptive statistics for the numeric columns of the dataset
df.describe()

In [None]:
# List the names of all columns in the dataset
df.columns

In [None]:
# (number of rows, number of columns) of the dataset
df.shape


# Data Visualisation
Use visual methods to find the key features, patterns, and trends in the data

In [None]:
# Split the frame into the target labels (y) and the candidate features (x).
# The unnamed column, the id column and the label itself are removed from the
# feature matrix.
non_feature_cols = ['Unnamed: 32', 'id', 'diagnosis']
y = df['diagnosis']
x = df.drop(columns=non_feature_cols)
x.head()

In [None]:
# Compare the number of benign and malignant tumours.
# The labels are passed by keyword: recent seaborn releases interpret the first
# positional argument as `data`, not as the x variable.
ax = sns.countplot(x=y)
ax.set_ylabel("Count")

# Look the counts up by label rather than relying on the frequency ordering
# returned by value_counts(), which would silently swap B and M if the class
# balance were different.
class_counts = y.value_counts()
B = class_counts.get('B', 0)
M = class_counts.get('M', 0)
print('Number of Benign: ',B)
print('Number of Malignant : ',M)

In [None]:
# Use a white plot background and enable seaborn's short colour codes.
# Only plots created after this cell pick up the theme.
sns.set_theme(style="white", color_codes=True)

In [None]:
# Annotated correlation heatmap of all candidate features
corr_all = x.corr()
f, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(corr_all, ax=ax, annot=True, fmt='.1f', linewidths=.5)

In [None]:
# Features removed because they are correlated with features that are kept
drop_list1 = [
    'perimeter_mean', 'radius_mean', 'compactness_mean', 'concave points_mean',
    'radius_se', 'perimeter_se', 'radius_worst', 'perimeter_worst',
    'compactness_worst', 'concave points_worst', 'compactness_se',
    'concave points_se', 'texture_worst', 'area_worst',
]
# x itself is left untouched because it is used again later
x_1 = x.drop(columns=drop_list1)
x_1.head()

In [None]:
# Annotated correlation heatmap of the features that remain after the drop
corr_reduced = x_1.corr()
f, ax = plt.subplots(figsize=(14, 14))
sns.heatmap(corr_reduced, ax=ax, annot=True, fmt='.1f', linewidths=.5)

In [None]:
# Encode the label (malignant -> 1, benign -> 0)
y = df['diagnosis'].map({'M': 1, 'B': 0})

# Create a new DataFrame combining x_1 and y
df_combined = x_1.copy()
df_combined['diagnosis'] = y

# Human-readable class names used as the hue variable. Letting seaborn build
# the legend from these values keeps every colour tied to the right class;
# overriding the legend labels by position afterwards can swap them because
# the hue levels are not necessarily drawn in legend order.
diagnosis_label = y.map({0: 'Benign', 1: 'Malignant'}).rename('Diagnosis')

# Plot histograms for each feature in x_1, separated by diagnosis
features = x_1.columns

for feature in features:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(
        data=df_combined,
        x=feature,
        hue=diagnosis_label,
        hue_order=['Benign', 'Malignant'],
        element='step',
        stat='density',
        common_norm=False,  # normalise each class separately
        ax=ax,
    )
    ax.set_title(f'Histogram of {feature} by Diagnosis')
    ax.set_xlabel(feature)
    ax.set_ylabel('Density')
    plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import os

data_dia = y
data = x_1

# Z-score standardisation: centre every feature on its mean and scale by its
# standard deviation so all violins share a comparable value axis
data_n_2 = (data - data.mean()) / data.std()

# One split violin plot per feature, with the two diagnosis classes side by side
for feature in data.columns:
    # Long-form frame with columns: diagnosis, features, value
    plot_data = pd.concat([y, data_n_2[[feature]]], axis=1).melt(
        id_vars="diagnosis", var_name="features", value_name='value'
    )

    plt.figure(figsize=(10, 6))
    sns.violinplot(
        data=plot_data,
        x="features",
        y="value",
        hue="diagnosis",
        split=True,
        inner="quart",
    )
    plt.title(f'Violin Plot of {feature} by Diagnosis')
    plt.xlabel('Features')
    plt.ylabel('Value')
    plt.xticks(rotation=90)
    plt.tight_layout()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

features = x_1.columns

# One boxplot per retained feature, grouped by the diagnosis column of df
for feature in features:
    fig, ax = plt.subplots(figsize=(8, 6))  # Adjust figure size as needed
    sns.boxplot(x=df['diagnosis'], y=df[feature], ax=ax)
    ax.set_title(f'Boxplot of {feature} by Diagnosis')
    ax.set_xlabel('Diagnosis')
    ax.set_ylabel(feature)
    fig.tight_layout()

# Data Preprocessing
Demonstrate understanding of data preprocessing techniques by performing any of the following, as needed, according to the chosen dataset: (minimum 2)
1.	Perform data cleaning, transformation, discretization, and normalization.
2.	Select and justify data sampling techniques.
3.	Remove any irrelevant data or outliers.
4.	Feature selection and feature engineering.


In [None]:
# Columns of the reduced feature set (`data` was bound to x_1 in the violin-plot cell)
data.columns

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Columns removed up front: the identifier plus the features dropped earlier
# as correlated. 'diagnosis' and 'Unnamed: 32' are still present in x_1 here;
# they are handled further down.
drop_list1 = ['id', 'perimeter_mean', 'radius_mean', 'compactness_mean', 'concave points_mean', 'radius_se',
              'perimeter_se', 'radius_worst', 'perimeter_worst', 'compactness_worst', 'concave points_worst',
              'compactness_se', 'concave points_se', 'texture_worst', 'area_worst']
x_1 = df.drop(columns=drop_list1)

# Encode the target variable (malignant -> 1, benign -> 0)
y = df['diagnosis'].map({'M': 1, 'B': 0})

# Combine the features with the encoded label (overwrites the raw 'diagnosis' column)
df_combined = x_1.copy()
df_combined['diagnosis'] = y

# Data cleaning
# Missing values before dropping columns
missing_values_before = df_combined.isnull().sum()
print("*Missing values in each column (Before Dropping Columns):\n", missing_values_before)

# Check for duplicate rows (reported only; nothing is removed here)
duplicate = df_combined.duplicated().sum()
print("*Number of duplicate rows:", duplicate)

# Identify the entirely-empty columns BEFORE dropping them; checking after the
# drop would always yield an empty list.
all_missing_cols = df_combined.columns[df_combined.isnull().all()]
print("Columns with all missing values (dropped):")
print(list(all_missing_cols))

# Drop columns with all missing values
X = df_combined.dropna(axis=1, how='all')

# Missing values after dropping columns
missing_values_after = X.isnull().sum()
print("*Missing values in each column (After Dropping Columns):\n", missing_values_after)

# Define features and target
X = X.drop(columns='diagnosis')
y = df_combined['diagnosis']

# Split the data into training and test sets; stratify=y keeps the class
# proportions the same in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


3. Handling Outliers

In [None]:
# Replace IQR outliers in every numeric column with that column's median
def handle_outliers_and_impute(df, iqr_multiplier=1.5):
    """Return a copy of ``df`` with IQR outliers replaced by the column median.

    For each numeric column, values outside
    ``[Q1 - iqr_multiplier * IQR, Q3 + iqr_multiplier * IQR]`` are treated as
    outliers and overwritten with the median of that column (computed before
    the replacement, so it includes the outliers). Non-numeric columns are
    left untouched. Missing values are NOT filled: NaN never compares as
    outside the bounds, so it is kept as-is despite the function's name.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    iqr_multiplier : float, default 1.5
        Width of the fence around the inter-quartile range.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy with the same index and columns as ``df``.

    Side effects
    ------------
    Prints the per-column outlier counts before and after the replacement.
    """
    df_cleaned = df.copy()

    # Boolean outlier masks per numeric column, before and after replacement
    outliers_before = {}
    outliers_after = {}

    for col in df_cleaned.columns:
        if not pd.api.types.is_numeric_dtype(df_cleaned[col]):
            continue

        Q1 = df_cleaned[col].quantile(0.25)
        Q3 = df_cleaned[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - iqr_multiplier * IQR
        upper_bound = Q3 + iqr_multiplier * IQR

        # Identify outliers
        mask = (df_cleaned[col] < lower_bound) | (df_cleaned[col] > upper_bound)
        outliers_before[col] = mask

        # Replace outliers with the median of the column
        median = df_cleaned[col].median()
        df_cleaned.loc[mask, col] = median

        # Re-check against the ORIGINAL bounds to confirm the replacement worked
        outliers_after[col] = (df_cleaned[col] < lower_bound) | (df_cleaned[col] > upper_bound)

    # Print total outliers before and after imputation
    print("\nTotal number of outliers per column before imputation:")
    for col, outliers in outliers_before.items():
        print(f"{col}: {outliers.sum()}")

    print("\nTotal number of outliers per column after imputation:")
    for col, outliers in outliers_after.items():
        print(f"{col}: {outliers.sum()}")

    return df_cleaned

# Replace outliers in the training features only; X_test is left as-is.
X_train_cleaned = handle_outliers_and_impute(X_train)
# Labels with a fresh 0..n-1 index. Row order is unchanged, so they still line
# up positionally with X_train_cleaned (which keeps its original index).
y_train_cleaned = y_train.reset_index(drop=True)


SMOTE (data sampling technique to reduce class-imbalance bias)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training data with SMOTE (fixed seed for reproducibility)
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_cleaned, y_train)

# Wrap the labels in DataFrames so seaborn can count them
df_before = pd.DataFrame({'class': y_train})
df_after = pd.DataFrame({'class': y_train_res})

# Class distribution before (left) and after (right) SMOTE
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 6))
sns.countplot(x='class', data=df_before, ax=ax_before)
ax_before.set_title('Class Distribution Before SMOTE')
sns.countplot(x='class', data=df_after, ax=ax_after)
ax_after.set_title('Class Distribution After SMOTE')
fig.tight_layout()
plt.show()

# Class counts before and after SMOTE
print("Class distribution before SMOTE:")
print(df_before['class'].value_counts())

print("\nClass distribution after SMOTE:")
print(df_after['class'].value_counts())

# First few rows of the resampled features and labels
sample_size = 5
print("\nSample of resampled X_train_res:")
print(pd.DataFrame(X_train_res).head(sample_size))

print("\nSample of resampled y_train_res:")
print(pd.Series(y_train_res).head(sample_size))




4. Feature Selection and Feature Engineering

In [None]:
from sklearn.preprocessing import PolynomialFeatures

def feature_engineering(df):
    """Append degree-2 polynomial features of the numeric columns to ``df``.

    The numeric columns are expanded with ``PolynomialFeatures(degree=2,
    include_bias=False)``, the expansion is concatenated column-wise to ``df``
    (after resetting its index), and any column whose name already occurred
    earlier is dropped, keeping the first occurrence.

    Returns a new DataFrame with a fresh 0..n-1 index.
    """
    numeric_cols = df.select_dtypes(include=['number']).columns

    # Degree-2 expansion of the numeric columns
    expander = PolynomialFeatures(degree=2, include_bias=False)
    expanded_values = expander.fit_transform(df[numeric_cols])
    df_poly = pd.DataFrame(
        expanded_values,
        columns=expander.get_feature_names_out(numeric_cols),
    )

    # Original columns first, polynomial columns after
    merged = pd.concat([df.reset_index(drop=True), df_poly], axis=1)

    # Keep only the first occurrence of every column name
    keep_mask = ~merged.columns.duplicated()
    return merged.loc[:, keep_mask]

# Apply the polynomial feature expansion to the SMOTE-resampled training data
# and, independently, to the test features.
X_train_fe = feature_engineering(X_train_res)
X_test_fe = feature_engineering(X_test)





In [None]:
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

# Feature Selection
def feature_selection(X_train_fe, y_train):
    """Rank features by recursive feature elimination (RFE) with a random forest.

    Numeric columns are mean-imputed and object columns one-hot encoded, then
    RFE with a 100-tree random forest (seed 42) selects 10 features.

    Returns a DataFrame with columns 'Feature' (transformer-prefixed names
    from the ColumnTransformer) and 'RFE Ranking', sorted by ranking.
    """
    num_cols = X_train_fe.select_dtypes(include=['number']).columns
    cat_cols = X_train_fe.select_dtypes(include=['object']).columns

    # Preprocessing applied before the ranking
    preprocess = ColumnTransformer(transformers=[
        ('num', SimpleImputer(strategy='mean'), num_cols),
        ('cat', OneHotEncoder(), cat_cols),
    ])
    X_ready = preprocess.fit_transform(X_train_fe)

    # RFE with RandomForest
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    selector = RFE(estimator=forest, n_features_to_select=10)
    selector.fit(X_ready, y_train)

    ranking_table = pd.DataFrame({
        'Feature': preprocess.get_feature_names_out(),
        'RFE Ranking': selector.ranking_,
    })
    return ranking_table.sort_values(by='RFE Ranking')

# Apply feature selection to the SMOTE-resampled training data.
# Note: this ranks the columns of X_train_res, not the polynomial features in X_train_fe.
rfe_df = feature_selection(X_train_res, y_train_res)

print("\nRFE Ranking:")
print(rfe_df)

In [None]:
# Select top features based on RFE ranking (rank 1 = kept by RFE)
top_features = rfe_df[rfe_df['RFE Ranking'] == 1]['Feature']

# Strip the transformer prefix ('num__' / 'cat__') to recover the original
# column names. Splitting only on the FIRST '__' keeps names intact if a
# column name itself contains a double underscore.
original_feature_names = [name.split('__', 1)[1] for name in top_features]

# Select the top features from the resampled X_train_res and from X_test_fe
X_train_selected = X_train_res[original_feature_names]
X_test_selected = X_test_fe[original_feature_names]

# Display the top selected features
print("\nTop selected features based on RFE ranking:")
print(original_feature_names)

print("Shape of X_train_selected:", X_train_selected.shape)
print("Shape of y_train_res:", y_train_res.shape)


# Data Mining Methods
*	Develop data mining models to achieve the objective in Task 1. (Minimum 3 models)  
*	Evaluate the data mining models to ensure that the model is accurate and reliable.  
*	Perform hyperparameter tuning or revisit the preprocessing task to improve the models’ performances.  
*	Discuss the results.  


## KNN


Training with the dataset after only removing missing values


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_validate

# Range of k values
k_range = range(1, 10)
train_accuracy1 = []
test_accuracy1 = []

# For every k, run 10-fold cross-validation on the training data and record
# BOTH the mean accuracy on the training folds and the mean accuracy on the
# held-out folds. Appending the same CV mean to both lists would draw two
# identical curves.
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    cv_results = cross_validate(
        knn, X_train, y_train, cv=10, scoring='accuracy', return_train_score=True
    )
    train_accuracy1.append(np.mean(cv_results['train_score']))
    test_accuracy1.append(np.mean(cv_results['test_score']))

# Plot accuracy; the second curve is validation accuracy from the CV folds,
# not accuracy on X_test
plt.plot(k_range, train_accuracy1, label='Train Accuracy')
plt.plot(k_range, test_accuracy1, label='Validation Accuracy (10-fold CV)')
plt.xlabel('Number of Neighbors K')
plt.ylabel('Accuracy')
plt.legend()
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit a 4-nearest-neighbour classifier on the training split and predict the test split
knn = KNeighborsClassifier(n_neighbors=4).fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# Metrics on the test split
accuracy_knn = accuracy_score(y_test, y_pred_knn)
cm = confusion_matrix(y_test, y_pred_knn)

print(f'KNN Accuracy (After Dropping Duplicates and Handling Missing Values): {accuracy_knn:.2f}')

print("\nKNN Classification Report (After Dropping Duplicates and Handling Missing Values):")
print(classification_report(y_test, y_pred_knn))

print("\nKNN Confusion Matrix (After Dropping Duplicates and Handling Missing Values):")
print(cm)

# Confusion matrix as a heatmap (rows: true labels, columns: predictions)
class_names = ['Class 0', 'Class 1']
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='g', cmap='YlGnBu',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix Heatmap')
plt.show()


KNN model after dropping unnecessary columns and handling outliers

In [None]:
# Same 4-NN model, trained on the outlier-handled training features
knn = KNeighborsClassifier(n_neighbors=4).fit(X_train_cleaned, y_train)
y_pred_knn = knn.predict(X_test)

accuracy_knn = accuracy_score(y_test, y_pred_knn)
print(f'KNN Accuracy (After Removing Outliers and Handling Missing Values): {accuracy_knn:.2f}')

print("\nKNN Classification Report (After Removing Outliers and Handling Missing Values):")
print(classification_report(y_test, y_pred_knn))

cm_knn = confusion_matrix(y_test, y_pred_knn)
print("\nKNN Confusion Matrix (After Removing Outliers and Handling Missing Values):")
print(cm_knn)

# Plot heatmap
class_names = ['Class 0', 'Class 1']
plt.figure(figsize=(10, 7))
sns.heatmap(cm_knn, annot=True, fmt='g', cmap='YlGnBu',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix Heatmap (After Removing Outliers and Handling Missing Values)')
plt.show()

KNN model after dropping unnecessary columns, handling outliers and applying SMOTE

In [None]:
# 4-NN model trained on the SMOTE-resampled training data
knn = KNeighborsClassifier(n_neighbors=4).fit(X_train_res, y_train_res)
y_pred_knn_smote = knn.predict(X_test)

accuracy_knn_smote = accuracy_score(y_test, y_pred_knn_smote)
print(f'KNN Accuracy (After SMOTE): {accuracy_knn_smote:.2f}')

print("\nKNN Classification Report (After SMOTE):")
print(classification_report(y_test, y_pred_knn_smote))

cm_knn_smote = confusion_matrix(y_test, y_pred_knn_smote)
print("\nKNN Confusion Matrix (After SMOTE):")
print(cm_knn_smote)

# Plot heatmap
class_names = ['Class 0', 'Class 1']
plt.figure(figsize=(10, 7))
sns.heatmap(cm_knn_smote, annot=True, fmt='g', cmap='YlGnBu',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix Heatmap (After SMOTE)')
plt.show()

KNN model after all preprocessing steps

In [None]:
# 4-NN model trained on the RFE-selected columns of the resampled training data
knn = KNeighborsClassifier(n_neighbors=4).fit(X_train_selected, y_train_res)
y_pred_knn_selected = knn.predict(X_test_selected)

accuracy_knn = accuracy_score(y_test, y_pred_knn_selected)
print(f'KNN Accuracy (After Feature Selection): {accuracy_knn:.2f}')

print("\nKNN Classification Report (After Feature Selection):")
print(classification_report(y_test, y_pred_knn_selected))

cm_knn_selected = confusion_matrix(y_test, y_pred_knn_selected)
print("\nKNN Confusion Matrix (After Feature Selection):")
print(cm_knn_selected)

# Plot heatmap
class_names = ['Class 0', 'Class 1']
plt.figure(figsize=(10, 7))
sns.heatmap(cm_knn_selected, annot=True, fmt='g', cmap='YlGnBu',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix Heatmap (After Feature Selection)')
plt.show()

## Naive Bayes

Training with the dataset after only removing missing values


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

def plot_confusion_matrix(cm, title):
    """Draw a 2x2 confusion matrix as an annotated heatmap and show it.

    Parameters
    ----------
    cm : array-like
        Confusion matrix; rows are labelled as true labels, columns as predictions.
    title : str
        Title placed above the heatmap.
    """
    tick_labels = ['Class 0', 'Class 1']
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='g', cmap='YlGnBu',
                xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)
    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('True Labels')
    ax.set_title(title)
    plt.show()

In [None]:
# Gaussian Naive Bayes trained on the training split, evaluated on the test split
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

accuracy_nb = accuracy_score(y_test, y_pred_nb)
print(f'Naive Bayes Accuracy: {accuracy_nb:.2f}')
print("\nNaive Bayes Classification Report:")
print(classification_report(y_test, y_pred_nb))

cm_nb = confusion_matrix(y_test, y_pred_nb)
print("\nNaive Bayes Confusion Matrix:")
print(cm_nb)
# Title now carries its closing parenthesis
plot_confusion_matrix(cm_nb, 'Confusion Matrix Heatmap (After removing missing and duplicate data)')



Naive Bayes model after dropping unnecessary columns and handling outliers

In [None]:
# Gaussian Naive Bayes trained on the outlier-handled training features
nb = GaussianNB().fit(X_train_cleaned, y_train)
y_pred_nb = nb.predict(X_test)

accuracy_nb = accuracy_score(y_test, y_pred_nb)
cm_nb = confusion_matrix(y_test, y_pred_nb)

print(f'Naive Bayes Accuracy (After Outlier Handling and Imputation): {accuracy_nb:.2f}')
print("\nNaive Bayes Classification Report (After Outlier Handling and Imputation):")
print(classification_report(y_test, y_pred_nb))
print("\nNaive Bayes Confusion Matrix (After Outlier Handling and Imputation):")
print(cm_nb)
plot_confusion_matrix(cm_nb, 'Confusion Matrix Heatmap (After Outlier Handling and Imputation)')


Naive Bayes model after dropping unnecessary columns, handling outliers and applying SMOTE

In [None]:
# Gaussian Naive Bayes trained on the SMOTE-resampled training data
nb = GaussianNB().fit(X_train_res, y_train_res)
y_pred_nb = nb.predict(X_test)

accuracy_nb = accuracy_score(y_test, y_pred_nb)
cm_nb = confusion_matrix(y_test, y_pred_nb)

print(f'Naive Bayes Accuracy (After SMOTE): {accuracy_nb:.2f}')
print("\nNaive Bayes Classification Report (After SMOTE):")
print(classification_report(y_test, y_pred_nb))
print("\nNaive Bayes Confusion Matrix (After SMOTE):")
print(cm_nb)
plot_confusion_matrix(cm_nb, 'Confusion Matrix Heatmap (After SMOTE)')

Naive Bayes model after all preprocessing steps

In [None]:
# Gaussian Naive Bayes trained on the RFE-selected columns of the resampled data
nb = GaussianNB().fit(X_train_selected, y_train_res)
y_pred_nb = nb.predict(X_test_selected)

accuracy_nb = accuracy_score(y_test, y_pred_nb)
cm_nb = confusion_matrix(y_test, y_pred_nb)

print(f'Naive Bayes Accuracy: {accuracy_nb:.2f}')
print("\nNaive Bayes Classification Report:")
print(classification_report(y_test, y_pred_nb))
print("\nNaive Bayes Confusion Matrix:")
print(cm_nb)
plot_confusion_matrix(cm_nb, 'Confusion Matrix Heatmap (After Feature Selection)')

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


In [None]:
# from sklearn.model_selection import GridSearchCV
# param_grid = {
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'criterion': ['gini', 'entropy']
# }
# grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5)
# grid_search.fit(X_train_selected, y_train_res)
# best_model = grid_search.best_estimator_


In [None]:
# best_params = grid_search.best_params_
# print(f"Best Parameters: {best_params}")

In [None]:
# Decision tree (entropy criterion, fixed seed) fitted on the training data
# after feature selection
dt_clf = DecisionTreeClassifier(
    criterion='entropy',
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
).fit(X_train_selected, y_train_res)

# Predict the test set using the same selected columns
y_pred = dt_clf.predict(X_test_selected)


In [None]:
# Score the predictions currently held in y_pred against the test labels
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
dt_conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")

print("Classification Report:")
print(report)

print("\nConfusion Matrix:")
print(dt_conf_matrix)
plot_confusion_matrix(dt_conf_matrix, 'Confusion Matrix Heatmap (After Feature Selection)')

In [None]:
# Decision tree (entropy criterion, fixed seed) fitted on the training data
# before feature selection
dt_clf = DecisionTreeClassifier(
    criterion='entropy',
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
).fit(X_train_res, y_train_res)

# Predict the test set
y_pred = dt_clf.predict(X_test)


In [None]:
# Score the predictions currently held in y_pred against the test labels
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
dt_conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")

print("Classification Report:")
print(report)

print("\nConfusion Matrix:")
print(dt_conf_matrix)
plot_confusion_matrix(dt_conf_matrix, 'Confusion Matrix Heatmap (Before Feature Selection)')

In [None]:
# Decision tree (entropy criterion, fixed seed) fitted on the training data
# before handling outliers
dt_clf = DecisionTreeClassifier(
    criterion='entropy',
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
).fit(X_train, y_train)

# Predict the test set
y_pred = dt_clf.predict(X_test)

In [None]:
# Score the predictions currently held in y_pred against the test labels
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
dt_conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")

print("Classification Report:")
print(report)

print("\nConfusion Matrix:")
print(dt_conf_matrix)
plot_confusion_matrix(dt_conf_matrix, 'Confusion Matrix Heatmap (Before Handling Outliers)')

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter grid for a random-forest grid search.
# NOTE(review): the GridSearchCV cells that would consume this grid are
# commented out below, so the grid is only defined here, not searched.
param_grid = {
    'max_depth': [10, 20, None],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'n_estimators': [100, 200]
}

In [None]:
# #Create a Random forest classifier
# model = RandomForestClassifier(random_state=42)

# # Initialize the a GridSearchCV object
# grid_search = GridSearchCV(
#     estimator=model,
#     param_grid=param_grid,
#     cv=5,                            # Number of folds in cross-validation
#     scoring='accuracy',              # Metric to evaluate
#     n_jobs=-1,                       # Use all available cores
#     verbose=1                        # Print progress
# )

In [None]:
# # Run the grid search on the training data
# grid_search.fit(X_train_selected, y_train_res)

# # Get the best parameters from the grid search
# best_params = grid_search.best_params_
# print(f"Best Parameters: {best_params}")


In [None]:
# Random forest with 100 trees and a fixed seed
rf_settings = dict(
    n_estimators=100,
    max_depth=10,
    max_features='sqrt',
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)
rf_clf = RandomForestClassifier(**rf_settings)

# Train on the data after feature selection, then predict the test set
rf_clf.fit(X_train_selected, y_train_res)
y_pred = rf_clf.predict(X_test_selected)

In [None]:
# Score the predictions currently held in y_pred against the test labels
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
rf_conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")

print("Classification Report:")
print(report)

print("\nConfusion Matrix:")
print(rf_conf_matrix)
plot_confusion_matrix(rf_conf_matrix, 'Confusion Matrix Heatmap (After Feature Selection)')

In [None]:
# Random forest with 100 trees and a fixed seed
rf_settings = dict(
    n_estimators=100,
    max_depth=10,
    max_features='sqrt',
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)
rf_clf = RandomForestClassifier(**rf_settings)

# Train on the data before feature selection, then predict the test set
rf_clf.fit(X_train_res, y_train_res)
y_pred = rf_clf.predict(X_test)

In [None]:
# Score the predictions currently held in y_pred against the test labels
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
rf_conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")

print("Classification Report:")
print(report)

print("\nConfusion Matrix:")
print(rf_conf_matrix)
plot_confusion_matrix(rf_conf_matrix, 'Confusion Matrix Heatmap (Before Feature Selection)')

In [None]:
# Random forest with 100 trees and a fixed seed
rf_settings = dict(
    n_estimators=100,
    max_depth=10,
    max_features='sqrt',
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)
rf_clf = RandomForestClassifier(**rf_settings)

# Train on the data before handling outliers, then predict the test set
rf_clf.fit(X_train, y_train)
y_pred = rf_clf.predict(X_test)

In [None]:
# Score the predictions currently held in y_pred against the test labels
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
rf_conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")

print("Classification Report:")
print(report)

print("\nConfusion Matrix:")
print(rf_conf_matrix)
plot_confusion_matrix(rf_conf_matrix, 'Confusion Matrix Heatmap (Before Handling Outliers)')

## Extra Trees Classifier

Training with Dataset after all preprocessing


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
#from sklearn.model_selection import GridSearchCV
#
#param_grid = {
  #  'n_estimators': [50, 100, 200],
   # 'max_features': ['auto', 'sqrt', 'log2']
#
#
#grid_search = GridSearchCV(ExtraTreesClassifier(random_state=42), param_grid, cv=5)
#grid_search.fit(X_train_selected, y_train_res)
#
#print("Best Parameters: ", grid_search.best_params_)

In [None]:
# Initialize the Extra Trees Classifier (100 trees, fixed seed)
etc = ExtraTreesClassifier(n_estimators=100, random_state=42)

# Train the model on the RFE-selected columns of the resampled training data.
# fit() is the last expression of the cell, so the fitted estimator is displayed.
etc.fit(X_train_selected, y_train_res)


In [None]:
# Reorder X_test_selected columns to match X_train_selected, so the test
# features are in the same column order the model was fitted on
X_test_selected = X_test_selected[X_train_selected.columns]

# Predict on the test set
y_pred = etc.predict(X_test_selected)



In [None]:
# Score the predictions currently held in y_pred against the test labels
accuracy = accuracy_score(y_test, y_pred)
etc_conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(etc_conf_matrix)

plot_confusion_matrix(etc_conf_matrix, 'Confusion Matrix Heatmap (After all preprocessing)')

Training with Dataset after feature engineering


In [None]:
# Extra Trees (100 trees, fixed seed) trained on the feature-engineered training set
etc = ExtraTreesClassifier(n_estimators=100, random_state=42)
etc.fit(X_train_fe, y_train_res)

# Put the test columns in the same order as the training columns before predicting
X_test_fe = X_test_fe[X_train_fe.columns]
y_pred = etc.predict(X_test_fe)

# Metrics on the test set
accuracy = accuracy_score(y_test, y_pred)
etc_conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(etc_conf_matrix)
plot_confusion_matrix(etc_conf_matrix, 'Confusion Matrix Heatmap (After Feature Engineering)')

Training with Dataset after imputation of outlier


In [None]:
# Extra Trees (100 trees, fixed seed) trained on the outlier-handled training set
etc = ExtraTreesClassifier(n_estimators=100, random_state=42)
etc.fit(X_train_cleaned, y_train_cleaned)

# Predict on the test set (X_test itself is not outlier-handled)
y_pred = etc.predict(X_test)

# Metrics on the test set
accuracy = accuracy_score(y_test, y_pred)
etc_conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(etc_conf_matrix)
plot_confusion_matrix(etc_conf_matrix, 'Confusion Matrix Heatmap (After Handling Outliers)')

Training with Dataset Before imputation of outlier

In [None]:
# Extra Trees (100 trees, fixed seed) trained on the data before outlier imputation
etc = ExtraTreesClassifier(n_estimators=100, random_state=42)
etc.fit(X_train, y_train)

# Predict on the test set
y_pred = etc.predict(X_test)

# Metrics on the test set
accuracy = accuracy_score(y_test, y_pred)
etc_conf_matrix = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(etc_conf_matrix)
plot_confusion_matrix(etc_conf_matrix, 'Confusion Matrix Heatmap (Before Handling Outliers)')

## Support Vector Machine


Training with Dataset after all preprocessing


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Define the Support Vector Machine model with a linear kernel (fixed seed)
svm_model = SVC(kernel='linear', random_state=42)

# Train the model on the RFE-selected columns of the resampled training data.
# fit() is the last expression of the cell, so the fitted estimator is displayed.
svm_model.fit(X_train_selected, y_train_res)


In [None]:
# Predict on the test set, using the same selected columns the model was trained on
y_pred = svm_model.predict(X_test_selected)


In [None]:
# Score the predictions currently held in y_pred against the test labels
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(conf_matrix)
plot_confusion_matrix(conf_matrix, 'Confusion Matrix Heatmap (After All Preprocessing)')

Training with Dataset after feature engineering

In [None]:
# Linear-kernel SVM trained on the feature-engineered training set
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_fe, y_train_res)

# Predict on the feature-engineered test set
y_pred = svm_model.predict(X_test_fe)

# Metrics on the test set
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(conf_matrix)
plot_confusion_matrix(conf_matrix, 'Confusion Matrix Heatmap (After Feature Engineering)')

Training with Dataset after Imputation of Outlier

In [None]:
# Linear-kernel SVM trained on the outlier-handled training set
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_cleaned, y_train_cleaned)

# Predict on the test set
y_pred = svm_model.predict(X_test)

# Metrics on the test set
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(conf_matrix)
plot_confusion_matrix(conf_matrix, 'Confusion Matrix Heatmap (After Handling Outliers)')

Training with Dataset before Imputation of Outlier

In [None]:
# Linear-kernel SVM trained on the data before outlier handling
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

# Predict on the test set
y_pred = svm_model.predict(X_test)

# Metrics on the test set
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(conf_matrix)
plot_confusion_matrix(conf_matrix, 'Confusion Matrix Heatmap (Before Handling Outliers)')