In [3]:
# Execute Algorithms.ipynb inside this kernel so that its definitions become available here.
# NOTE(review): presumably this is where preprocessing, drop_DBSCAN, drop_OCSVM and drop_IF
# come from -- the cells below call them but never define them. Confirm against that notebook.
%run Algorithms.ipynb

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import time

In [6]:
# Registry of clean datasets: label -> CSV path. The classify_* / run_* helpers in this
# notebook iterate over these dicts and re-read each file from its path.
datasets = {
    'education_2_df': 'Datasets/predict-dropout-or-academic-success.csv'
}
# Registry of the corresponding anomaly-injected CSVs (same layout: label -> path).
anomalous_datasets = {
    'a_education_2_df': 'Datasets/predict-dropout-or-academic-success_anomaly.csv'
}


# Eagerly load every dataset into its own module-level DataFrame.
# NOTE(review): none of these nine frames is referenced by the other cells visible in this
# notebook (those re-read the CSVs by path) -- confirm whether they are still needed.
finance_1_df = pd.read_csv('Datasets/bank-customer-churn-prediction.csv')
finance_2_df = pd.read_csv('Datasets/financial-risk-for-loan-approval.csv')
finance_3_df = pd.read_csv('Datasets/loan-approval-classification-dataset.csv')

education_1_df = pd.read_csv('Datasets/campus-placement-prediction.csv')
education_2_df = pd.read_csv('Datasets/predict-dropout-or-academic-success.csv')
education_3_df = pd.read_csv('Datasets/student-performance-dataset.csv')

# The 'classifiation' spelling is part of the on-disk file name; do not "fix" it here.
health_1_df = pd.read_csv('Datasets/fetal-health-classifiation.csv')
health_2_df = pd.read_csv('Datasets/heart-disease-health-indicators-dataset.csv')
health_3_df = pd.read_csv('Datasets/patient-treatment-classification.csv')

In [5]:
# For every registered dataset, print the columns that hold more than 50 distinct values.
# NOTE(review): this cell's execution count is lower than the cell that defines `datasets`,
# and the saved output looks like it came from a different dataset than the one currently
# registered -- re-run after a kernel restart to confirm.
for df_name, file_path in datasets.items():
    df = pd.read_csv(file_path)
    used_cols = [col for col in df.columns if df[col].nunique() > 50]
    print(used_cols)

['HAEMATOCRIT', 'HAEMOGLOBINS', 'ERYTHROCYTE', 'LEUCOCYTE', 'THROMBOCYTE', 'MCH', 'MCHC', 'MCV', 'AGE']


In [49]:
def classify(dataframe: pd.DataFrame, filename: str) -> list:
    """Fit a random forest on one dataset and report its hold-out accuracy.

    Rows containing any missing value are dropped before training. Two scores
    are printed: the plain test-set accuracy, and "Accuracy Whole", which is
    that accuracy scaled by the fraction of rows that survived the drop.

    Args:
        dataframe: Feature columns plus the dataset's target column.
        filename: Path used to read the data. It selects the target column and
            must be one of the known dataset paths or its ``_anomaly`` variant.

    Returns:
        A list of ``(index_label, column_name)`` pairs, one per missing cell in
        the incoming frame, in row-major order.

    Raises:
        ValueError: If ``filename`` is not a recognised dataset path.
    """
    # Target column per dataset, keyed by the path of the clean CSV.
    target_columns = {
        'Datasets/campus-placement-prediction.csv': 'status',
        'Datasets/predict-dropout-or-academic-success.csv': 'Target',
        'Datasets/student-performance-dataset.csv': 'GradeClass',
        'Datasets/bank-customer-churn-prediction.csv': 'churn',
        'Datasets/financial-risk-for-loan-approval.csv': 'LoanApproved',
        'Datasets/loan-approval-classification-dataset.csv': 'loan_status',
        'Datasets/fetal-health-classifiation.csv': 'fetal_health',
        'Datasets/heart-disease-health-indicators-dataset.csv': 'HeartDiseaseorAttack',
        'Datasets/patient-treatment-classification.csv': 'SOURCE',
    }

    # An "<name>_anomaly.csv" file shares its target column with "<name>.csv".
    anomaly_suffix = '_anomaly.csv'
    if filename.endswith(anomaly_suffix):
        lookup_key = filename[:-len(anomaly_suffix)] + '.csv'
    else:
        lookup_key = filename

    # Fail fast with a clear message instead of hitting an unbound X / y later.
    if lookup_key not in target_columns:
        raise ValueError(f"No target column configured for dataset {filename!r}")
    target = target_columns[lookup_key]

    total_shape = dataframe.shape[0]
    print(total_shape)

    # Locate every missing cell in one vectorized pass. np.nonzero walks the
    # mask in row-major order, i.e. row by row and column by column within it.
    row_pos, col_pos = np.nonzero(dataframe.isna().to_numpy())
    dropped_indices = list(zip(
        dataframe.index.take(row_pos).tolist(),
        dataframe.columns.take(col_pos).tolist(),
    ))

    # Drop rows with missing values
    dataframe = dataframe.dropna()

    # Features and target
    X = dataframe.drop(columns=[target])
    y = dataframe[target]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print(f"Dataset: {filename}")
    print(f"Shape after dropping missing values: {dataframe.shape}")
    # Report the held-out split; this line used to print the training shape.
    print(f"Test set size: {X_test.shape}")

    # Initialize and train RandomForestClassifier
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_classifier.fit(X_train, y_train)

    # Predict on test data
    y_pred = rf_classifier.predict(X_test)

    # Evaluate performance
    current_shape = dataframe.shape[0]
    accuracy = accuracy_score(y_test, y_pred)
    # Scale by the share of rows kept, so dropping many rows lowers the score.
    accuracy_whole = accuracy * (current_shape / total_shape)
    print("Accuracy:", accuracy)
    print("Accuracy Whole:", accuracy_whole)
    return dropped_indices

In [46]:
def classify_graph(dataframe: pd.DataFrame, filename: str) -> None:
    """Fit a random forest on one dataset, print its metrics and plot feature importances.

    Unlike ``classify``, no rows are dropped here; the frame is used as given.

    Args:
        dataframe: Feature columns plus the dataset's target column.
        filename: Path used to read the data. It selects the target column and
            must be one of the known dataset paths or its ``_anomaly`` variant.

    Raises:
        ValueError: If ``filename`` is not a recognised dataset path.
    """
    # Target column per dataset, keyed by the path of the clean CSV.
    target_columns = {
        'Datasets/campus-placement-prediction.csv': 'status',
        'Datasets/predict-dropout-or-academic-success.csv': 'Target',
        'Datasets/student-performance-dataset.csv': 'GradeClass',
        'Datasets/bank-customer-churn-prediction.csv': 'churn',
        'Datasets/financial-risk-for-loan-approval.csv': 'LoanApproved',
        'Datasets/loan-approval-classification-dataset.csv': 'loan_status',
        'Datasets/fetal-health-classifiation.csv': 'fetal_health',
        'Datasets/heart-disease-health-indicators-dataset.csv': 'HeartDiseaseorAttack',
        'Datasets/patient-treatment-classification.csv': 'SOURCE',
    }

    # An "<name>_anomaly.csv" file shares its target column with "<name>.csv".
    anomaly_suffix = '_anomaly.csv'
    if filename.endswith(anomaly_suffix):
        lookup_key = filename[:-len(anomaly_suffix)] + '.csv'
    else:
        lookup_key = filename

    # Fail fast with a clear message instead of hitting an unbound X / y later.
    if lookup_key not in target_columns:
        raise ValueError(f"No target column configured for dataset {filename!r}")
    target = target_columns[lookup_key]

    # Features and target
    X = dataframe.drop(columns=[target])
    y = dataframe[target]

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train RandomForestClassifier
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_classifier.fit(X_train, y_train)

    # Predict on test data
    y_pred = rf_classifier.predict(X_test)

    # Evaluate performance
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    # One bar per feature, in the same order as the columns of X.
    feature_importances = rf_classifier.feature_importances_
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.bar(X.columns, feature_importances)
    ax.set_xlabel('Features')
    ax.set_ylabel('Importance')
    ax.set_title('Feature Importance')
    ax.tick_params(axis='x', labelrotation=90)  # Vertical labels so long names stay legible

    # Display the plot
    fig.tight_layout()
    plt.show()



In [47]:
def classify_datasets() -> None:
    """Read, preprocess and classify every CSV registered in ``datasets``."""
    for csv_path in datasets.values():
        prepared = preprocessing(pd.read_csv(csv_path), csv_path)
        classify(prepared, csv_path)

In [48]:
def classify_anomalous() -> list:
    """Read, preprocess and classify every CSV registered in ``anomalous_datasets``.

    Returns:
        One entry per dataset, in registry order, each being whatever
        ``classify`` returned for that dataset.
    """
    results = []
    for csv_path in anomalous_datasets.values():
        prepared = preprocessing(pd.read_csv(csv_path), csv_path)
        results.append(classify(prepared, csv_path))
    return results

In [41]:
def run_dbscan(eps) -> None:
    """Filter each anomalous dataset with ``drop_DBSCAN``, classify it, and print the total time.

    Args:
        eps: Passed through unchanged to ``drop_DBSCAN``.
    """
    # perf_counter is monotonic, so the interval cannot be skewed by clock adjustments.
    started = time.perf_counter()

    for csv_path in anomalous_datasets.values():
        raw = pd.read_csv(csv_path)
        # Already have been preprocessed
        filtered = drop_DBSCAN(raw, csv_path, eps)
        classify(filtered, csv_path)

    elapsed_time = time.perf_counter() - started

    print(f"Total time taken: {elapsed_time} seconds")

In [42]:
def run_ocsvm(nu, gamma) -> None:
    """Filter each anomalous dataset with ``drop_OCSVM``, classify it, and print the total time.

    Args:
        nu: Passed through unchanged to ``drop_OCSVM``.
        gamma: Passed through unchanged to ``drop_OCSVM``.
    """
    # perf_counter is monotonic, so the interval cannot be skewed by clock adjustments.
    started = time.perf_counter()

    for csv_path in anomalous_datasets.values():
        raw = pd.read_csv(csv_path)
        # Already have been preprocessed
        filtered = drop_OCSVM(raw, csv_path, nu, gamma)
        classify(filtered, csv_path)

    elapsed_time = time.perf_counter() - started

    print(f"Total time taken: {elapsed_time} seconds")

In [43]:
def run_isolation_forest(threshold: float) -> None:
    """Filter each anomalous dataset with ``drop_IF``, classify it, and print the total time.

    Args:
        threshold: Passed through unchanged to ``drop_IF``.
    """
    # perf_counter is monotonic, so the interval cannot be skewed by clock adjustments.
    started = time.perf_counter()

    for csv_path in anomalous_datasets.values():
        raw = pd.read_csv(csv_path)
        # Already have been preprocessed
        filtered = drop_IF(raw, csv_path, threshold)
        classify(filtered, csv_path)

    elapsed_time = time.perf_counter() - started

    print(f"Total time taken: {elapsed_time} seconds")