## Decision Tree Classifier

In [29]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
np.random.seed(0)

# List of dataset filenames
dataset_files = ['hd1_outlier_iqr.csv',
                 'hd2_outlier_std.csv',
                 'hd3_outlier_percentile.csv',
                 'hd4_outlier_remove.csv']

for dataset_file in dataset_files:
    df = pd.read_csv(dataset_file)

    X = df.drop('CHAS', axis=1)  
    y = df['CHAS']
    X_train, X_test, y_train, y_test = train_test_split(X, y, 
    test_size=0.3, random_state=42)
    
    imputer = SimpleImputer(strategy='median')
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)
 
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train_imputed, y_train)

    y_pred = model.predict(X_test_imputed)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"\nDecision Tree Classifier:")
    print(f"Accuracy for {dataset_file}: {accuracy}")
    print(f'Accuracy: {accuracy:.2f}')


Decision Tree Classifier:
Accuracy for hd1_outlier_iqr.csv: 0.9342105263157895
Accuracy: 0.93

Decision Tree Classifier:
Accuracy for hd2_outlier_std.csv: 0.9342105263157895
Accuracy: 0.93

Decision Tree Classifier:
Accuracy for hd3_outlier_percentile.csv: 0.9276315789473685
Accuracy: 0.93

Decision Tree Classifier:
Accuracy for hd4_outlier_remove.csv: 0.9328859060402684
Accuracy: 0.93


## Random Forest Classifier

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np
import random
np.random.seed(0)

dataset_files = ['hd1_outlier_iqr.csv',
                 'hd2_outlier_std.csv',
                 'hd3_outlier_percentile.csv',
                 'hd4_outlier_remove.csv']

for dataset_file in dataset_files:
    df = pd.read_csv(dataset_file)

    X = df.drop('CHAS', axis=1)  
    y = df['CHAS']
    X_train, X_test, y_train, y_test = train_test_split(X, y, 
    test_size=0.3, random_state=42)
    
    imputer = SimpleImputer(strategy='median')
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)

    model = RandomForestClassifier()
    model.fit(X_train_imputed, y_train)

    y_pred = model.predict(X_test_imputed)
    accuracy = accuracy_score(y_test, y_pred)
    
    print(f"\nRandom Forest Classifier:")
    print(f"Accuracy for {dataset_file}: {accuracy}")
    print(f'Accuracy: {accuracy:.2f}')



Random Forest Classifier:
Accuracy for hd1_outlier_iqr.csv: 0.9407894736842105
Accuracy: 0.94

Random Forest Classifier:
Accuracy for hd2_outlier_std.csv: 0.9276315789473685
Accuracy: 0.93

Random Forest Classifier:
Accuracy for hd3_outlier_percentile.csv: 0.9210526315789473
Accuracy: 0.92

Random Forest Classifier:
Accuracy for hd4_outlier_remove.csv: 0.9395973154362416
Accuracy: 0.94


## Logistic Regression

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import random
np.random.seed(0)

dataset_files = ['hd1_outlier_iqr.csv',
                 'hd2_outlier_std.csv',
                 'hd3_outlier_percentile.csv',
                 'hd4_outlier_remove.csv']

for dataset_file in dataset_files:
    df = pd.read_csv(dataset_file)

    X = df.drop('CHAS', axis=1)  
    y = df['CHAS']
    X_train, X_test, y_train, y_test = train_test_split(X, y, 
    test_size=0.3, random_state=42)
    
    imputer = SimpleImputer(strategy='median')
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)
    
    model = LogisticRegression()
    model.fit(X_train_imputed, y_train)

    y_pred = model.predict(X_test_imputed)
    accuracy = accuracy_score(y_test, y_pred)
    
    print(f"\nLogistic Regression:")
    print(f"Accuracy for {dataset_file}: {accuracy}")
    print(f'Accuracy: {accuracy:.2f}')

    


Logistic Regression:
Accuracy for hd1_outlier_iqr.csv: 0.9342105263157895
Accuracy: 0.93

Logistic Regression:
Accuracy for hd2_outlier_std.csv: 0.9407894736842105
Accuracy: 0.94

Logistic Regression:
Accuracy for hd3_outlier_percentile.csv: 0.9407894736842105
Accuracy: 0.94

Logistic Regression:
Accuracy for hd4_outlier_remove.csv: 0.9395973154362416
Accuracy: 0.94
