## Decision Tree Classifier

In [57]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.impute import SimpleImputer
np.random.seed(0)

dataset_files = ['titanic2_iqr_median.csv',
                 'titanic4_stdmean_median.csv',
                 'titanic5_percentile_median.csv',
                 'titanic6_percentile_remove.csv']

for dataset_file in dataset_files:
    df = pd.read_csv(dataset_file)

    X = df.drop('Survived', axis=1)  
    y = df['Survived']
    X_train, X_test, y_train, y_test = train_test_split(X, y, 
    test_size=0.3, random_state=42)
    
    imputer = SimpleImputer(strategy='median')
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)
 
    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train_imputed, y_train)

    y_pred = model.predict(X_test_imputed)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print(f"\nDecision Tree Classifier:")
    print(f'confusion matrix:')
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print(f"Accuracy for {dataset_file}: {accuracy}")
    print(f'Accuracy: {accuracy:.2f}')
    print(f'F1_Score: {f1:.2f}')



Decision Tree Classifier:
confusion matrix:
[[124  35]
 [ 26  82]]
              precision    recall  f1-score   support

           0       0.83      0.78      0.80       159
           1       0.70      0.76      0.73       108

    accuracy                           0.77       267
   macro avg       0.76      0.77      0.77       267
weighted avg       0.78      0.77      0.77       267

Accuracy for titanic2_iqr_median.csv: 0.7715355805243446
Accuracy: 0.77
F1_Score: 0.73

Decision Tree Classifier:
confusion matrix:
[[127  32]
 [ 28  80]]
              precision    recall  f1-score   support

           0       0.82      0.80      0.81       159
           1       0.71      0.74      0.73       108

    accuracy                           0.78       267
   macro avg       0.77      0.77      0.77       267
weighted avg       0.78      0.78      0.78       267

Accuracy for titanic4_stdmean_median.csv: 0.7752808988764045
Accuracy: 0.78
F1_Score: 0.73

Decision Tree Classifier:
confu

## Random Forest Classifier

In [54]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np
import random
np.random.seed(0)

dataset_files = ['titanic2_iqr_median.csv',
                 'titanic4_stdmean_median.csv',
                 'titanic5_percentile_median.csv',
                 'titanic6_percentile_remove.csv']

for dataset_file in dataset_files:
    df = pd.read_csv(dataset_file)
    
    X = df.drop('Survived', axis=1)  
    y = df['Survived']
    X_train, X_test, y_train, y_test = train_test_split(X, y, 
    test_size=0.3, random_state=42)
    
    imputer = SimpleImputer(strategy='median')
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)

    model = RandomForestClassifier()
    model.fit(X_train_imputed, y_train)

    y_pred = model.predict(X_test_imputed)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    print(f"\nRandom Forest Classifier:")
    print(f'confusion matrix:')
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print(f"{dataset_file}")
    print(f'Accuracy: {accuracy:.2f}')
    print(f'F1_Score: {f1:.2f}')


Random Forest Classifier:
confusion matrix:
[[131  28]
 [ 29  79]]
              precision    recall  f1-score   support

           0       0.82      0.82      0.82       159
           1       0.74      0.73      0.73       108

    accuracy                           0.79       267
   macro avg       0.78      0.78      0.78       267
weighted avg       0.79      0.79      0.79       267

titanic2_iqr_median.csv
Accuracy: 0.79
F1_Score: 0.73

Random Forest Classifier:
confusion matrix:
[[131  28]
 [ 35  73]]
              precision    recall  f1-score   support

           0       0.79      0.82      0.81       159
           1       0.72      0.68      0.70       108

    accuracy                           0.76       267
   macro avg       0.76      0.75      0.75       267
weighted avg       0.76      0.76      0.76       267

titanic4_stdmean_median.csv
Accuracy: 0.76
F1_Score: 0.70

Random Forest Classifier:
confusion matrix:
[[132  27]
 [ 32  76]]
              precision    rec

## Logistic Regression

In [55]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import random
np.random.seed(0)

dataset_files = ['titanic2_iqr_median.csv',
                 'titanic4_stdmean_median.csv',
                 'titanic5_percentile_median.csv',
                 'titanic6_percentile_remove.csv']

for dataset_file in dataset_files:
    df = pd.read_csv(dataset_file)
    X = df.drop('Survived', axis=1) 
    y = df['Survived']

    X_train, X_test, y_train, y_test = train_test_split(X, y, 
    test_size=0.3, random_state=42)
    
    imputer = SimpleImputer(strategy='median')
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)
    
    model = LogisticRegression()
    model.fit(X_train_imputed, y_train)

    y_pred = model.predict(X_test_imputed)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    print(f"\nLogistic Regression")
    print(f'confusion matrix:')
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print(f"Accuracy for {dataset_file}: {accuracy}")
    print(f'Accuracy: {accuracy:.2f}')
    print(f'F1_Score: {f1:.2f}')
    


Logistic Regression
confusion matrix:
[[132  27]
 [ 36  72]]
              precision    recall  f1-score   support

           0       0.79      0.83      0.81       159
           1       0.73      0.67      0.70       108

    accuracy                           0.76       267
   macro avg       0.76      0.75      0.75       267
weighted avg       0.76      0.76      0.76       267

Accuracy for titanic2_iqr_median.csv: 0.7640449438202247
Accuracy: 0.76
F1_Score: 0.70

Logistic Regression
confusion matrix:
[[132  27]
 [ 34  74]]
              precision    recall  f1-score   support

           0       0.80      0.83      0.81       159
           1       0.73      0.69      0.71       108

    accuracy                           0.77       267
   macro avg       0.76      0.76      0.76       267
weighted avg       0.77      0.77      0.77       267

Accuracy for titanic4_stdmean_median.csv: 0.7715355805243446
Accuracy: 0.77
F1_Score: 0.71

Logistic Regression
confusion matrix:
[[135