In [2]:
import numpy as np
from pandas import read_csv

In [3]:
titanic_data_train = read_csv('datasets/titanic/titanic_training.csv')
titanic_data_test = read_csv('datasets/titanic/titanic_testing_data.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'datasets/titanic/titanic_training.csv'

In [4]:
pip install pydot

Collecting pydot
  Downloading pydot-3.0.2-py3-none-any.whl.metadata (10 kB)
Downloading pydot-3.0.2-py3-none-any.whl (35 kB)
Installing collected packages: pydot
Successfully installed pydot-3.0.2
Note: you may need to restart the kernel to use updated packages.


In [1]:

from collections import Counter

import numpy as np
from numpy import genfromtxt
import scipy.io
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import cross_val_score
from pydot import graph_from_dot_data
import io

import random
random.seed(246810)
np.random.seed(246810)

eps = 1e-5  # a small number


class DecisionTree:

    def __init__(self, max_depth=3, feature_labels=None):
        self.max_depth = max_depth
        self.features = feature_labels
        self.left, self.right = None, None  # for non-leaf nodes
        self.split_idx, self.thresh = None, None  # for non-leaf nodes
        self.data, self.pred = None, None  # for leaf nodes

    @staticmethod
    def entropy(y):
        label_counts = Counter(y)
        probabilities = [count / len(y) for count in label_counts.values()]
        return -sum(p * np.log2(p) for p in probabilities if p > 0)
        # pass

    @staticmethod
    def information_gain(X, y, thresh):
        # TODO
        entropy_before = entropy(y)
        X_left, y_left, X_right, y_right = self.split(X, y, idx, thresh)
        entropy_after = (len(y_left) / len(y) * entropy(y_left) + 
                         len(y_right) / len(y) * entropy(y_right))
        return entropy_before - entropy_after

        # return np.random.rand()

    @staticmethod
    def gini_impurity(X, y, thresh):
        # TODO
        pass

    @staticmethod
    def gini_purification(X, y, thresh):
        # TODO
        pass

    def split(self, X, y, idx, thresh):
        X0, idx0, X1, idx1 = self.split_test(X, idx=idx, thresh=thresh)
        y0, y1 = y[idx0], y[idx1]
        return X0, y0, X1, y1

    def split_test(self, X, idx, thresh):
        idx0 = np.where(X[:, idx] < thresh)[0]
        idx1 = np.where(X[:, idx] >= thresh)[0]
        X0, X1 = X[idx0, :], X[idx1, :]
        return X0, idx0, X1, idx1

    def fit(self, X, y):
        # TODO
        self.root = self._fit_tree(X, y, depth=0)
        #pass
    def _fit_tree(self, X, y, depth):
        if depth >= self.max_depth or len(np.unique(y)) == 1:
            return {"is_leaf": True, "prediction": Counter(y).most_common(1)[0][0]}

        best_gain = -1
        best_feature = None
        best_thresh = None
        for idx in range(X.shape[1]):
            thresholds = np.unique(X[:, idx])
            for thresh in thresholds:
                gain = self.information_gain(X, y, idx, thresh)
                if gain > best_gain:
                    best_gain = gain
                    best_feature, best_thresh = idx, thresh

        if best_gain == 0:
            return {"is_leaf": True, "prediction": Counter(y).most_common(1)[0][0]}

        X_left, y_left, X_right, y_right = self.split(X, y, best_feature, best_thresh)

        left_subtree = self._fit_tree(X_left, y_left, depth + 1)
        right_subtree = self._fit_tree(X_right, y_right, depth + 1)

        return {"is_leaf": False, "split_feature": best_feature, 
                "split_thresh": best_thresh, "left": left_subtree, 
                "right": right_subtree}

    def predict(self, X):
        # TODO
        #pass
        return np.array([self._predict_sample(sample, self.tree) for sample in X])
    def _predict_sample(self, sample, node):
        if node["is_leaf"]:
            return node["prediction"]
        if sample[node["split_feature"]] < node["split_thresh"]:
            return self._predict_sample(sample, node["left"])
        else:
            return self._predict_sample(sample, node["right"])

    

    def __repr__(self):
        if self.max_depth == 0:
            return "%s (%s)" % (self.pred, self.labels.size)
        else:
            return "[%s < %s: %s | %s]" % (self.features[self.split_idx],
                                           self.thresh, self.left.__repr__(),
                                           self.right.__repr__())


class BaggedTrees(BaseEstimator, ClassifierMixin):

    def __init__(self, params=None, n=200):
        if params is None:
            params = {}
        self.params = params
        self.n = n
        self.decision_trees = [
            DecisionTreeClassifier(random_state=i, **self.params)
            for i in range(self.n)
        ]

    def fit(self, X, y):
        # TODO
        # pass
        for tree in self.decision_trees:
            indices = np.random.randint(0, len(X), len(X))
            tree.fit(X[indices], y[indices])

    def predict(self, X):
        # TODO
        # pass
        predictions = np.array([tree.predict(X) for tree in self.decision_trees])
        return np.apply_along_axis(lambda x: np.bincount(x, minlength=2).argmax(), axis=0, arr=predictions)



class RandomForest(BaggedTrees):

    def __init__(self, params=None, n=200, m=1):
        if params is None:
            params = {}
        params['max_features'] = m
        self.m = m
        super().__init__(params=params, n=n)


class BoostedRandomForest(RandomForest):

    def fit(self, X, y):
        # TODO
        #pass
        sample_weights = np.ones(len(X)) / len(X)
        for _ in range(self.num_boosting_rounds):
            forest = RandomForest(params=self.params, n=self.n, m=self.m)
            forest.fit(X, y)
        self.boosted_forests.append(forest)
    
    def predict(self, X):
        # TODO
        # pass
        predictions = np.array([forest.predict(X) for forest in self.boosted_forests])
        final_predictions = np.apply_along_axis(lambda x: np.bincount(x, minlength=2).argmax(), axis=0, arr=predictions)
        return final_predictions

def preprocess(data, fill_mode=True, min_freq=10, onehot_cols=[]):
    # fill_mode = False

    # Temporarily assign -1 to missing data
    data[data == b''] = '-1'

    # Hash the columns (used for handling strings)
    onehot_encoding = []
    onehot_features = []
    for col in onehot_cols:
        counter = Counter(data[:, col])
        for term in counter.most_common():
            if term[0] == b'-1':
                continue
            if term[-1] <= min_freq:
                break
            onehot_features.append(term[0])
            onehot_encoding.append((data[:, col] == term[0]).astype(float))
        data[:, col] = '0'
    onehot_encoding = np.array(onehot_encoding).T
    data = np.hstack(
        [np.array(data, dtype=float),
         np.array(onehot_encoding)])

    # Replace missing data with the mode value. We use the mode instead of
    # the mean or median because this makes more sense for categorical
    # features such as gender or cabin type, which are not ordered.
    if fill_mode:
        # TODO
        # pass
        for col in range(data.shape[1]):
            col_data = list(data[:, col])
            most_common = Counter(col_data).most_common(1)[0][0]
            data[:, col] = [x if x != -1 else most_common for x in col_data]

    return data, onehot_features


def evaluate(clf):
    print("Cross validation", cross_val_score(clf, X, y))
    if hasattr(clf, "decision_trees"):
        counter = Counter([t.tree_.feature[0] for t in clf.decision_trees])
        first_splits = [
            (features[term[0]], term[1]) for term in counter.most_common()
        ]
        print("First splits", first_splits)
def evaluate_classifier(clf, X, y, features):
    evaluate(clf)
    
    print("Cross-validation scores:", cross_val_score(clf, X, y, cv=5))
    
    if hasattr(clf, "decision_trees"):
        counter = Counter([tree.tree_.feature[0] for tree in clf.decision_trees])
        first_splits = [(features[term[0]], term[1]) for term in counter.most_common()]
        print("First splits using clf.decision_trees:", first_splits)
    else:
        if hasattr(clf, 'tree_'):
            first_split_feature_index = clf.tree_.feature[0]
            if first_split_feature_index != -2: 
                print("First split feature using clf.tree_:", features[first_split_feature_index])
            else:
                print("The tree is a stump.")
        else:
            print("Classifier does not have tree_ attribute. Cannot print first split feature.")


if __name__ == "__main__":
    dataset = "titanic"
    # dataset = "spam"
    params = {
        "max_depth": 5,
        # "random_state": 6,
        "min_samples_leaf": 10,
    }
    N = 100

    if dataset == "titanic":
        # Load titanic data
        path_train = 'datasets/titanic/titanic_training.csv'
        data = genfromtxt(path_train, delimiter=',', dtype=None)
        path_test = 'datasets/titanic/titanic_testing_data.csv'
        test_data = genfromtxt(path_test, delimiter=',', dtype=None)
        y = data[1:, 0]  # label = survived
        class_names = ["Died", "Survived"]

        labeled_idx = np.where(y != b'')[0]
        y = np.array(y[labeled_idx], dtype=float).astype(int)
        print("\n\nPart (b): preprocessing the titanic dataset")
        X, onehot_features = preprocess(data[1:, 1:], onehot_cols=[1, 5, 7, 8])
        X = X[labeled_idx, :]
        Z, _ = preprocess(test_data[1:, :], onehot_cols=[1, 5, 7, 8])
        assert X.shape[1] == Z.shape[1]
        features = list(data[0, 1:]) + onehot_features

    elif dataset == "spam":
        features = [
            "pain", "private", "bank", "money", "drug", "spam", "prescription",
            "creative", "height", "featured", "differ", "width", "other",
            "energy", "business", "message", "volumes", "revision", "path",
            "meter", "memo", "planning", "pleased", "record", "out",
            "semicolon", "dollar", "sharp", "exclamation", "parenthesis",
            "square_bracket", "ampersand"
        ]
        assert len(features) == 32

        # Load spam data
        path_train = 'datasets/spam_data/spam_data.mat'
        data = scipy.io.loadmat(path_train)
        X = data['training_data']
        y = np.squeeze(data['training_labels'])
        Z = data['test_data']
        class_names = ["Ham", "Spam"]

    else:
        raise NotImplementedError("Dataset %s not handled" % dataset)

    print("Features", features)
    print("Train/test size", X.shape, Z.shape)
    
    print("\n\nPart 0: constant classifier")
    print("Accuracy", 1 - np.sum(y) / y.size)

    # sklearn decision tree
    print("\n\nsklearn's decision tree")
    clf = DecisionTreeClassifier(random_state=0, **params)
    clf.fit(X, y)
    evaluate(clf)
    out = io.StringIO()
    export_graphviz(
        clf, out_file=out, feature_names=features, class_names=class_names)
    # For OSX, may need the following for dot: brew install gprof2dot
    graph = graph_from_dot_data(out.getvalue())
    graph_from_dot_data(out.getvalue())[0].write_pdf("%s-tree.pdf" % dataset)
    
    # TODO
    print("\n\nsklearn's decision tree")
    clf = DecisionTreeClassifier(random_state=0, **params)
    clf.fit(X, y)
    evaluate_classifier(clf, X, y, features)
print("done")

ModuleNotFoundError: No module named 'sklearn'

<font size="+2">Titanic</font>

In [7]:
titanic_data_train.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,0.0,3.0,male,,0.0,0.0,359309,8.05,,S
1,1.0,1.0,female,23.0,1.0,0.0,35273,113.275,D36,C
2,0.0,3.0,,38.0,0.0,0.0,SOTON/O.Q. 3101306,7.05,,S
3,0.0,2.0,male,,0.0,0.0,SC/A.3 2861,15.5792,,C
4,0.0,3.0,,,0.0,0.0,349227,7.8958,,S


In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

def preprocess_data(data):
    data['FamilySize'] = data['sibsp'] + data['parch'] + 1
    
    data['IsAlone'] = 1 
    data['IsAlone'].loc[data['FamilySize'] > 1] = 0 
    
    data['FarePerPerson'] = data['fare'] / data['FamilySize']
    
    data = data.drop(['sibsp', 'parch', 'ticket'], axis=1)

    numerical_features = ['age', 'fare']
    categorical_features = ['sex', 'embarked']
    
    numerical_transformer = SimpleImputer(strategy='median')
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    
    X = data.drop('survived', axis=1)
    y = data['survived'].copy()
    
    X_transformed = preprocessor.fit_transform(X)
    y = y.astype(int)  
    
    return X_transformed, y, preprocessor

X, y, preprocessor = preprocess_data(titanic_data_train)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train, y_train)

rf_clf = RandomForestClassifier(random_state=42, n_estimators=100)
rf_clf.fit(X_train, y_train)

def evaluate_model(clf, X_train, X_val, y_train, y_val):
    train_preds = clf.predict(X_train)
    val_preds = clf.predict(X_val)
    train_acc = accuracy_score(y_train, train_preds)
    val_acc = accuracy_score(y_val, val_preds)
    print(f"Training Accuracy: {train_acc}, Validation Accuracy: {val_acc}")

print("Decision Tree:")
evaluate_model(dt_clf, X_train, X_val, y_train, y_val)

print("Random Forest:")
evaluate_model(rf_clf, X_train, X_val, y_train, y_val)

Decision Tree:
Training Accuracy: 0.9739776951672863, Validation Accuracy: 0.7673267326732673
Random Forest:
Training Accuracy: 0.9739776951672863, Validation Accuracy: 0.801980198019802


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['IsAlone'].loc[data['FamilySize'] > 1] = 0


In [48]:
from sklearn.model_selection import GridSearchCV
param_grid_rf = {
    'n_estimators': [100, 200, 300],  
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]  
}

grid_search_rf = GridSearchCV(estimator=RandomForestClassifier(random_state=42), 
                              param_grid=param_grid_rf, 
                              cv=5, 
                              scoring='accuracy', 
                              n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

print("Best hyperparameters for Random Forest:", grid_search_rf.best_params_)

Best hyperparameters for Random Forest: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}


In [51]:
rf_clf_optimized = grid_search_rf.best_estimator_
# print("Optimized Decision Tree:")
# evaluate_model(dt_clf_optimized, X_train, X_val, y_train, y_val)

print("Optimized Random Forest:")
evaluate_model(rf_clf_optimized, X_train, X_val, y_train, y_val)

Optimized Random Forest:
Training Accuracy: 0.8847583643122676, Validation Accuracy: 0.8267326732673267


In [52]:
rf_optimized = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=10,
    max_features='sqrt',
    random_state=42
)

rf_optimized.fit(X_train, y_train)

val_preds = rf_optimized.predict(X_val)
val_accuracy = accuracy_score(y_val, val_preds)
print(f"Validation Accuracy of Optimized Model: {val_accuracy}")


Validation Accuracy of Optimized Model: 0.8267326732673267


In [55]:
X_test_transformed = preprocessor.transform(titanic_data_test)
predictions_titanic = rf_optimized.predict(X_test_transformed)
def results_to_csv(y_test):
    y_test = y_test.astype(int)
    df = pd.DataFrame({'Category': y_test})
    df.index += 1 # Ensures that the index starts at 1
    df.to_csv('submission_rf_titanic.csv', index_label='Id')
results_to_csv(predictions_titanic)
print("done")

done


In [39]:
X_test_transformed = preprocessor.transform(titanic_data_test)
print("done")

done


In [40]:
predictions_rf = rf_clf.predict(X_test_transformed)
print("done")

done


In [41]:
predictions_dt = dt_clf.predict(X_test_transformed)
print("done")

done


In [42]:
def results_to_csv(y_test):
    y_test = y_test.astype(int)
    df = pd.DataFrame({'Category': y_test})
    df.index += 1 # Ensures that the index starts at 1
    df.to_csv('submission_decision_tree.csv', index_label='Id')
results_to_csv(predictions_dt)



In [28]:
spam_data = scipy.io.loadmat('datasets/spam_data/spam_data.mat')

In [29]:
spam_data

{'__header__': b'MATLAB 5.0 MAT-file Platform: posix, Created on: Sun Mar  3 21:01:43 2024',
 '__version__': '1.0',
 '__globals__': [],
 'training_data': array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 2., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.]]),
 'test_data': array([[0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 2., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 3.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 'training_labels': array([[1., 0., 0., ..., 1., 1., 0.]])}

In [30]:
test_data = spam_data['test_data']

training_labels = spam_data['training_labels'].reshape(-1)

In [31]:
features = [
            "pain", "private", "bank", "money", "drug", "spam", "prescription",
            "creative", "height", "featured", "differ", "width", "other",
            "energy", "business", "message", "volumes", "revision", "path",
            "meter", "memo", "planning", "pleased", "record", "out",
            "semicolon", "dollar", "sharp", "exclamation", "parenthesis",
            "square_bracket", "ampersand"
        ]
assert len(features) == 32

# Load spam data
path_train = 'datasets/spam_data/spam_data.mat'
data = scipy.io.loadmat(path_train)
X = data['training_data']
y = np.squeeze(data['training_labels'])
Z = data['test_data']

print("done")

done


In [32]:
from sklearn.model_selection import train_test_split
import scipy.io
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train, y_train)

rf_clf = RandomForestClassifier(random_state=42, n_estimators=100)
rf_clf.fit(X_train, y_train)

def evaluate_model(clf, X_train, X_val, y_train, y_val):
    train_preds = clf.predict(X_train)
    val_preds = clf.predict(X_val)
    train_acc = accuracy_score(y_train, train_preds)
    val_acc = accuracy_score(y_val, val_preds)
    print(f"Model: {clf.__class__.__name__}\nTraining Accuracy: {train_acc}\nValidation Accuracy: {val_acc}\n")

print("Decision Tree:")
evaluate_model(dt_clf, X_train, X_val, y_train, y_val)

print("Random Forest:")
evaluate_model(rf_clf, X_train, X_val, y_train, y_val)

Decision Tree:
Model: DecisionTreeClassifier
Training Accuracy: 0.9011769931157007
Validation Accuracy: 0.80550621669627

Random Forest:
Model: RandomForestClassifier
Training Accuracy: 0.9011769931157007
Validation Accuracy: 0.8259325044404974



In [35]:
predictions_rf_spam = rf_clf.predict(Z)
print("Predictions from Random Forest done.")

predictions_dt_spam = dt_clf.predict(Z)
print("Predictions from Decision Tree done.")

Predictions from Random Forest done.
Predictions from Decision Tree done.


In [37]:
def results_to_csv(y_test):
    y_test = y_test.astype(int)
    df = pd.DataFrame({'Category': y_test})
    df.index += 1 # Ensures that the index starts at 1
    df.to_csv('submission_spam_decsion_tree.csv', index_label='Id')
results_to_csv(predictions_dt_spam)