In [1]:
#import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import seaborn as sns
from scipy.stats import mode
import time

In [2]:
# read data
predictions = pd.read_csv('/Users/huiyisang/Desktop/gender_submission.csv')
train = pd.read_csv('/Users/huiyisang/Desktop/train.csv')
test = pd.read_csv('/Users/huiyisang/Desktop/test.csv')

In [3]:
data = sns.load_dataset('titanic')

In [4]:
df = data.drop(['deck','class','who','adult_male','embark_town','alive','alone'],axis=1)

In [5]:
df['age'] = df['age'].fillna(df.groupby('sex')['age'].transform('mean'))

In [6]:
df['fam'] = df['parch'] + df['sibsp']

In [7]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,fam
0,0,3,male,22.0,1,0,7.25,S,1
1,1,1,female,38.0,1,0,71.2833,C,1
2,1,3,female,26.0,0,0,7.925,S,0
3,1,1,female,35.0,1,0,53.1,S,1
4,0,3,male,35.0,0,0,8.05,S,0


In [8]:
df = pd.get_dummies(df, columns=['sex','embarked','pclass'], dtype=int)

In [9]:
y = df['survived']
X = df.drop('survived', axis=1)

In [10]:
all_columns = df.columns.to_list()
random_columns = [np.random.choice(all_columns, 3, replace= True).tolist() for i in range (10)]

In [11]:
random_columns

[['sibsp', 'fam', 'survived'],
 ['pclass_2', 'embarked_C', 'embarked_C'],
 ['fare', 'sibsp', 'sex_male'],
 ['age', 'pclass_2', 'parch'],
 ['age', 'sibsp', 'survived'],
 ['sibsp', 'sex_female', 'sex_female'],
 ['embarked_S', 'survived', 'sibsp'],
 ['sex_male', 'sibsp', 'pclass_2'],
 ['pclass_3', 'pclass_1', 'age'],
 ['pclass_2', 'pclass_2', 'embarked_Q']]

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
#customize the ensamble function
def build_decision_trees(X_train, y_train, num_models=10, num_columns=3):
    all_columns = X_train.columns.to_list()
    models = []
    for i in range(num_models):
        selected_columns = np.random.choice(all_columns, num_columns, replace=True).tolist()
        X_train_subset = X_train[selected_columns]
        tree = DecisionTreeClassifier(random_state=42)
        tree.fit(X_train_subset, y_train)
        models.append((tree, selected_columns))
    return models

In [14]:
models = build_decision_trees(X_train, y_train, num_models=10, num_columns=3)

In [15]:
models

[(DecisionTreeClassifier(random_state=42), ['age', 'fam', 'age']),
 (DecisionTreeClassifier(random_state=42), ['fam', 'pclass_3', 'sex_female']),
 (DecisionTreeClassifier(random_state=42),
  ['embarked_S', 'embarked_S', 'sex_female']),
 (DecisionTreeClassifier(random_state=42), ['fare', 'sex_female', 'age']),
 (DecisionTreeClassifier(random_state=42), ['age', 'pclass_2', 'embarked_C']),
 (DecisionTreeClassifier(random_state=42),
  ['embarked_S', 'age', 'sex_female']),
 (DecisionTreeClassifier(random_state=42),
  ['embarked_Q', 'embarked_C', 'parch']),
 (DecisionTreeClassifier(random_state=42), ['pclass_2', 'fare', 'fam']),
 (DecisionTreeClassifier(random_state=42), ['parch', 'fam', 'pclass_2']),
 (DecisionTreeClassifier(random_state=42),
  ['sibsp', 'sex_female', 'pclass_2'])]

In [16]:
def measure_accuracy(models, X_train, y_train):
    """
    Measure accuracy of the models on the training data.
    
    Parameters:
    - models: List of models and their selected columns.
    - X_train: Training feature data.
    - y_train: True training labels.
    
    Returns:
    - Overall accuracy of the ensemble model.
    """
    # Store predictions for each model
    predictions = []
    
    for tree, selected_columns in models:
        # Use the selected columns to predict on the training data
        X_train_subset = X_train[selected_columns]
        pred = tree.predict(X_train_subset)
        predictions.append(pred)
    
    # Transpose predictions to get predictions for each sample across all models
    predictions = np.array(predictions).T
    
    # Majority vote across the 10 models for each sample
    final_predictions, _ = mode(predictions, axis=1)
    final_predictions = final_predictions.flatten()  # Convert to 1D array
    
    # Measure accuracy
    accuracy = accuracy_score(y_train, final_predictions)
    return accuracy

# Measure accuracy on the training data
train_accuracy = measure_accuracy(models, X_train, y_train)
print(f"Training Accuracy: {train_accuracy:.2f}")

Training Accuracy: 0.86


In [17]:
def majority_vote_prediction(models, X_test):
    """
    Apply the trained models on the test data and perform majority vote for final prediction.
    
    Parameters:
    - models: List of trained models and their selected columns.
    - X_test: Test feature data.
    
    Returns:
    - Final predictions based on majority voting.
    """
    # Store predictions for each model
    predictions = []
    
    for tree, selected_columns in models:
        # Use the selected columns to predict on the test data
        X_test_subset = X_test[selected_columns]
        pred = tree.predict(X_test_subset)
        predictions.append(pred)
    
    # Transpose predictions to get predictions for each sample across all models
    predictions = np.array(predictions).T
    
    # Majority vote across the models for each test sample
    final_predictions, _ = mode(predictions, axis=1)
    final_predictions = final_predictions.flatten()  # Convert to 1D array
    
    return final_predictions

# Make predictions on the test data
final_test_predictions = majority_vote_prediction(models, X_test)

# Measure accuracy on the test data
test_accuracy = accuracy_score(y_test, final_test_predictions)
print(f"Test Accuracy: {test_accuracy:.2f}")

Test Accuracy: 0.77
