In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/competitions/titanic/train.csv
/kaggle/input/competitions/titanic/test.csv
/kaggle/input/competitions/titanic/gender_submission.csv


# ****Import Data****

In [2]:
train_data = pd.read_csv("/kaggle/input/competitions/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/competitions/titanic/test.csv")
train_data.info()
train_data.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
class SexAgeCombiner(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.age_bins = [0, 2, 18, 65, 100]
        self.category_names = ['Infant', 'Child', 'Adult', 'Senior']
        self.median_age = None
    def fit(self, X, y=None):
        X_df = pd.DataFrame(X)
        self.median_age = X_df.iloc[:, 1].median() 
        return self
    def transform(self, X):
        X_df = pd.DataFrame(X)
        X_df.iloc[:, 1] = X_df.iloc[:, 1].fillna(self.median_age)
        age_groups = pd.cut(X_df.iloc[:, 1], bins = self.age_bins, labels = self.category_names)
        combined_features = X_df.iloc[:, 0].astype(str) + '_' + age_groups.astype(str)
        return combined_features.values.reshape(-1,1)

In [4]:
class FamilyGroupBinner(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.size_bins = [0, 1, 4, 6, 15]
        self.fam_category_names = ['Single', 'SmallFam', 'MidFam', 'LargeFam']
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_df = pd.DataFrame(X)
        fam_size = X_df.iloc[:, 0] + X_df.iloc[:, 1] + 1
        fam_group = pd.cut(fam_size, bins = self.size_bins, labels = self.fam_category_names)
        return fam_group.values.astype(str).reshape(-1, 1)

In [5]:
class FareBinner(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.fare_bins = [-1, 20, 50, 100, 150, 200, 1000]
        self.fare_cate_names = ['VeryPoor', 'Poor', 'Normal', 'Middle', 'UpperMiddle', 'Rich']
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X_df = pd.DataFrame(X)
        fare_cate = pd.cut(X_df.iloc[:, 0], bins=self.fare_bins, labels=self.fare_cate_names)
        return fare_cate.values.astype(str).reshape(-1, 1)

In [6]:
fare_bins = [0, 20, 50, 100, 150, 200, 300]
train_data['fare_cate'] = pd.cut(train_data['Fare'], bins = fare_bins)
print(train_data['fare_cate'].value_counts())
print(train_data[['fare_cate', 'Survived']].groupby('fare_cate').mean())

fare_cate
(0, 20]       500
(20, 50]      216
(50, 100]     107
(100, 150]     24
(200, 300]     17
(150, 200]      9
Name: count, dtype: int64
            Survived
fare_cate           
(0, 20]     0.284000
(20, 50]    0.416667
(50, 100]   0.654206
(100, 150]  0.791667
(150, 200]  0.666667
(200, 300]  0.647059


  print(train_data[['fare_cate', 'Survived']].groupby('fare_cate').mean())


In [7]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

sex_age_pipe = Pipeline(
    steps=[
        ('combiner', SexAgeCombiner()),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

fam_size_pipe = Pipeline(
    steps=[
        ('fam_group', FamilyGroupBinner()),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)

fare_pipe = Pipeline(
    steps=[
        ('fare_group', FareBinner()),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ]
)
numerical_transformer = SimpleImputer(strategy='median')

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')) 
])

preprocessor = ColumnTransformer(
    transformers=[
        ('sex_age', sex_age_pipe, ['Sex', 'Age']),
        ('fam', fam_size_pipe, ['Parch', 'SibSp']),
        ('fare', fare_pipe, ['Fare']), 
        ('cat', categorical_transformer, ['Pclass', 'Embarked'])
    ])

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

# 1. Define the 3 voters (We leave the parameters blank so GridSearch can fill them!)
model_1 = RandomForestClassifier(random_state=42)
model_2 = XGBClassifier(random_state=42, eval_metric='logloss')
model_3 = LogisticRegression(max_iter=1000, random_state=42)

# 2. Create the Committee
ensemble_model = VotingClassifier(
    estimators=[
        ('rf', model_1), 
        ('xgb', model_2), 
        ('log_reg', model_3)
    ], 
    voting='hard' 
)

# 3. Put the Committee in your Pipeline
# (Assuming you still have your 'preprocessor' from earlier)
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor), 
    ('committee', ensemble_model)   
])

# 4. Set up the Grid Search Parameters using the "Double Underscore" rule!
param_grid = {
    # Tune the Random Forest
    'committee__rf__n_estimators': [100, 300],
    'committee__rf__max_depth': [5, 7],
    
    # Tune the XGBoost
    'committee__xgb__learning_rate': [0.05, 0.1],
    'committee__xgb__max_depth': [3, 5],
    
    # Tune the Logistic Regression ('C' is how strictly it draws its lines)
    'committee__log_reg__C': [0.1, 1.0]
}

# 5. Run the Search!
print("Starting the massive Ensemble Grid Search...")
# cv=5 means 5-fold cross-validation
grid_search = GridSearchCV(my_pipeline, param_grid, cv=5, scoring="accuracy", n_jobs=-1)

# 1. The complete shopping list of raw ingredients for the pipeline
features = ['Sex', 'Age', 'Fare', 'Pclass', 'Embarked', 'Parch', 'SibSp']

# 2. Slice your dataframes to contain ONLY those columns
X = train_data[features]
y = train_data["Survived"]

# Do the exact same for the test data
X_test = test_data[features]

# 3. Feed it to the hungry pipeline!
grid_search.fit(X, y)

print(f"Best Ensemble Score: {grid_search.best_score_:.4f}")
print("Best Parameters:", grid_search.best_params_)

Starting the massive Ensemble Grid Search...
Best Ensemble Score: 0.8092
Best Parameters: {'committee__log_reg__C': 1.0, 'committee__rf__max_depth': 5, 'committee__rf__n_estimators': 100, 'committee__xgb__learning_rate': 0.1, 'committee__xgb__max_depth': 5}


In [9]:
from sklearn.model_selection import cross_val_score

# Assuming you have already defined your 'my_pipeline', 'X', and 'y'
print("Running Cross-Validation...")

# cv=5 means 5-fold cross-validation
# n_jobs=-1 makes it run as fast as possible
scores = cross_val_score(my_pipeline, X, y, cv=5, scoring='accuracy', n_jobs=-1)

# Print the results
print("Scores for each of the 5 folds:", scores)
print(f"Average CV Score (Expected Kaggle Score): {scores.mean():.4f}")

Running Cross-Validation...
Scores for each of the 5 folds: [0.78212291 0.80337079 0.83146067 0.82022472 0.80898876]
Average CV Score (Expected Kaggle Score): 0.8092


In [10]:
predictions = grid_search.predict(X_test)
output = pd.DataFrame({"PassengerId": test_data["PassengerId"], "Survived": predictions})
output.to_csv("submission.csv", index=False)
print("Saved Ensemble submission file!")

Saved Ensemble submission file!


In [11]:
train_preds = grid_search.predict(X)

mistakes = train_data.copy()
mistakes['Predicted'] = train_preds
mistakes = mistakes[mistakes['Predicted'] != mistakes['Survived']]

print(f"Total Mistakes: {len(mistakes)} out of {len(train_data)}")

my_tool01 = SexAgeCombiner()
my_tool01.fit(train_data[['Sex', 'Age']])
mistakes['Sex_Age_Group'] = my_tool01.transform(mistakes[['Sex', 'Age']]).flatten()

my_tool02 = FamilyGroupBinner()
my_tool01.fit(train_data[['Parch', 'SibSp']])
mistakes['Fam_Size'] = my_tool02.transform(mistakes[['Parch', 'SibSp']])

# 5. Now you can print your exact list of features!
mistakes[['Name', 'Sex_Age_Group', 'Fare','Pclass', 'Fam_Size', 'Survived', 'Predicted']].head(20)

Total Mistakes: 149 out of 891


Unnamed: 0,Name,Sex_Age_Group,Fare,Pclass,Fam_Size,Survived,Predicted
2,"Heikkinen, Miss. Laina",female_Adult,7.925,3,Single,1,0
14,"Vestrom, Miss. Hulda Amanda Adolfina",female_Child,7.8542,3,Single,0,1
17,"Williams, Mr. Charles Eugene",male_Adult,13.0,2,Single,1,0
18,"Vander Planke, Mrs. Julius (Emelia Maria Vande...",female_Adult,18.0,3,SmallFam,0,1
21,"Beesley, Mr. Lawrence",male_Adult,13.0,2,Single,1,0
23,"Sloper, Mr. William Thompson",male_Adult,35.5,1,Single,1,0
25,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...",female_Adult,31.3875,3,LargeFam,1,0
34,"Meyer, Mr. Edgar Joseph",male_Adult,82.1708,1,SmallFam,0,1
36,"Mamee, Mr. Hanna",male_Adult,7.2292,3,Single,1,0
38,"Vander Planke, Miss. Augusta Maria",female_Child,18.0,3,SmallFam,0,1
