In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from feature_engine.selection import SelectByShuffling


In [3]:
def load_titanic(filepath='titanic.csv'):
    data = pd.read_csv(filepath)
    data = data.replace('?', np.nan)
    data['cabin'] = data['cabin'].astype(str).str[0]
    data['pclass'] = data['pclass'].astype('O')
    data['age'] = data['age'].astype('float').fillna(data.age.median())
    data['fare'] = data['fare'].astype('float').fillna(data.fare.median())
    data['embarked'].fillna('C', inplace=True)
    return data

In [4]:
df = load_titanic("../data/titanic-2/Titanic-Dataset.csv")
df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['embarked'].fillna('C', inplace=True)


Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,n,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,n,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,n,S


In [5]:
X = df.drop(columns=['survived', 'name', 'passengerid', 'ticket'])
X = pd.get_dummies(X, columns=['sex', 'cabin', 'embarked'], drop_first=True)
X = X.fillna(X.median())
y = df['survived']

# Define model
model = LogisticRegression(max_iter=1000)

# Apply feature shuffling
shuffle = SelectByShuffling(estimator=model, scoring='accuracy', cv=3, threshold=0.01)
X_shuffle = shuffle.fit_transform(X, y)
print("Original shape:", X.shape)
print("After SelectByShuffling:", X_shuffle.shape)

  X = X.fillna(X.median())


Original shape: (891, 16)
After SelectByShuffling: (891, 15)
