In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report


In [30]:
# Load the Titanic dataset
df = pd.read_csv("Titanic.csv")

In [31]:
df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Survived
0,1,3,Allison Hill,male,17,4,2,43d75413-a939-4bd1-a516-b0d47d3572cc,144.08,Q,1
1,2,1,Noah Rhodes,male,60,2,2,6334fa2a-8b4b-47e7-a451-5ae01754bf08,249.04,S,0
2,3,3,Angie Henderson,male,64,0,0,61a66444-e2af-4629-9efb-336e2f546033,50.31,Q,1
3,4,3,Daniel Wagner,male,35,4,0,0b6c03c8-721e-4419-afc3-e6495e911b91,235.2,C,1
4,5,1,Cristian Santos,female,70,0,3,436e3c49-770e-49db-b092-d40143675d58,160.17,C,1


In [32]:
# Preprocessing
# Drop irrelevant columns
df = df.drop(['PassengerId', 'Name', 'Ticket'], axis=1)

In [33]:
df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,3,male,17,4,2,144.08,Q,1
1,1,male,60,2,2,249.04,S,0
2,3,male,64,0,0,50.31,Q,1
3,3,male,35,4,0,235.2,C,1
4,1,female,70,0,3,160.17,C,1


In [34]:
# Handle missing values
df['Age'].fillna(df['Age'].median(), inplace=True)
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


In [35]:
df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,3,male,17,4,2,144.08,Q,1
1,1,male,60,2,2,249.04,S,0
2,3,male,64,0,0,50.31,Q,1
3,3,male,35,4,0,235.2,C,1
4,1,female,70,0,3,160.17,C,1


In [36]:
# Separate features and target
X = df.drop('Survived', axis=1)
y = df['Survived']

In [37]:
# Encode categorical variables
categorical_features = ['Sex', 'Embarked']
numerical_features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

In [38]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [39]:
# Train the model WITHOUT RFE
# Create a pipeline for preprocessing and modeling
model_without_rfe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

In [40]:
# Fit the model
model_without_rfe.fit(X_train, y_train)

In [41]:
# Evaluate the model
y_pred_without_rfe = model_without_rfe.predict(X_test)
accuracy_without_rfe = accuracy_score(y_test, y_pred_without_rfe)
print("Model WITHOUT RFE:")
print(f"Accuracy: {accuracy_without_rfe:.2f}")
print(classification_report(y_test, y_pred_without_rfe))

Model WITHOUT RFE:
Accuracy: 0.56
              precision    recall  f1-score   support

           0       0.58      0.59      0.59       106
           1       0.53      0.52      0.53        94

    accuracy                           0.56       200
   macro avg       0.56      0.56      0.56       200
weighted avg       0.56      0.56      0.56       200



In [42]:
# Train the model WITH RFE
# Create a pipeline for preprocessing, RFE, and modeling
model_with_rfe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('rfe', RFE(LogisticRegression(max_iter=1000), n_features_to_select=5)),  # Select top 5 features
    ('classifier', LogisticRegression(max_iter=1000))
])

In [43]:
# Fit the model
model_with_rfe.fit(X_train, y_train)

In [44]:
# Evaluate the model
y_pred_with_rfe = model_with_rfe.predict(X_test)
accuracy_with_rfe = accuracy_score(y_test, y_pred_with_rfe)
print("Model WITH RFE:")
print(f"Accuracy: {accuracy_with_rfe:.2f}")
print(classification_report(y_test, y_pred_with_rfe))

Model WITH RFE:
Accuracy: 0.56
              precision    recall  f1-score   support

           0       0.58      0.58      0.58       106
           1       0.53      0.53      0.53        94

    accuracy                           0.56       200
   macro avg       0.56      0.56      0.56       200
weighted avg       0.56      0.56      0.56       200



In [45]:
# Display selected features
selected_features = []
for i, feature in enumerate(X.columns):
    if model_with_rfe.named_steps['rfe'].support_[i]:
        selected_features.append(feature)
print("Selected Features by RFE:", selected_features)

Selected Features by RFE: ['Sex', 'Age', 'SibSp']
