In [1]:
import pandas as pd
import seaborn as sns

In [2]:
df = sns.load_dataset('titanic')

In [3]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [4]:
df.shape


(891, 15)

In [5]:
df.drop(['class','who','embark_town','alive','alone'],axis=1,inplace=True)

In [6]:
df.shape

(891, 10)

In [7]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,adult_male,deck
0,0,3,male,22.0,1,0,7.25,S,True,
1,1,1,female,38.0,1,0,71.2833,C,False,C
2,1,3,female,26.0,0,0,7.925,S,False,
3,1,1,female,35.0,1,0,53.1,S,False,C
4,0,3,male,35.0,0,0,8.05,S,True,


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   survived    891 non-null    int64   
 1   pclass      891 non-null    int64   
 2   sex         891 non-null    object  
 3   age         714 non-null    float64 
 4   sibsp       891 non-null    int64   
 5   parch       891 non-null    int64   
 6   fare        891 non-null    float64 
 7   embarked    889 non-null    object  
 8   adult_male  891 non-null    bool    
 9   deck        203 non-null    category
dtypes: bool(1), category(1), float64(2), int64(4), object(2)
memory usage: 57.9+ KB


In [9]:
df['age'].fillna(df['age'].median(),inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].median(),inplace=True)


In [10]:
df['embarked'].fillna(df['embarked'].value_counts().idxmax(),inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['embarked'].fillna(df['embarked'].value_counts().idxmax(),inplace=True)


In [11]:
# Encode categorical variables
df['sex'] = df['sex'].map({'male': 0, 'female': 1})

df = pd.get_dummies(df, columns=['deck', 'embarked'], drop_first=True, dummy_na=True)

df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,adult_male,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,deck_nan,embarked_Q,embarked_S,embarked_nan
0,0,3,0,22.0,1,0,7.25,True,False,False,False,False,False,False,True,False,True,False
1,1,1,1,38.0,1,0,71.2833,False,False,True,False,False,False,False,False,False,False,False
2,1,3,1,26.0,0,0,7.925,False,False,False,False,False,False,False,True,False,True,False
3,1,1,1,35.0,1,0,53.1,False,False,True,False,False,False,False,False,False,True,False
4,0,3,0,35.0,0,0,8.05,True,False,False,False,False,False,False,True,False,True,False


In [12]:
# Feature engineering: log transform for skewed fare
import numpy as np
df['Fare_log'] = np.log1p(df['fare'])

# Drop raw fare column
df.drop('fare', axis=1, inplace=True)


In [13]:
# Split features and target
X = df.drop('survived', axis=1)
y = df['survived']



In [14]:
# Train-test split
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2 ,random_state=42)




In [15]:
# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)




In [16]:
# Train logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)


# Evaluate initial model
y_pred = model.predict(X_test_scaled)
print("Initial Accuracy:", accuracy_score(y_test, y_pred))


Initial Accuracy: 0.8100558659217877


In [17]:
# Hyperparameter tuning
param_grid = { 
    'C': [1000, 100, 10, 1, 0.1, 0.01],
    'penalty': ['l1', 'l2'],
}

grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, cv=10)
grid.fit(X_train_scaled, y_train)

# Evaluate best model
best_model = grid.best_estimator_
y_pred_best = best_model.predict(X_test_scaled)
print("Best Accuracy after GridSearchCV:", accuracy_score(y_test, y_pred_best))


Best Accuracy after GridSearchCV: 0.8100558659217877
