# Library Import

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split, KFold,cross_val_score

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, classification_report

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Read Dataset

In [None]:
# Load the passenger table from the project-relative CSV file.
csv_path = 'datasets/titanic.csv'
data = pd.read_csv(csv_path)

# Information

In [None]:
# Show the first five rows (five is the default for `head`).
data.head()

In [None]:
# Show five randomly drawn rows; no seed is set, so the rows differ per run.
data.sample(n=5)

In [None]:
# Print pandas' concise summary of the frame (`DataFrame.info`).
data.info()

In [None]:
# Show pandas' descriptive statistics for the frame (`DataFrame.describe`).
data.describe()

In [None]:
# (number of rows, number of columns) of the loaded table.
data.shape

In [None]:
# Count the missing values in every column.
# Only the per-column totals are displayed, so the full boolean mask is not
# built as a separate, discarded statement.
data.isnull().sum()

# Visualization

In [None]:
# sns.pairplot(data)

In [None]:
# Age against fare; colour follows `embarked`, marker style follows `alive`.
sns.scatterplot(
    x=data['age'],
    y=data['fare'],
    hue=data['embarked'],
    style=data['alive'],
)

In [None]:
# Age distribution per `alive` value, split by `embarked`.
sns.boxplot(
    x=data['alive'],
    y=data['age'],
    hue=data['embarked'],
)

# Preprocessing

## Column Drop

In [None]:
# Look at the first five rows again before choosing columns to drop.
data.head()

In [None]:
# Distinct values present in the `who` column.
data.who.unique()

In [None]:
# Columns removed from the working table, in place.
# NOTE(review): `survived` presumably duplicates the `alive` target in numeric
# form, and `adult` / `town` are assumed to be this CSV's column names —
# confirm against the file, since `drop` raises KeyError on unknown labels.
cols = [
    'survived',
    'pclass',
    'sex',
    'adult',
    'sibsp',
    'parch',
    'deck',
    'town',
]

data.drop(columns=cols, inplace=True)
data.head()

## Null Handle

In [None]:
# Missing-value count per remaining column (`isna` is the same check as `isnull`).
data.isna().sum()

In [None]:
# Fill gaps: `age` with its median, `embarked` with its most frequent value.
# NOTE: both statistics are computed on the whole table, i.e. before the
# train/test split performed later in this notebook.
age_median = data['age'].median()
most_common_port = data['embarked'].mode()[0]

data['age'] = data['age'].fillna(age_median)
data['embarked'] = data['embarked'].fillna(most_common_port)

# Re-check the per-column missing counts after filling.
data.isna().sum()

## Feature Target

In [None]:
# `alive` is the prediction target; every other remaining column is a feature.
target_column = 'alive'
y = data[target_column]
x = data.drop(columns=[target_column])

In [None]:
# First five feature rows.
x.head()

In [None]:
# First five target values.
y.head()

## Target Variable (Label Encoder) Before Split

In [None]:
# Turn the target labels into integer codes; `y` becomes a NumPy array.
# Fitting and transforming are done as two explicit steps on the same data,
# which yields the same codes as a single `fit_transform` call.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)
y.shape

## Train-Test-Split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Print the shape of each split, in the order: xtrain, xtest, ytrain, ytest.
for part in (xtrain, xtest, ytrain, ytest):
    print(part.shape)

In [None]:
# First three training rows, before any encoding.
xtrain.head(n=3)

## Target Variable (Label Encoder) After Split

In [None]:
# le=LabelEncoder()
# le.fit(ytrain)
# ytrain=le.transform(ytrain)
# ytest=le.transform(ytest)

## Ordinal Data (Ordinal Encoder)

In [None]:
# Encode `class` as an ordered number. The category list fixes the order,
# so 'Third' -> 0, 'Second' -> 1, 'First' -> 2.
cols = ['class']
cat = ['Third', 'Second', 'First']

oe = OrdinalEncoder(categories=[cat])
# Fit on the training rows only, then reuse the fitted encoder on the test rows.
xtrain[cols] = oe.fit_transform(xtrain[cols])
xtest[cols] = oe.transform(xtest[cols])

In [None]:
# Check the training rows after ordinal encoding.
xtrain.head()

## Nominal Data (One-Hot Encoder)

In [None]:
# One-hot encode the unordered categorical columns. `drop='first'` omits the
# first category of each column; `sparse_output=False` returns a dense array.
cols = ['embarked', 'who', 'alone']

ohe = OneHotEncoder(drop='first', sparse_output=False)
ohe.fit(xtrain[cols])  # categories are learned from the training rows only

new_col = ohe.get_feature_names_out(cols)
print(new_col)

# Add the indicator columns to each split and remove the source columns.
for frame in (xtrain, xtest):
    frame[new_col] = ohe.transform(frame[cols])
    frame.drop(columns=cols, inplace=True)

In [None]:
# First three training rows after one-hot encoding.
xtrain.head(n=3)

## Scaling

In [None]:
# Standardise the continuous columns. The scaler is fitted on the training
# rows and the same fitted scaler is then applied to the test rows.
cols = ['age', 'fare']
sc = StandardScaler()
xtrain[cols] = sc.fit_transform(xtrain[cols])
xtest[cols] = sc.transform(xtest[cols])

# Model

In [None]:
# Train a logistic regression with default settings on the prepared training
# rows, then score its predictions on the held-out rows.
model = LogisticRegression().fit(xtrain, ytrain)  # `fit` returns the estimator
ypred = model.predict(xtest)

acc = accuracy_score(y_true=ytest, y_pred=ypred)
acc

# Cross validation

In [None]:
# Column groups for the cross-validation preprocessing. This mirrors the
# manual hold-out preprocessing earlier in the notebook, so the two scores
# come from the same feature treatment.
ocols = ['class']          # ordinal
ncols = ['who', 'alone']   # nominal
xcols = ['age', 'fare']    # numeric: imputed and standardised
mcols = ['embarked']       # categorical with missing values

# Explicit order for the ordinal encoder: 'Third' -> 0, 'Second' -> 1, 'First' -> 2.
cat = ['Third', 'Second', 'First']

# `embarked`: fill gaps with the most frequent value, then one-hot encode.
pip = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', sparse_output=False))
])

# Numeric columns: median imputation (the same statistic the manual flow uses
# for `age`) followed by standardisation. Previously `age` was only
# mean-imputed and `fare` reached the model unscaled through the remainder.
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Because this transformer sits inside the model pipeline, every step is
# re-fitted on the training part of each fold.
tfx = ColumnTransformer(transformers=[
    ('oe', OrdinalEncoder(categories=[cat]), ocols),
    ('ohe', OneHotEncoder(drop='first', sparse_output=False), ncols),
    ('num_imputer', num_pipe, xcols),
    ('embarked_pipe', pip, mcols)
], remainder='passthrough')  # any column not listed above is passed through unchanged

In [None]:
# Chain preprocessing and classifier so each fold fits both on its own
# training part.
kpipe = Pipeline(steps=[
    ('transform', tfx),
    ('model', LogisticRegression(max_iter=1000)),
])

# Five-fold cross-validation over the full feature table and encoded target.
score = cross_val_score(kpipe, x, y, cv=5)
print(score)

# NOTE(review): `sc` is also the name bound to the StandardScaler in the
# Scaling cell; after this line it holds the mean score instead. Re-running
# earlier cells that expect the scaler will need the Scaling cell re-run first.
sc = score.mean()

print(f"Average CV score: {sc:.4f}")