In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
data = pd.concat([train_data, test_data], sort=False)


In [4]:
print(train_data.columns)
print(test_data.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [5]:
print(train_data.shape)
print(train_data.count())
print(train_data.columns)
train_data.head()

(891, 12)
PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [8]:
def missing_data(data):
    total = data.isnull().sum()
    percent = (data.isnull().sum()/data.isnull().count()*100)
    tt = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    types = []
    for col in data.columns:
        dtype = str(data[col].dtype)
        types.append(dtype)
    tt['Types'] = types
#    return tt
    return(np.transpose(tt))

In [9]:
def most_frequent_values(data):
    total = data.count()
    tt = pd.DataFrame(total)
    tt.columns = ['Total']
    items = []
    vals = []
    for col in data.columns:
        try:
            itm = data[col].value_counts().index[0]
            val = data[col].value_counts().values[0]
            items.append(itm)
            vals.append(val)
        except Exception as ex:
            print(ex)
            items.append(0)
            vals.append(0)
            continue
    tt['Most frequent item'] = items
    tt['Frequence'] = vals
    tt['Percent from total'] = np.round(vals / total * 100, 3)
    return(np.transpose(tt))


In [10]:
def unique_values(data):
    total = data.count()
    tt = pd.DataFrame(total)
    tt.columns = ['Total']
    uniques = []
    for col in data.columns:
        unique = data[col].nunique()
        uniques.append(unique)
    tt['Uniques'] = uniques
    return(np.transpose(tt))

In [11]:
missing_data(data)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Total,0,418,0,0,0,263,0,0,0,1,1014,2
Percent,0.0,31.932773,0.0,0.0,0.0,20.091673,0.0,0.0,0.0,0.076394,77.463713,0.152788
Types,int64,float64,int64,object,object,float64,int64,int64,object,float64,object,object


In [12]:
most_frequent_values(data)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Total,1309.0,891.0,1309.0,1309,1309,1046.0,1309.0,1309.0,1309,1308.0,295,1307
Most frequent item,1.0,0.0,3.0,"Connolly, Miss. Kate",male,24.0,0.0,0.0,CA. 2343,8.05,C23 C25 C27,S
Frequence,1.0,549.0,709.0,2,843,47.0,891.0,1002.0,11,60.0,6,914
Percent from total,0.076,61.616,54.163,0.153,64.4,4.493,68.067,76.547,0.84,4.587,2.034,69.931


In [13]:
unique_values(data)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Total,1309,891,1309,1309,1309,1046,1309,1309,1309,1308,295,1307
Uniques,1309,2,3,1307,2,98,7,8,929,281,186,3


In [14]:
grouped = data.groupby(['Pclass', 'Sex'])['Age']
data['Age'] = grouped.apply(lambda x: x.fillna(x.median())).values


In [15]:
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])


In [16]:
data['Fare'] = data['Fare'].fillna(data.groupby('Pclass')['Fare'].transform('median'))
#data['Fare'] = data.groupby(['Pclass'])['Fare'].apply(lambda x: x.fillna(x.median()))


In [17]:
data['FamilySize'] = data['SibSp'] + data['Parch'] + 1


In [18]:
data['Name'].head()

Unnamed: 0,Name
0,"Braund, Mr. Owen Harris"
1,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,"Heikkinen, Miss. Laina"
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
4,"Allen, Mr. William Henry"


In [19]:
data['Title'] = data['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
rare_titles = data['Title'].value_counts()[data['Title'].value_counts() < 10].index
data['Title'] = data['Title'].replace(rare_titles, 'Rare')
data['Title'] = data['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
data['Title'].value_counts()

Unnamed: 0_level_0,count
Title,Unnamed: 1_level_1
Mr,757
Miss,260
Mrs,197
Master,61
Rare,34


In [20]:
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})
data['Embarked'] = data['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})
data['Title'] = data['Title'].map({'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Rare': 4})

In [21]:
data = data.drop(['Name', 'Ticket', 'Cabin', 'PassengerId'], axis=1)


In [22]:
train = data[:len(train_data)]
test = data[len(train_data):]

In [23]:
X = train.drop('Survived', axis=1)
y = train['Survived']

In [24]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [25]:
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)


In [26]:
y_pred = clf.predict(X_val)


In [27]:
print(accuracy_score(y_val, y_pred))

0.8268156424581006


In [28]:
final_predictions_rdf_1 = clf.predict(test.drop('Survived', axis=1))


In [29]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [4, 6, 8],
    'min_samples_split': [2, 5, 10]
}


In [30]:
grid_clf = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid, cv=3, scoring='accuracy')
grid_clf.fit(X_train, y_train)

In [31]:
best_model = grid_clf.best_estimator_
print("Best Parameters:", grid_clf.best_params_)

Best Parameters: {'max_depth': 6, 'min_samples_split': 5, 'n_estimators': 200}


In [32]:
final_predictions_rdf_1_gscv = best_model.predict(test.drop('Survived', axis=1))


In [33]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_norm = scaler.fit_transform(X_train)
X_val_norm = scaler.transform(X_val)
test_norm = scaler.transform(test.drop('Survived', axis=1))


In [34]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.utils import to_categorical
import tensorflow.keras.backend as K
K.clear_session()

In [35]:
nin = X_train.shape[1]
nh = 128
nout = 1
model = Sequential()
model.add(Input(shape=(nin,)))
model.add(Dense(units=nh, activation='relu', name='hidden'))
model.add(Dense(units=nout, activation='sigmoid', name='output'))

In [36]:
model.summary()

In [37]:
from tensorflow.keras import optimizers

opt = optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt,
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [38]:
hist = model.fit(X_train_norm, y_train, epochs=100, batch_size=24, validation_data=(X_val_norm,y_val))

Epoch 1/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.7298 - loss: 0.5283 - val_accuracy: 0.7821 - val_loss: 0.4526
Epoch 2/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8124 - loss: 0.4241 - val_accuracy: 0.7877 - val_loss: 0.4512
Epoch 3/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8293 - loss: 0.3996 - val_accuracy: 0.7821 - val_loss: 0.4385
Epoch 4/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8213 - loss: 0.3965 - val_accuracy: 0.8156 - val_loss: 0.4579
Epoch 5/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8241 - loss: 0.4372 - val_accuracy: 0.7877 - val_loss: 0.4766
Epoch 6/100
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8397 - loss: 0.3883 - val_accuracy: 0.8156 - val_loss: 0.4547
Epoch 7/100
[1m30/30[0m [32m━━

In [39]:
val_loss, val_accuracy = model.evaluate(X_val_norm, y_val, verbose=0)
print(f"Neural Network Validation Accuracy: {val_accuracy:.4f}")


Neural Network Validation Accuracy: 0.7877


In [40]:
final_predictions_nn = model.predict(test_norm)
final_predictions_nn = (final_predictions_nn > 0.5).astype(int).flatten()

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


In [41]:
final_predictions_nn

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

# Now let's run Multiple models

In [42]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)


In [43]:
model_grids = {
    'Logistic Regression': {
        'model': LogisticRegression(max_iter=1000),
        'params': {'C': [0.1, 1, 10], 'penalty': ['l2']}
    },
    'Decision Tree': {
        'model': DecisionTreeClassifier(),
        'params': {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10]}
    },
    'Random Forest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {'n_estimators': [100, 200, 300], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5]}
    },
    'Gradient Boosting': {
        'model': GradientBoostingClassifier(random_state=42),
        'params': {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1], 'max_depth': [3, 5]}
    },
    'AdaBoost': {
        'model': AdaBoostClassifier(random_state=42),
        'params': {'n_estimators': [50, 100], 'learning_rate': [0.01, 0.1, 1]}
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']}
    },
    'SVM': {
        'model': SVC(probability=True),
        'params': {'C': [0.1,10], 'kernel': ['rbf']}
    }
}

model_results = {}

In [44]:
for name, cfg in model_grids.items():
    print(f"Tuning {name}...")
    grid_search = GridSearchCV(cfg['model'], cfg['params'], cv=kfold, scoring='accuracy', n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    scores = cross_val_score(best_model, X_train, y_train, cv=kfold, scoring='accuracy')
    n_folds = kfold.get_n_splits()
    model_results[name] = {
        'Best Params': grid_search.best_params_,
        'Mean Accuracy': scores.mean(),
        'Std Dev': scores.std(),
        'SE': scores.std(ddof=1) / np.sqrt(n_folds - 1)

    }

Tuning Logistic Regression...
Fitting 10 folds for each of 3 candidates, totalling 30 fits
Tuning Decision Tree...
Fitting 10 folds for each of 12 candidates, totalling 120 fits
Tuning Random Forest...
Fitting 10 folds for each of 18 candidates, totalling 180 fits
Tuning Gradient Boosting...
Fitting 10 folds for each of 8 candidates, totalling 80 fits
Tuning AdaBoost...
Fitting 10 folds for each of 6 candidates, totalling 60 fits
Tuning KNN...
Fitting 10 folds for each of 6 candidates, totalling 60 fits
Tuning SVM...
Fitting 10 folds for each of 2 candidates, totalling 20 fits


In [45]:
model_results['Neural Network'] = {
    'Best Params': 'N/A',
    'Mean Accuracy': val_accuracy,
    'Std Dev': 'N/A'
}

In [46]:
# Display Results
results_df = pd.DataFrame([{**{'Model': key}, **value} for key, value in model_results.items()])
#results_df = results_df.sort_values(by='Mean Accuracy', ascending=False)
#print("\nModel Performance:")
results_df

Unnamed: 0,Model,Best Params,Mean Accuracy,Std Dev,SE
0,Logistic Regression,"{'C': 0.1, 'penalty': 'l2'}",0.804773,0.047217,0.01659
1,Decision Tree,"{'max_depth': 10, 'min_samples_split': 10}",0.796401,0.044371,0.01559
2,Random Forest,"{'max_depth': 10, 'min_samples_split': 5, 'n_e...",0.834311,0.038389,0.013488
3,Gradient Boosting,"{'learning_rate': 0.01, 'max_depth': 5, 'n_est...",0.835603,0.035357,0.012423
4,AdaBoost,"{'learning_rate': 1, 'n_estimators': 50}",0.81741,0.041584,0.014611
5,KNN,"{'n_neighbors': 5, 'weights': 'uniform'}",0.689828,0.055822,0.019614
6,SVM,"{'C': 10, 'kernel': 'rbf'}",0.734644,0.051157,0.017975
7,Neural Network,,0.787709,,


In [47]:
# One SE Rule
mean_acc = results_df['Mean Accuracy']

max_mean_acc_idx = np.argmax(mean_acc)

max_mean_acc_se = results_df['SE'][max_mean_acc_idx]

threshold = mean_acc[max_mean_acc_idx] - max_mean_acc_se

best_acc_indices = np.where(results_df['Mean Accuracy'] >= threshold)[0]

best_index = np.min(best_acc_indices)

best_mean_accuracy = results_df['Mean Accuracy'][best_index]

print("Best Mean Accuracy:", best_mean_accuracy)

best_model_name = results_df['Model'][best_index]
best_model_name

Best Mean Accuracy: 0.8343114241001566


'Random Forest'

In [52]:
best_model = model_grids[best_model_name]['model']
best_model.set_params(**model_results[best_model_name]['Best Params'])
best_model.fit(X_train, y_train)
final_predictions_best_model = best_model.predict(test.drop('Survived', axis=1)).astype(int)
len(final_predictions_best_model)

418

In [55]:
final_predictions_best_model


array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [57]:
# Prepare submission
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': final_predictions_best_model
})
submission.to_csv('titanic_submission.csv', index=False)
print("Submission file created.")


Submission file created.
