In [None]:
# Load the libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Read the training and test CSV files into DataFrames
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/train.csv", "/content/test.csv")
)

In [None]:
# Build 'FamilySize' as SibSp + Parch + 1 for both datasets
train_data['FamilySize'] = train_data['SibSp'].add(train_data['Parch']).add(1)
test_data['FamilySize'] = test_data['SibSp'].add(test_data['Parch']).add(1)

In [None]:
# Extract the title (e.g. "Mr", "Mrs") from the name: the first run of letters
# that follows a space and is terminated by a period.
# The pattern is a raw string so that `\.` is not parsed as an (invalid) string
# escape sequence, which newer Python versions warn about.
train_data['Title'] = train_data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
test_data['Title'] = test_data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

In [None]:
# Extract last names: everything before the first comma of 'Name'.
# The vectorised .str accessor replaces the row-wise apply/lambda and leaves a
# missing name as NaN instead of raising AttributeError.
train_data['LastName'] = train_data['Name'].str.split(',', n=1).str[0]
test_data['LastName'] = test_data['Name'].str.split(',', n=1).str[0]

In [None]:
# Family group key: "<LastName>_<FamilySize>"
train_data['FamilyGroup'] = train_data['LastName'].str.cat(
    train_data['FamilySize'].astype(str), sep="_"
)
test_data['FamilyGroup'] = test_data['LastName'].str.cat(
    test_data['FamilySize'].astype(str), sep="_"
)

In [None]:
# Impute missing cabins with the most common known cabin of each family group.
# The mapping is learned from training rows that have a cabin and is reused
# unchanged for the test data.
# NOTE(review): the mapping is built before the train/validation split below, so
# validation rows contribute to it — consider fitting it on the training part only.
family_cabin_mapping = (
    train_data.dropna(subset=['Cabin'])
    .groupby('FamilyGroup')['Cabin']
    .agg(lambda cabins: cabins.value_counts().index[0])
)
# fillna only touches rows whose Cabin is missing, so cabins that were actually
# recorded are never overwritten by the family-level mode; groups without a
# known cabin map to NaN and therefore stay missing.
train_data['Cabin'] = train_data['Cabin'].fillna(train_data['FamilyGroup'].map(family_cabin_mapping))
test_data['Cabin'] = test_data['Cabin'].fillna(test_data['FamilyGroup'].map(family_cabin_mapping))

In [None]:
# Choose the model inputs (numeric + categorical columns) and the label column
numerical_features = ['Pclass']
categorical_features = ['Embarked', 'Title', 'FamilyGroup', 'Cabin']
y = train_data['Survived']
X = train_data.loc[:, numerical_features + categorical_features]
X_test = test_data.loc[:, numerical_features + categorical_features]

In [None]:
# Numeric columns: fill missing values with the column mean, then standardise
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category, then
# one-hot encode; categories not seen during fitting are ignored instead of raising
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own transformer
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

In [None]:
# Random forest classifier: 200 trees, fixed seed so repeated runs give the same model
model = RandomForestClassifier(
    random_state=0,
    n_estimators=200,
)


In [None]:
# Chain the preprocessing step and the classifier into a single estimator
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

In [None]:
# Hold out 20% of the labelled rows for validation (fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=0, test_size=0.2
)

# Fit the whole pipeline (preprocessing + model) on the training part, then
# predict the held-out validation rows; fit() returns the fitted pipeline itself
preds = clf.fit(X_train, y_train).predict(X_val)

In [None]:
# Share of validation rows whose predicted label matches the true label
score = accuracy_score(y_true=y_val, y_pred=preds)
print('Accuracy:', score)

Accuracy: 0.8156424581005587


In [None]:
# Preview the first five test rows, including the engineered columns
test_data.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,Title,LastName,FamilyGroup
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,1,Mr,Kelly,Kelly_1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,2,Mrs,Wilkes,Wilkes_2
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,1,Mr,Myles,Myles_1
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,1,Mr,Wirz,Wirz_1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,3,Mrs,Hirvonen,Hirvonen_3


In [None]:
# Run the fitted pipeline on the test features (preprocessing happens inside clf)
test_preds = clf.predict(X=X_test)

In [None]:
# Pair every test PassengerId with its predicted 'Survived' label
submission = test_data[['PassengerId']].assign(Survived=test_preds)

# Write the submission CSV without the DataFrame index column
submission.to_csv('titanic_submission.csv', index=False)

print("Your submission was successfully saved!")

Your submission was successfully saved!


In [None]:
# 888