In [1]:
import numpy as np
import pandas as pd

Loading the datasets

In [12]:
# Read the training and test CSV files, then display their (rows, columns) shapes.
train, test = (pd.read_csv(csv_name) for csv_name in ('train[1].csv', 'test[1].csv'))
train.shape, test.shape

((891, 12), (418, 11))

In [22]:
# Set aside the training labels and the test-set passenger ids; the ids are
# needed later to build the submission file.
train_survived = train.loc[:, 'Survived']
test_passenger_ids = test.loc[:, 'PassengerId']


In [19]:
# The test frame has no target column; fill in -1 as a placeholder so that
# train and test share the same columns when they are concatenated.
test.loc[:, 'Survived'] = -1


In [23]:
# Stack the training rows on top of the test rows under a fresh 0..n-1 index,
# keeping the columns in their original (unsorted) order.
combined = pd.concat(objs=[train, test], ignore_index=True, sort=False)
combined

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,-1,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1305,1306,-1,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1306,1307,-1,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,1308,-1,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [24]:
# Number of missing values in each column of the combined frame.
combined.isna().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,263
SibSp,0
Parch,0
Ticket,0
Fare,1


In [25]:
# Drop columns that are not used as model features: Cabin is mostly missing
# (the original note cites over 1014 missing values), and Name / Ticket were
# judged unlikely to help prediction.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: re-running it no longer raises KeyError once
# the columns have already been removed.
combined = combined.drop(columns=['Name', 'Cabin', 'Ticket'], errors='ignore')


In [26]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [27]:
# Columns handled as continuous values.
numeric_features = [
    'Age',
    'Fare',
]

# Columns handled as discrete categories.
categorical_features = [
    'Sex',
    'Embarked',
    'Pclass',
    'SibSp',
    'Parch',
]

In [28]:
# Numeric columns: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' lets transform accept categories that were
# not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its own sub-pipeline.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [30]:
# Feature frame: everything in the combined data except the target column.
X_combined = combined.drop(columns='Survived')

# The combined frame was built as train followed by test, so the first
# len(train) rows are the training features and the rest are the test features.
X_train = X_combined.iloc[:len(train)]
X_test = X_combined.iloc[len(train):]

# Training labels captured before the frames were concatenated.
y_train = train_survived


In [32]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# End-to-end estimator: the preprocessing step feeds a random forest whose
# random_state is fixed at 42.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# 5-fold cross-validated accuracy on the training rows. The whole pipeline is
# passed as the estimator, so preprocessing is fitted within each fold.
scores = cross_val_score(
    estimator=model_pipeline,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=5,
)

print(f"Cross-Validation Accuracy Scores: {scores}")
print(f"Mean Accuracy: {scores.mean():.4f}")


Cross-Validation Accuracy Scores: [0.74301676 0.79775281 0.84831461 0.78089888 0.8258427 ]
Mean Accuracy: 0.7992


In [37]:
# Fit the full pipeline (preprocessing + classifier) on all training rows.
model_pipeline.fit(X=X_train, y=y_train)

In [38]:
# Predict the target for the test rows with the fitted pipeline.
final_predictions = model_pipeline.predict(X=X_test)

In [41]:
# Two-column submission table: each test passenger id next to its predicted label.
submission = pd.DataFrame(dict(PassengerId=test_passenger_ids, Survived=final_predictions))

# Write the table to disk without the DataFrame index.
submission.to_csv('submission.csv', index=False)

In [42]:
# Offer the submission file as a browser download when running in Google Colab.
# google.colab only exists inside a Colab runtime, so the import is guarded:
# on any other kernel the cell reports where the file is instead of failing.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; 'submission.csv' was written to the working directory.")
else:
    files.download('submission.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>