In [2]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    RandomForestRegressor,
    VotingClassifier,
)
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier



In [3]:
import re

In [4]:
# Load the Titanic training split from the Kaggle input directory.
df=pd.read_csv('/kaggle/input/titanic/train.csv')

In [5]:
# 'Survived' is the target variable to predict; show how many rows carry
# each of its unique values.
df['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [6]:
# Display the training frame (pandas elides the middle rows in the rendered output).
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [7]:
# Number of rows for each unique value of the 'Pclass' column.
df['Pclass'].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [8]:
# Dimensions of the dataset as a (rows, columns) tuple.
df.shape

(891, 12)

In [9]:
# Data type of every column; the object-typed columns hold strings.
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [10]:
# Count of missing values per column.
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [11]:
# Combine the two relative counts into a single feature:
# FamilySize = 'SibSp' (siblings/spouses) + 'Parch' (parents/children).
# The two source columns are dropped once they have been folded together.
df = (
    df
    .assign(FamilySize=df['SibSp'] + df['Parch'])
    .drop(columns=['SibSp', 'Parch'])
)

In [12]:
# Remove the 'Name', 'Ticket' and 'PassengerId' columns from the training frame.
df = df.drop(['Name', 'Ticket', 'PassengerId'], axis=1)


In [13]:
# Number of rows for each distinct FamilySize value.
df['FamilySize'].value_counts()

0     537
1     161
2     102
3      29
5      22
4      15
6      12
10      7
7       6
Name: FamilySize, dtype: int64

In [14]:
# Integer code for each deck letter; "U" (code 8) stands for an unknown cabin.
deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}

# Derive an integer 'Deck' feature from 'Cabin' in one vectorised chain:
#   1. a missing cabin becomes "U0", i.e. deck letter "U";
#   2. the first run of letters in the cabin string is taken as the deck letter;
#   3. the letter is translated through `deck`;
#   4. anything without a code (a letter that is not a key of `deck`, or a
#      cabin string containing no letters at all) becomes 0;
#   5. the result is cast to int.
df['Deck'] = (
    df['Cabin']
    .fillna("U0")
    .str.extract(r"([a-zA-Z]+)", expand=False)
    .map(deck)
    .fillna(0)
    .astype(int)
)

# 'Cabin' is no longer needed once 'Deck' has been derived from it.
df = df.drop(columns=['Cabin'])

In [15]:
# Columns fed to the preprocessing step and, through it, to the model.
features = [
    'Age',
    'Fare',
    'Sex',
    'Deck',
    'Pclass',
    'FamilySize',
    'Embarked',
]

In [16]:
# Numerical columns and their preprocessing pipeline:
# fill missing values with the column mean, then standardise.
num_features = ['Age', 'Fare']
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])


In [17]:
# Categorical columns and their preprocessing pipeline:
# fill missing values with the most frequent value, then one-hot encode.
# handle_unknown='ignore' makes the encoder tolerate categories it did not see when fitted.
cat_features = ['Sex', 'Deck', 'Pclass', 'FamilySize', 'Embarked']
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])


In [18]:
# Route each group of columns through its own pipeline:
# 'num' handles num_features, 'cat' handles cat_features.
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features),
])


In [19]:
# Target: the 'Survived' column, taken unchanged from the frame.
y = df['Survived']

# Feature matrix: fit the preprocessor on the selected columns and transform them in one step.
# NOTE(review): the preprocessor is fitted on every row here, i.e. before the
# train/test split performed in a later cell.
X = preprocessor.fit_transform(df[features])


In [40]:
# Pull the fitted one-hot encoder out of the 'cat' pipeline inside the preprocessor.
encoder = preprocessor.named_transformers_['cat'].named_steps['encoder']

# Names of the one-hot output columns generated for the categorical features.
encoded_feature_names1 = list(encoder.get_feature_names_out(cat_features))

# Full column list: numerical names first, then the one-hot names — the same
# order in which the transformers are listed in the preprocessor.
column_names = [*num_features, *encoded_feature_names1]


In [41]:
# Show the final list of feature column names.
column_names

['Age',
 'Fare',
 'Sex_female',
 'Sex_male',
 'Deck_0',
 'Deck_1',
 'Deck_2',
 'Deck_3',
 'Deck_4',
 'Deck_5',
 'Deck_6',
 'Deck_7',
 'Deck_8',
 'Pclass_1',
 'Pclass_2',
 'Pclass_3',
 'FamilySize_0',
 'FamilySize_1',
 'FamilySize_2',
 'FamilySize_3',
 'FamilySize_4',
 'FamilySize_5',
 'FamilySize_6',
 'FamilySize_7',
 'FamilySize_10',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [22]:
# Densify the sparse matrix returned by the preprocessor and label its columns
# so that X becomes a regular DataFrame.
X = pd.DataFrame(data=X.toarray(), columns=column_names)


In [23]:
# Random oversampler intended to address the class imbalance in the target;
# the fixed random_state makes the resampling repeatable.
oversampler = RandomOverSampler(random_state=42)

# Resample the feature matrix 'X' and target 'y' together.
# NOTE(review): X_resampled / y_resampled are not referenced by any later cell
# in this notebook — the split below works on the original X / y.
X_resampled, y_resampled = oversampler.fit_resample(X, y)

In [24]:
# Hold out 30% of the rows as a test set and train on the remaining 70%.
# NOTE(review): the original statement was cut off after "random"; it is
# completed here as random_state=42, the seed used by every other seeded call
# in this notebook — confirm this was the intended value.
# NOTE(review): the split uses the original X / y, not X_resampled / y_resampled.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


In [25]:
# Base learners for the ensemble. Every estimator that takes a seed is given
# random_state=42 so that repeated runs build the same models.
random_forest_clf = RandomForestClassifier(random_state=42)
gradient_boosting_clf = GradientBoostingClassifier(random_state=42)
ada_boost_clf = AdaBoostClassifier(random_state=42)
# probability=True lets the SVC produce class probabilities, which soft voting needs.
svc_clf = SVC(probability=True, random_state=42)
decision_tree_clf = DecisionTreeClassifier(random_state=42)
bagged_tree_clf = BaggingClassifier(estimator=DecisionTreeClassifier(), random_state=42)
xgb_clf = XGBClassifier(random_state=42)

# Soft-voting ensemble: the final prediction is based on the base learners'
# predicted probabilities rather than on their hard labels.
voting_clf = VotingClassifier(
    [
        ('rf', random_forest_clf),
        ('gb', gradient_boosting_clf),
        ('ab', ada_boost_clf),
        ('svc', svc_clf),
        ('dt', decision_tree_clf),
        ('bt', bagged_tree_clf),
        ('xgb', xgb_clf),
    ],
    voting='soft',
)


In [26]:
# Train the voting ensemble on the training split.
voting_clf.fit(x_train, y_train)

# Predict labels for the held-out test split.
y_pred = voting_clf.predict(x_test)

# Per-class precision, recall and F1 on the held-out split.
# classification_report is provided by sklearn.metrics and is imported with the
# rest of the notebook's dependencies in the first cell.
report = classification_report(y_test, y_pred)

# Print the report as plain text.
print(report)


              precision    recall  f1-score   support

           0       0.80      0.87      0.84       157
           1       0.79      0.69      0.74       111

    accuracy                           0.80       268
   macro avg       0.80      0.78      0.79       268
weighted avg       0.80      0.80      0.80       268



In [27]:
# Share of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Show the score rounded to two decimal places.
print(f'Accuracy: {accuracy:.2f}')


Accuracy: 0.80


In [28]:
# Load the Titanic test split (the passengers to predict) from the Kaggle input directory.
fd = pd.read_csv('/kaggle/input/titanic/test.csv')


In [29]:
# Display the raw test frame; unlike the training frame it has no 'Survived' column.
fd

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [None]:
# The next cells apply to the test set the same feature engineering that was applied to the training set above.

In [30]:
# Working copy of the test frame without 'Name', 'Ticket' and 'PassengerId';
# `fd` itself is left intact.
df1 = fd.drop(['Name', 'Ticket', 'PassengerId'], axis=1)


In [31]:
# FamilySize = 'SibSp' + 'Parch'; the two source columns are dropped afterwards.
df1 = (
    df1
    .assign(FamilySize=df1['SibSp'] + df1['Parch'])
    .drop(columns=['SibSp', 'Parch'])
)

In [32]:
# Integer code for each deck letter; "U" (code 8) stands for an unknown cabin.
deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}

# Derive an integer 'Deck' feature from 'Cabin' in one vectorised chain:
#   1. a missing cabin becomes "U0", i.e. deck letter "U";
#   2. the first run of letters in the cabin string is taken as the deck letter;
#   3. the letter is translated through `deck`;
#   4. anything without a code (a letter that is not a key of `deck`, or a
#      cabin string containing no letters at all) becomes 0;
#   5. the result is cast to int.
df1['Deck'] = (
    df1['Cabin']
    .fillna("U0")
    .str.extract(r"([a-zA-Z]+)", expand=False)
    .map(deck)
    .fillna(0)
    .astype(int)
)

# 'Cabin' is no longer needed once 'Deck' has been derived from it.
df1 = df1.drop(columns=['Cabin'])

In [33]:
# Display the engineered test frame.
df1

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,FamilySize,Deck
0,3,male,34.5,7.8292,Q,0,8
1,3,female,47.0,7.0000,S,1,8
2,2,male,62.0,9.6875,Q,0,8
3,3,male,27.0,8.6625,S,0,8
4,3,female,22.0,12.2875,S,2,8
...,...,...,...,...,...,...,...
413,3,male,,8.0500,S,0,8
414,1,female,39.0,108.9000,C,0,3
415,3,male,38.5,7.2500,S,0,8
416,3,male,,8.0500,S,0,8


In [34]:
# Transform the test features with the already-fitted preprocessor
# (transform only — nothing is re-fitted on the test data).
X1 = preprocessor.transform(df1[features])

In [35]:
# Predict survival for the test passengers with the trained voting classifier.
# The classifier was fitted on a DataFrame whose columns are `column_names`, so
# the transformed test matrix is densified and given the same column labels
# before predicting; this keeps the feature names consistent between fit and
# predict instead of passing a bare array.
testans = voting_clf.predict(pd.DataFrame(X1.toarray(), columns=column_names))



In [36]:
# Submission frame: the 'PassengerId' column from the raw test data paired
# with the predicted 'Survived' label for each row.
res = fd[['PassengerId']].assign(Survived=testans)


In [37]:
# Sanity check: (rows, columns) of the submission frame.
res.shape

(418, 2)

In [38]:
# Write the predictions to 'submission.csv'; index=False leaves the row index out of the file.
res.to_csv('submission.csv', index=False)

In [39]:
# Display the submission frame.
res

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
