In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
#load train,test
train_data=pd.read_csv('/kaggle/input/titanic/train.csv')
test_data=pd.read_csv('/kaggle/input/titanic/test.csv')


In [3]:
#view first few rows and get info of that dataset
train_data.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
# Drop the 'Name' column
train_data=train_data.drop('Name',axis=1)
test_data=test_data.drop('Name',axis=1)

#axis=0 → means "do operation along rows"
#axis=1 → means "do operation along columns"

In [6]:
# Convert 'Sex' column to numeric: male → 0, female → 1
train_data['Sex']=train_data['Sex'].map({'male':0,'female':1})
test_data['Sex']=test_data['Sex'].map({'male':0,'female':1})

In [7]:
#🔹 To see unique values (including nulls):

print(train_data['Sex'].value_counts(dropna=False))
print(test_data['Sex'].value_counts(dropna=False))

Sex
0    577
1    314
Name: count, dtype: int64
Sex
0    266
1    152
Name: count, dtype: int64


In [8]:
#🔹 To check if there are any missing (null) values:
print(train_data['Sex'].isnull().sum())
print(test_data['Sex'].isnull().sum())

0
0


In [9]:
# The 'Ticket' column contains messy alphanumeric strings which don't provide clear, usable information.
# While it may contain some hidden patterns (like prefixes), extracting them would require deep feature engineering.
# Fortunately, we already have the 'Pclass' column, which captures the passenger's class (1st, 2nd, 3rd),
# making 'Ticket' largely redundant for our current model.
# So, we will drop the 'Ticket' column from both datasets.

train_data=train_data.drop('Ticket',axis=1)
test_data=test_data.drop('Ticket',axis=1)

In [10]:
# The 'Cabin' column contains cabin numbers (like 'C85', 'E46', etc.).
# However, only 204 out of 891 entries have non-null values, meaning ~77% of the data is missing.
# While the deck letter (first character) might offer some insights,
# the large number of missing values makes this column unreliable without heavy preprocessing.
# So, for simplicity and better model performance, we'll drop the 'Cabin' column.
train_data=train_data.drop('Cabin',axis=1)
test_data=test_data.drop('Cabin',axis=1)

In [11]:
# Initially, we are not sure if the 'Embarked' column contains discrete or continuous values.
# So, we follow a few steps to understand its nature.

# Step 1: Check the data type of the column
print(train_data['Embarked'].dtype)
# If it's of type 'object' or 'category', it's likely to be discrete (categorical).
# If it's numeric (int or float), we need to investigate further.

# Step 2: Check the unique values in the column
print(train_data['Embarked'].unique())
# A small set of distinct values (like ['S', 'C', 'Q']) usually indicates discrete data.

# Step 3: Count the number of unique values
print(train_data['Embarked'].nunique())
# If the number is small and countable, it's discrete.
# A large number of unique values, especially with decimals, suggests continuous data.

# Step 4: View value counts including missing values
print(train_data['Embarked'].value_counts(dropna=False))
# This shows how many times each value (including NaN) appears, helping confirm whether the data is categorical.


object
['S' 'C' 'Q' nan]
3
Embarked
S      644
C      168
Q       77
NaN      2
Name: count, dtype: int64


In [12]:
#Since Embarked is categorical, you can't directly check correlation with Survived — but here are two helpful tricks:

# a) Survival rate by each port:
print(train_data.groupby('Embarked')['Survived'].mean())


Embarked
C    0.553571
Q    0.389610
S    0.336957
Name: Survived, dtype: float64


In [13]:
#🔍 b) Count of passengers per port:
print(train_data['Embarked'].value_counts())


Embarked
S    644
C    168
Q     77
Name: count, dtype: int64


In [14]:
# Before encoding, we want to understand how the 'Embarked' feature affects the target variable 'Survived'.
# Let's assume:
# - If 100 passengers embarked from 'S', and only 10 survived → survival rate = 10%
# - If 10 passengers from 'C', and 5 survived → survival rate = 50%
# - If 500 passengers from 'Q', and 50 survived → survival rate = 10%
# This shows that passengers from 'C' had a better chance of survival.

# So we calculate the survival rate for each port using groupby and mean:
# This gives us the average of the 'Survived' column for each 'Embarked' category
# (Since Survived is 0 or 1, the mean directly gives the survival rate)

print(train_data.groupby('Embarked')['Survived'].mean())

# The result tells us:
# - Which port had higher survival rates
# - Which categories may be more influential in predicting survival

# This insight can help us decide whether to encode the column using One-Hot Encoding (which preserves category identity)
# or Label Encoding (which may accidentally imply order — which we do not want if 'C' is safer than 'S', but 'S' has a higher label).


Embarked
C    0.553571
Q    0.389610
S    0.336957
Name: Survived, dtype: float64


In [15]:
# We want to analyze how many passengers boarded from each port ('Embarked') 
# and how many of them survived — both in raw counts and percentages.

# First, create a summary DataFrame using groupby
embarked_stats = train_data.groupby('Embarked')['Survived'].agg(['count', 'sum'])

# Rename columns for clarity
embarked_stats.columns = ['Total_Passengers', 'Survived_Passengers']

# Calculate survival percentage
embarked_stats['Survival_Rate (%)'] = (embarked_stats['Survived_Passengers'] / embarked_stats['Total_Passengers']) * 100

# Display the results
print(embarked_stats)


          Total_Passengers  Survived_Passengers  Survival_Rate (%)
Embarked                                                          
C                      168                   93          55.357143
Q                       77                   30          38.961039
S                      644                  217          33.695652


In [16]:
# Based on survival rates, we assign ordinal labels:
# C (highest survival rate) → 2
# Q (medium) → 1
# S (lowest) → 0

embarked_mapping = {'S': 0, 'Q': 1, 'C': 2}

# Apply the mapping to both train and test datasets
train_data['Embarked'] = train_data['Embarked'].map(embarked_mapping)
test_data['Embarked'] = test_data['Embarked'].map(embarked_mapping)


In [17]:
# Fill missing values in 'Embarked' with the mode (most frequent port)
# Since we already label-encoded 'Embarked', mode() will return the most frequent number (e.g., 0 for 'S')

train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)
test_data['Embarked'].fillna(test_data['Embarked'].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['Embarked'].fillna(test_data['Embarked'].mode()[0], inplace=True)


In [18]:
# Check how many missing (null) values exist in each column of the training dataset
print("Missing values in training data:")
print(train_data.isnull().sum())

# Check the same for the test dataset
print("\nMissing values in test data:")
print(test_data.isnull().sum())


Missing values in training data:
PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Embarked         0
dtype: int64

Missing values in test data:
PassengerId     0
Pclass          0
Sex             0
Age            86
SibSp           0
Parch           0
Fare            1
Embarked        0
dtype: int64


In [19]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    int64  
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
 8   Embarked     891 non-null    float64
dtypes: float64(3), int64(6)
memory usage: 62.8 KB


In [20]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Sex          418 non-null    int64  
 3   Age          332 non-null    float64
 4   SibSp        418 non-null    int64  
 5   Parch        418 non-null    int64  
 6   Fare         417 non-null    float64
 7   Embarked     418 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 26.3 KB


In [21]:
# Fill the single missing value in 'Fare' with the median fare in the test dataset
test_data['Fare'].fillna(test_data['Fare'].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_data['Fare'].fillna(test_data['Fare'].median(), inplace=True)


In [22]:
# Fill missing Age in train set using median age grouped by Pclass and Sex
train_data['Age'] = train_data.groupby(['Pclass', 'Sex'])['Age'].transform(
    lambda x: x.fillna(x.median())
)

# Apply the same for test data
test_data['Age'] = test_data.groupby(['Pclass', 'Sex'])['Age'].transform(
    lambda x: x.fillna(x.median())
)


In [23]:
test_data.info()
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Sex          418 non-null    int64  
 3   Age          418 non-null    float64
 4   SibSp        418 non-null    int64  
 5   Parch        418 non-null    int64  
 6   Fare         418 non-null    float64
 7   Embarked     418 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 26.3 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    int64  
 4   Age          891 non-null    float64
 5   SibSp        891 non-n

In [24]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Columns that need scaling (continuous numerical values)
columns_to_scale = ['Age', 'Fare']

# Fit on train, transform both train and test
train_data[columns_to_scale] = scaler.fit_transform(train_data[columns_to_scale])
test_data[columns_to_scale] = scaler.transform(test_data[columns_to_scale])


In [25]:
# Separate features and target variable
X_train = train_data.drop('Survived', axis=1)
y_train = train_data['Survived']


In [26]:
from sklearn.linear_model import LogisticRegression

# Initialize model
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
# Predict survival on the test dataset
predictions = model.predict(test_data)


In [28]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

# Split train data into training and validation sets
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train on split
model.fit(X_train_split, y_train_split)

# Predict on validation set
val_preds = model.predict(X_val)

# Accuracy and confusion matrix
print("Accuracy:", accuracy_score(y_val, val_preds))
print("Confusion Matrix:\n", confusion_matrix(y_val, val_preds))


Accuracy: 0.8100558659217877
Confusion Matrix:
 [[90 15]
 [19 55]]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
# Create submission DataFrame
submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions
})


In [30]:
# Save to CSV (no index!)
submission.to_csv('submission.csv', index=False)


In [31]:
# Load your submission file (just to confirm its format)
my_submission = pd.read_csv('/kaggle/working/submission.csv')

# Load the sample submission provided by Kaggle
sample_submission = pd.read_csv('/kaggle/input/titanic/gender_submission.csv')

# Display first few rows of both
print("🔹 My Submission:")
print(my_submission.head())

print("\n🔸 Sample Submission from Kaggle:")
print(sample_submission.head())


🔹 My Submission:
   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         0
4          896         1

🔸 Sample Submission from Kaggle:
   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         1


In [32]:
# Check how many predicted values match the sample
matching_predictions = (my_submission['Survived'] == sample_submission['Survived']).sum()
total = len(my_submission)

print(f"\n✅ Matching predictions with sample: {matching_predictions} out of {total}")



✅ Matching predictions with sample: 395 out of 418
