In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [3]:
train_path = '/kaggle/input/titanic/train.csv'
train_data = pd.read_csv(train_path)
test_path = '/kaggle/input/titanic/test.csv'
test_data = pd.read_csv(test_path)

In [4]:
train_data['Sex'] = train_data['Sex'].map({'male': 0, 'female': 1})
test_data['Sex'] = test_data['Sex'].map({'male': 0, 'female': 1})

In [5]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


In [6]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,S


In [7]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,0.352413,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,0.47799,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,0.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,0.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,1.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,1.0,80.0,8.0,6.0,512.3292


In [8]:
test_data.describe()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare
count,418.0,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,0.363636,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,0.481622,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,0.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,0.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,1.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,1.0,76.0,8.0,9.0,512.3292


In [9]:
test_data['Fare'] = test_data['Fare'].fillna(14.454200)

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder

# Encode 'Sex' column to numeric
label_encoder = LabelEncoder()
train_data['Sex'] = label_encoder.fit_transform(train_data['Sex'])
test_data['Sex'] = label_encoder.transform(test_data['Sex'])  # Apply the same encoding to test_data

# Separate rows with and without missing Age values in train_data
age_train = train_data[train_data['Age'].notna()]
age_test = train_data[train_data['Age'].isna()]

# Features to predict Age
features_age = ['Pclass', 'Sex', 'Parch', 'Fare', 'SibSp']

# Prepare the training data for age imputation
X_train = age_train[features_age]
y_train = age_train['Age']

# Train a linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict missing Age values in train_data
age_test['Age'] = model.predict(age_test[features_age])

# Combine the datasets back for train_data
train_data.loc[train_data['Age'].isna(), 'Age'] = age_test['Age']

# Confirm there are no missing values in Age in train_data
print(f"Missing Age values in train_data after filling: {train_data['Age'].isna().sum()}")

# Now, apply the same method to test_data (predict missing Age values)
# For test_data, we don't have the target 'Age' column, so we just predict for the missing values
test_data_age_missing = test_data[test_data['Age'].isna()]

# Predict missing Age values in test_data
test_data_age_missing['Age'] = model.predict(test_data_age_missing[features_age])

# Combine the datasets back for test_data
test_data.loc[test_data['Age'].isna(), 'Age'] = test_data_age_missing['Age']

# Confirm there are no missing values in Age in test_data
print(f"Missing Age values in test_data after filling: {test_data['Age'].isna().sum()}")


Missing Age values in train_data after filling: 0
Missing Age values in test_data after filling: 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  age_test['Age'] = model.predict(age_test[features_age])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data_age_missing['Age'] = model.predict(test_data_age_missing[features_age])


In [11]:
missing_stats = test_data.isna().sum()
print(missing_stats[missing_stats > 0])

Cabin    327
dtype: int64


In [12]:
missing_stats = train_data.isna().sum()
print(missing_stats[missing_stats > 0])

Cabin       687
Embarked      2
dtype: int64


In [13]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


In [14]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,S


In [15]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Assuming train_data and test_data are your DataFrames

# Initialize OneHotEncoder, without dropping any category (keep all 3 categories)
encoder = OneHotEncoder(drop=None, sparse=False)

# Apply OneHotEncoder to the 'Embarked' column for both train and test data
train_embarked_encoded = pd.DataFrame(encoder.fit_transform(train_data[['Embarked']]))
test_embarked_encoded = pd.DataFrame(encoder.transform(test_data[['Embarked']]))

# Set proper column names for the encoded columns
train_embarked_encoded.columns = encoder.get_feature_names_out(['Embarked'])
test_embarked_encoded.columns = encoder.get_feature_names_out(['Embarked'])

# Now, drop the old 'Embarked' column and join the encoded columns
train_data = train_data.drop('Embarked', axis=1).join(train_embarked_encoded)
test_data = test_data.drop('Embarked', axis=1).join(test_embarked_encoded)

# Print the updated train and test data
print(train_data.head())
print(test_data.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name  Sex   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris    0  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    1  38.0      1      0   
2                             Heikkinen, Miss. Laina    1  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    1  35.0      1      0   
4                           Allen, Mr. William Henry    0  35.0      0      0   

             Ticket     Fare Cabin  Embarked_C  Embarked_Q  Embarked_S  \
0         A/5 21171   7.2500   NaN         0.0         0.0         1.0   
1          PC 17599  71.2833   C85         1.0         0.0         0.0   
2  STON/O2. 3101282   7.9250   NaN         0.0         0.0         1.0   




In [16]:
features = ['Sex', 'Pclass', 'Age', 'Fare', 'Embarked_Q', 'Embarked_S', 'Embarked_C']

X_train = train_data[features]
y_train = train_data['Survived'] 

X_test = test_data[features]

minmax_scaler = MinMaxScaler()
X_train_scaled = minmax_scaler.fit_transform(X_train)
X_test_scaled = minmax_scaler.transform(X_test)

In [17]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

model = keras.Sequential([
    # the hidden ReLU layers
    layers.Dense(units=512, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    layers.Dense(units=512, activation='relu'),
    layers.Dense(units=512, activation='relu'),
    # the linear output layer 
    layers.Dense(units=1),
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [18]:
model.compile(
    optimizer='adam',
    loss='mae',
)

In [19]:
history = model.fit(
    X_train_scaled, y_train,
    batch_size=2048,
    epochs=100,
)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - loss: 0.4084
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - loss: 0.4473
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - loss: 0.4370
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step - loss: 0.4029
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - loss: 0.3574
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - loss: 0.3177
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - loss: 0.2768
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step - loss: 0.2765
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - loss: 0.2783
Epoch 10/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - loss: 0.2631
Epoch 11/10

In [20]:
predictions = model.predict(X_test_scaled)
predictions_flat = predictions.reshape(-1)  # Ensure 1D
final = (predictions >= 0).astype(int)
final_flat = final.reshape(-1)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


In [21]:
output = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],  
    'Survived': final_flat})

output.to_csv('titanic_submission.csv', index=False)