In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print them one per line in the order os.walk yields them.
input_files = (
    os.path.join(folder, entry)
    for folder, _, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# Paths of the competition CSV files inside the Kaggle input directory.
train_path, test_path = (
    '/kaggle/input/titanic/train.csv',
    '/kaggle/input/titanic/test.csv',
)
# Read each CSV into its own DataFrame (train first, then test).
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [None]:
# Replace the textual 'Sex' labels with integer codes, using the same
# mapping for the training and the test frame.
sex_codes = {'male': 0, 'female': 1}
for frame in (train_data, test_data):
    frame['Sex'] = frame['Sex'].map(sex_codes)

In [None]:
# Preview the first five rows of the training frame.
train_data.head(n=5)

In [None]:
# Preview the first five rows of the test frame.
test_data.head(n=5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric training columns.
train_data.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric test columns.
test_data.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
#test_data['Fare'] = test_data['Fare'].fillna(14.454200)

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder

# Encode 'Sex' column to numeric.
# NOTE: an earlier cell already maps 'Sex' to 0/1, so on a normal top-to-bottom run this
# re-encoding should leave the values unchanged; it is kept so `label_encoder` stays defined.
label_encoder = LabelEncoder()
train_data['Sex'] = label_encoder.fit_transform(train_data['Sex'])
test_data['Sex'] = label_encoder.transform(test_data['Sex'])  # Apply the same encoding to test_data

# Features to predict Age
features_age = ['Pclass', 'Sex', 'Parch', 'Fare', 'SibSp']


def fill_missing_age(frame, age_model, predictors, fallback_values):
    """Fill the NaN entries of ``frame['Age']`` in place with model predictions.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding an 'Age' column and every column named in ``predictors``.
        It is modified in place.
    age_model : fitted estimator
        Object exposing ``predict``; it is called on the predictor columns of
        the rows whose 'Age' is missing.
    predictors : list of str
        Column names passed to ``age_model.predict``.
    fallback_values : pandas.Series
        Per-column values used to fill NaNs in the predictor columns before
        predicting (the estimator cannot handle NaN inputs). Only the matrix
        handed to the model is filled; ``frame`` keeps its NaN predictors.

    Returns
    -------
    int
        Number of rows whose 'Age' was filled (0 if nothing was missing).
    """
    missing = frame['Age'].isna()
    if not missing.any():
        # Nothing to fill (e.g. the cell is being re-run); predicting on an
        # empty frame would raise, so return early.
        return 0
    design = frame.loc[missing, predictors].fillna(fallback_values)
    # Write through .loc on the original frame rather than on a filtered copy,
    # which avoids SettingWithCopyWarning and a second alignment step.
    frame.loc[missing, 'Age'] = age_model.predict(design)
    return int(missing.sum())


# Rows of train_data with a known Age form the training set of the imputation model.
age_known = train_data['Age'].notna()

# Medians of the predictors over those rows; used to patch NaN predictors
# (test_data has a missing 'Fare') so that fit/predict never see NaN.
predictor_medians = train_data.loc[age_known, features_age].median()

# Train a linear regression model (Age ~ features_age).
# Age-specific names are used so the later cell's X_train is not shadowed by this one.
age_model = LinearRegression()
age_model.fit(
    train_data.loc[age_known, features_age].fillna(predictor_medians),
    train_data.loc[age_known, 'Age'],
)

# Predict missing Age values in train_data and confirm none are left
fill_missing_age(train_data, age_model, features_age, predictor_medians)
print(f"Missing Age values in train_data after filling: {train_data['Age'].isna().sum()}")

# Apply the same fitted model to test_data (no refit on the test rows) and confirm none are left
fill_missing_age(test_data, age_model, features_age, predictor_medians)
print(f"Missing Age values in test_data after filling: {test_data['Age'].isna().sum()}")


In [None]:
# Number of NaN cells per test_data column; only columns with at least one are printed.
missing_stats = test_data.isnull().sum(axis=0)
print(missing_stats.loc[missing_stats.gt(0)])

In [None]:
# Number of NaN cells per train_data column; only columns with at least one are printed.
missing_stats = train_data.isnull().sum(axis=0)
print(missing_stats.loc[missing_stats.gt(0)])

In [None]:
# from sklearn.preprocessing import OneHotEncoder
# import pandas as pd

# # Assuming train_data and test_data are your DataFrames

# # Initialize OneHotEncoder, without dropping any category (keep all 3 categories)
# encoder = OneHotEncoder(drop=None, sparse=False)

# # Apply OneHotEncoder to the 'Embarked' column for both train and test data
# train_embarked_encoded = pd.DataFrame(encoder.fit_transform(train_data[['Embarked']]))
# test_embarked_encoded = pd.DataFrame(encoder.transform(test_data[['Embarked']]))

# # Set proper column names for the encoded columns
# train_embarked_encoded.columns = encoder.get_feature_names_out(['Embarked'])
# test_embarked_encoded.columns = encoder.get_feature_names_out(['Embarked'])

# # Now, drop the old 'Embarked' column and join the encoded columns
# train_data = train_data.drop('Embarked', axis=1).join(train_embarked_encoded)
# test_data = test_data.drop('Embarked', axis=1).join(test_embarked_encoded)

# # Print the updated train and test data
# print(train_data.head())
# print(test_data.head())

In [None]:
# Columns handed to the survival model.
# Open question: it is unclear how much 'Embarked' contributes.
features = [
    'Sex',
    'Pclass',
    'Age',
    'Fare',
    'Parch',
    'Embarked',
]

In [None]:
# Target: the 'Survived' column of the training frame.
y = train_data['Survived']

# Design matrices: the selected feature columns of each frame.
X_train, X_test = train_data[features], test_data[features]

In [None]:
# Define the pipeline and preprocessing steps
numeric_columns = ['Age', 'Fare']       # NaNs replaced by the column mean
categorical_columns = ['Embarked']      # NaNs replaced by the mode, then one-hot encoded
# Every other selected feature is forwarded unchanged. ColumnTransformer drops any column
# it is not told about (remainder='drop' is the default), so without an explicit
# passthrough entry these columns would never reach the model.
passthrough_columns = [
    column for column in X_train.columns
    if column not in numeric_columns + categorical_columns
]

numerical_transformer = SimpleImputer(strategy='mean')  # Replace NaN with mean for numerical columns

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Replace NaN with most frequent for categorical columns
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encoding for categorical variables
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('keep', 'passthrough', passthrough_columns),
    ])

# NOTE(review): the predictions are thresholded into 0/1 below, so a classifier would be
# the conventional estimator; the regressor is kept because it is the only forest estimator
# imported in this notebook.
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=1))  # Random forest model, seeded for reproducibility
])

# Apply the pipeline for fitting and predictions
my_pipeline.fit(X_train, y)  # Fit the pipeline on training data
predictions = my_pipeline.predict(X_test)  # Predict on test data

# Convert predictions to binary format (e.g., 0 or 1 for Survived)
rf_val_final = (predictions > 0.5).astype(int)  # Adjust threshold if needed

# Save the predictions to a CSV file
output = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],  # Use PassengerId from the test dataset
    'Survived': rf_val_final  # Add predictions
})

output.to_csv('titanic_submission.csv', index=False)  # Save to CSV for submission
