In [244]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Read the training and test splits from CSV files in the working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)

# Step 1: Data Preprocessing

# Median-impute the numeric columns 'Age' and 'Fare'.
# A single imputer object is re-fitted for each column on the training data and
# then applied to the same column of the test data, so the test set is always
# filled with the training median. After the loop the imputer holds the 'Fare' fit.
imputer_age_fare = SimpleImputer(strategy='median')
for numeric_column in ('Age', 'Fare'):
    train_data[numeric_column] = imputer_age_fare.fit_transform(train_data[[numeric_column]])
    test_data[numeric_column] = imputer_age_fare.transform(test_data[[numeric_column]])

# Encode categorical variable 'Sex' (encoder fitted on train, reused on test)
label_encoder_sex = LabelEncoder()
train_data['Sex'] = label_encoder_sex.fit_transform(train_data['Sex'])
test_data['Sex'] = label_encoder_sex.transform(test_data['Sex'])

# Handle 'Embarked' missing values using most frequent strategy.
# This has to happen BEFORE label encoding: if the column is encoded first,
# a missing value is either turned into its own integer category (leaving the
# imputer nothing to fill) or makes the encoder fail on mixed str/NaN input,
# depending on the scikit-learn version.
imputer_embarked = SimpleImputer(strategy='most_frequent')
# ravel(): the imputer returns an (n, 1) array; flatten it to a plain column.
train_data['Embarked'] = imputer_embarked.fit_transform(train_data[['Embarked']]).ravel()
test_data['Embarked'] = imputer_embarked.transform(test_data[['Embarked']]).ravel()

# Encode the now-complete 'Embarked' column (encoder fitted on train, reused on test)
label_encoder_embarked = LabelEncoder()
train_data['Embarked'] = label_encoder_embarked.fit_transform(train_data['Embarked'])
test_data['Embarked'] = label_encoder_embarked.transform(test_data['Embarked'])

# Replace missing 'Cabin' entries with the placeholder 'U' in both splits
# (the raw column is not used as a model feature).
for cabin_frame in (train_data, test_data):
    cabin_frame['Cabin'] = cabin_frame['Cabin'].fillna('U')

# Step 2: Feature Engineering

# 1. Family Size: siblings/spouses ('SibSp') plus parents/children ('Parch'),
#    plus 1 so the passenger is counted too.
for family_frame in (train_data, test_data):
    family_frame['FamilySize'] = family_frame['SibSp'] + family_frame['Parch'] + 1

# 2. Extract Title from Name
def _extract_title(name):
    """Return the text between the first comma and the following period of *name*.

    For a name shaped like "Surname, Title. Given names" this yields "Title".
    Raises IndexError if *name* contains no comma.
    """
    return name.split(',')[1].split('.')[0].strip()


train_data['Title'] = train_data['Name'].apply(_extract_title)
test_data['Title'] = test_data['Name'].apply(_extract_title)

# Group low-frequency titles together.
# 'the Countess' is listed in addition to 'Countess' because a name of the form
# "X, the Countess. of (...)" is extracted as 'the Countess', which the bare
# 'Countess' entry would never match.
rare_titles = ['Lady', 'Countess', 'the Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
train_data['Title'] = train_data['Title'].replace(rare_titles, 'Rare')
test_data['Title'] = test_data['Title'].replace(rare_titles, 'Rare')

# 3. IsAlone: 1 when FamilySize is exactly 1 (passenger travels alone), else 0
# 4. Fare Per Person: the fare divided by the family size
for party_frame in (train_data, test_data):
    travels_alone = party_frame['FamilySize'] == 1
    party_frame['IsAlone'] = travels_alone.astype(int)
    party_frame['FarePerPerson'] = party_frame['Fare'] / party_frame['FamilySize']

# 5. Age Bins: bucket 'Age' into five right-closed intervals labelled 0..4
# 6. Deck: the first character of the 'Cabin' string
age_bin_edges = [0, 12, 18, 35, 60, 120]
age_bin_labels = [0, 1, 2, 3, 4]
for deck_frame in (train_data, test_data):
    deck_frame['AgeBin'] = pd.cut(deck_frame['Age'], bins=age_bin_edges, labels=age_bin_labels)
    deck_frame['Deck'] = deck_frame['Cabin'].map(lambda cabin_code: cabin_code[0])

# Encode the engineered categorical variables 'Title' and 'Deck'.
# 'Sex' and 'Embarked' are not repeated here: they were already label-encoded
# during preprocessing, and encoding the integer codes again leaves their
# values unchanged.
# Each column gets its own encoder, stored in `label_encoders`, so every fitted
# mapping stays available afterwards (e.g. for inverse_transform) instead of
# being overwritten by the next column's fit.
# NOTE: transform() raises ValueError if test_data holds a value that never
# occurs in the corresponding train_data column.
label_encoders = {}
for col in ['Title', 'Deck']:
    label_encoder = LabelEncoder()
    train_data[col] = label_encoder.fit_transform(train_data[col])
    test_data[col] = label_encoder.transform(test_data[col])
    label_encoders[col] = label_encoder

# Step 3: Feature selection
features = [
    'Pclass',
    'Sex',
    'AgeBin',
    'SibSp',
    'Parch',
    'FarePerPerson',
    'Embarked',
    'FamilySize',
    'IsAlone',
    'Title',
    'Deck',
]

# Model inputs and target from the training frame; same columns from the test frame.
X, y = train_data[features], train_data['Survived']
X_test = test_data[features]

# Hold out 20% of the training rows for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 4: Fit a 100-tree random forest on the training split (fixed seed)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Score the fitted model on the held-out validation split
y_pred_val = rf_model.predict(X_val)
val_accuracy = accuracy_score(y_true=y_val, y_pred=y_pred_val)
print(f"Validation Accuracy: {val_accuracy * 100:.2f}%")

# Step 5: Predict survival for the test set
y_pred_test = rf_model.predict(X_test)

# Write one row per test passenger (id + predicted label), without the index column
output = pd.DataFrame(
    {
        'PassengerId': test_data['PassengerId'],
        'Survived': y_pred_test,
    }
)
output.to_csv(path_or_buf='titanic_predictions_with_features.csv', index=False)

print("Predictions saved to titanic_predictions_with_features.csv")


Validation Accuracy: 84.92%
Predictions saved to titanic_predictions_with_features.csv
