In [23]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Read the training and test splits from CSV files (paths are relative to
# the current working directory).
train_data, test_data = map(pd.read_csv, ('train.csv', 'test.csv'))

# Missing-value imputation.
# Each imputer is fitted on the training frame only; the statistic learned
# there is then reused to fill the gaps in the test frame.
age_imputer = SimpleImputer(strategy='median')
fare_imputer = SimpleImputer(strategy='median')
embarked_imputer = SimpleImputer(strategy='most_frequent')

# 'Age' and 'Fare': fill missing entries with the training median.
for _column, _imputer in (('Age', age_imputer), ('Fare', fare_imputer)):
    train_data[_column] = _imputer.fit_transform(train_data[[_column]])
    test_data[_column] = _imputer.transform(test_data[[_column]])

# 'Embarked': fill missing entries with the most frequent training value.
# The imputer returns a single-column 2D array, so take that column to get
# the 1D values assigned back to the frame.
train_data['Embarked'] = embarked_imputer.fit_transform(train_data[['Embarked']])[:, 0]
test_data['Embarked'] = embarked_imputer.transform(test_data[['Embarked']])[:, 0]

# Remove the columns that are not used further down:
#   - 'Cabin': dropped instead of imputed (original rationale: it has too
#     many missing values)
#   - 'Name', 'Ticket': treated as irrelevant
_unused_columns = ['Cabin', 'Name', 'Ticket']
train_data = train_data.drop(columns=_unused_columns)
test_data = test_data.drop(columns=_unused_columns)

# Convert the categorical columns 'Sex' and 'Embarked' into numeric indicator
# columns using one-hot encoding, then replace the originals with them.
categorical_columns = ['Sex', 'Embarked']

# The keyword requesting a dense array was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2, and the old name was removed in 1.4
# (passing it there raises TypeError). Try the current name first and fall
# back to the legacy one so this cell runs on both old and new releases.
# drop='first' leaves out the first category of each feature.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    encoder = OneHotEncoder(sparse=False, drop='first')

# Learn the categories from the training frame only and reuse them for the
# test frame so both end up with the same indicator columns.
encoded_train_sex_embarked = encoder.fit_transform(train_data[categorical_columns])
encoded_test_sex_embarked = encoder.transform(test_data[categorical_columns])

# Wrap the encoded arrays in DataFrames that share the index of the frame
# they came from, so the column-wise concat below aligns row by row.
encoded_columns = encoder.get_feature_names_out(categorical_columns)
encoded_train_df = pd.DataFrame(encoded_train_sex_embarked, columns=encoded_columns, index=train_data.index)
encoded_test_df = pd.DataFrame(encoded_test_sex_embarked, columns=encoded_columns, index=test_data.index)

# Append the indicator columns to the original data
train_data = pd.concat([train_data, encoded_train_df], axis=1)
test_data = pd.concat([test_data, encoded_test_df], axis=1)

# Drop the original categorical columns
train_data = train_data.drop(columns=categorical_columns)
test_data = test_data.drop(columns=categorical_columns)

# Min-max scale the numerical columns. The scaler is fitted on the training
# frame only and the same fitted scaler is applied to both frames.
_numeric_columns = ['Age', 'Fare']
scaler = MinMaxScaler().fit(train_data[_numeric_columns])
train_data[_numeric_columns] = scaler.transform(train_data[_numeric_columns])
test_data[_numeric_columns] = scaler.transform(test_data[_numeric_columns])

# Feature engineering: 'FamilySize' is SibSp + Parch + 1 for each row.
train_data['FamilySize'] = train_data['SibSp'].add(train_data['Parch']).add(1)
test_data['FamilySize'] = test_data['SibSp'].add(test_data['Parch']).add(1)

# Show the first rows of the processed train frame, then of the test frame.
print(train_data.head(), test_data.head(), sep='\n')


   PassengerId  Survived  Pclass       Age  SibSp  Parch      Fare  Sex_male  \
0            1         0       3  0.271174      1      0  0.014151       1.0   
1            2         1       1  0.472229      1      0  0.139136       0.0   
2            3         1       3  0.321438      0      0  0.015469       0.0   
3            4         1       1  0.434531      1      0  0.103644       0.0   
4            5         0       3  0.434531      0      0  0.015713       1.0   

   Embarked_Q  Embarked_S  FamilySize  
0         0.0         1.0           2  
1         0.0         0.0           2  
2         0.0         1.0           1  
3         0.0         1.0           2  
4         0.0         1.0           1  
   PassengerId  Pclass       Age  SibSp  Parch      Fare  Sex_male  \
0          892       3  0.428248      0      0  0.015282       1.0   
1          893       3  0.585323      1      0  0.013663       0.0   
2          894       2  0.773813      0      0  0.018909       1.0   

