In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
import joblib

# Read the training and test CSV files from the working directory
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)

def preprocess_data(df):
    """Return a model-ready copy of a passenger DataFrame.

    Missing ``Age`` and ``Fare`` values are replaced with that column's
    median, and missing ``Embarked`` values with the column's most frequent
    value. ``Sex`` is label-encoded, ``Embarked`` is one-hot encoded with
    the first level dropped, and ``Name``, ``Ticket`` and ``Cabin`` are
    removed.

    Every statistic (medians, mode, label classes, dummy levels) is computed
    from ``df`` itself, so two frames processed separately are not
    guaranteed to end up with the same encodings or columns.

    Args:
        df: DataFrame containing at least the columns ``Age``, ``Fare``,
            ``Embarked``, ``Sex``, ``Name``, ``Ticket`` and ``Cabin``.

    Returns:
        A new DataFrame; the frame passed in is left unmodified.
    """
    # Work on a copy so the caller's frame is not partially mutated.
    df = df.copy()

    # Handle missing values
    imputer = SimpleImputer(strategy='median')
    for column in ('Age', 'Fare'):
        # fit_transform returns an (n, 1) array; flatten it to one column.
        df[column] = imputer.fit_transform(df[[column]]).ravel()
    # Assign the filled column back rather than calling fillna(inplace=True)
    # on a column selection, which may operate on a temporary object and
    # leave the frame unchanged.
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

    # Convert categorical variables to numerical
    df['Sex'] = LabelEncoder().fit_transform(df['Sex'])
    df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)

    # Drop columns that won't be used in the model
    return df.drop(columns=['Name', 'Ticket', 'Cabin'])

# Run both datasets through the shared preprocessing step
train_df, test_df = preprocess_data(train_df), preprocess_data(test_df)

# The target is the Survived column; the features are every remaining
# column except the passenger identifier
y = train_df['Survived']
X = train_df.drop(columns=['Survived', 'PassengerId'])

# Hold out 20% of the rows for validation, with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Train the model once; the seed is fixed, so fitting a second identical
# forest on the same data would only repeat the same work.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save the model to a file
joblib.dump(model, 'titanic_model.pkl')

# Evaluate the model
# Score the fitted model on the hold-out split, which it was not trained on.
val_accuracy = model.score(X_val, y_val)
print(f'Validation accuracy: {val_accuracy}')

# 5-fold cross-validation over the full training data. cross_val_score
# fits clones of the estimator, so the fitted `model` is left as it is.
scores = cross_val_score(model, X, y, cv=5)
print(f'Cross-validation scores: {scores}')
print(f'Average cross-validation score: {scores.mean()}')

# Prepare the test data
# The test frame was one-hot encoded on its own, so its columns may differ
# from the training features in set or order. Reindex to the training
# columns: dummy columns absent from the test data are filled with 0 and
# columns the model never saw are dropped.
X_test = test_df.drop(['PassengerId'], axis=1)
X_test = X_test.reindex(columns=X.columns, fill_value=0)

# Predict using the trained model
predictions = model.predict(X_test)

# Create a DataFrame with the results
output = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived': predictions})

# Save the predictions to a CSV file (left disabled; uncomment to write it)
#output.to_csv('titanic_predictions.csv', index=False)


: 