In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import pickle

# Create and save model as pickle file

In [2]:


# Load Titanic dataset
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
data = pd.read_csv(url)

# Drop unnecessary columns and handle missing values
X = data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Sex', 'Embarked', 'Survived'])
y = data['Survived']

# Fill missing values with the mean
X = X.fillna(X.mean())

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the RandomForest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Save the model to a pickle file
with open('titanic_model.pkl', 'wb') as f:
    pickle.dump(model, f)

print("Model saved as 'titanic_model.pkl'")


Model saved as 'titanic_model.pkl'


# Predict using the model

In [4]:
# Load the model from the pickle file
with open('titanic_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# Make predictions with the loaded model
predictions = loaded_model.predict(X_test_scaled)
# Compare predictions with the actual test labels (y_test)
comparison = (predictions == y_test.values)

# Create a DataFrame to display X_test_scaled along with predictions and True/False comparison
df = pd.DataFrame(X_test_scaled, columns=X_train.columns)  # Use same columns as X_train
df['Prediction'] = predictions
df['Actual'] = y_test.values
df['Correct'] = comparison

# Display the DataFrame (first 10 rows)
print(df.head(10))


     Pclass       Age     SibSp     Parch      Fare  Prediction  Actual  \
0  0.813034  0.012390  0.379923  0.784700 -0.333901           0       1   
1 -0.400551  0.112570 -0.470722 -0.479342 -0.425284           0       0   
2  0.813034 -0.734533 -0.470722 -0.479342 -0.474867           0       0   
3 -0.400551 -1.812666 -0.470722  0.784700  0.007966           1       1   
4  0.813034 -1.196590  0.379923 -0.479342 -0.411002           0       1   
5 -1.614136 -0.272477 -0.470722 -0.479342  0.890834           1       1   
6  0.813034  0.012390 -0.470722 -0.479342 -0.478237           0       1   
7  0.813034 -1.042571  1.230569 -0.479342 -0.280867           0       0   
8  0.813034 -1.042571 -0.470722 -0.479342 -0.478237           1       1   
9 -1.614136 -0.811543 -0.470722  2.048742 -0.121367           1       1   

   Correct  
0    False  
1     True  
2     True  
3     True  
4    False  
5     True  
6    False  
7     True  
8     True  
9     True  


# Evaluate the model

In [6]:


# Make predictions with the loaded model
predictions = loaded_model.predict(X_test_scaled)

# Calculate confusion matrix
conf_matrix = confusion_matrix(y_test, predictions)

# Print confusion matrix
print("Confusion Matrix:")
print(conf_matrix)

# Calculate and print other performance metrics
accuracy = accuracy_score(y_test, predictions)
print(f"\nAccuracy: {accuracy:.4f}")

# Classification report (includes precision, recall, F1-score)
class_report = classification_report(y_test, predictions, target_names=['Did not survive', 'Survived'])
print("\nClassification Report:")
print(class_report)


Confusion Matrix:
[[86 19]
 [31 43]]

Accuracy: 0.7207

Classification Report:
                 precision    recall  f1-score   support

Did not survive       0.74      0.82      0.77       105
       Survived       0.69      0.58      0.63        74

       accuracy                           0.72       179
      macro avg       0.71      0.70      0.70       179
   weighted avg       0.72      0.72      0.72       179

