In [None]:
# Import required libraries
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, confusion_matrix,
                           classification_report, roc_auc_score)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

warnings.filterwarnings('ignore')

# Read the passenger records; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='titanic.csv')

# Data preprocessing
def preprocess_data(df):
    """Clean the raw passenger table and add engineered features.

    The input frame is left untouched: all work happens on a copy, so the
    cell calling this function can be re-run safely.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns read or dropped below: ``Age``,
        ``Embarked``, ``SibSp``, ``Parch``, ``Sex``, ``PassengerId``,
        ``Name``, ``Ticket`` and ``Cabin``. Any other columns are passed
        through unchanged.

    Returns
    -------
    pandas.DataFrame
        A new frame with missing ``Age`` / ``Embarked`` filled, ``Sex`` and
        ``Embarked`` integer-encoded, the extra columns ``FamilySize``,
        ``IsAlone`` and ``AgeBin``, and the identifier-like columns removed.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Work on a copy so the caller's frame is never mutated.
    df = df.copy()

    # Handle missing values. Assign the result back instead of using
    # ``df[col].fillna(..., inplace=True)``: that chained in-place form
    # operates on an intermediate object and is not guaranteed to update df.
    df['Age'] = df['Age'].fillna(df['Age'].median())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

    # Feature engineering
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1  # +1 counts the passenger
    df['IsAlone'] = np.where(df['FamilySize'] > 1, 0, 1)
    # include_lowest=True keeps Age == 0 in the first bin instead of NaN.
    # Ages above 100 still fall outside every bin and become NaN.
    df['AgeBin'] = pd.cut(df['Age'], bins=[0, 12, 18, 65, 100],
                          labels=[0, 1, 2, 3], include_lowest=True)

    # Convert categorical features; values missing from a mapping become NaN.
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df['Embarked'] = df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

    # Drop unnecessary columns (returns a new frame, nothing done in place)
    return df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

processed_df = preprocess_data(df)

# Separate the label column from the predictor columns
y = processed_df['Survived']
X = processed_df.drop(columns='Survived')

# Hold out 20% of the rows for validation, stratified on the label,
# with a fixed seed so the split is repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Standardise features; the scaler is fitted on the training split only
# and then applied unchanged to the validation split
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Initialize models
# The tree ensembles get an explicit seed (the same value used for the
# train/validation split) so that metrics and the model picked as "best"
# are repeatable from one run to the next.
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42)
}

# Train and evaluate models
# Every model is fitted on the scaled training split and scored on the
# scaled validation split; results maps model name -> metric dict.
results = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_val_scaled)
    # Column 1 of predict_proba is the probability of the positive class
    y_proba = model.predict_proba(X_val_scaled)[:, 1]

    results[name] = {
        'accuracy': accuracy_score(y_val, y_pred),
        'roc_auc': roc_auc_score(y_val, y_proba)
    }

# Tabulate the metrics with one row per model and one column per metric
results_df = pd.DataFrame(results).transpose()
print("\nModel Performance Comparison:", results_df, sep="\n")

# Pick the model whose validation ROC-AUC is highest
best_model_name = results_df.loc[:, 'roc_auc'].idxmax()
best_model = models[best_model_name]
print(f"\nBest Model: {best_model_name}")

# Plot the ten largest feature importances, but only when the selected
# model exposes a `feature_importances_` attribute
if hasattr(best_model, 'feature_importances_'):
    feature_importances = pd.Series(
        data=best_model.feature_importances_,
        index=X.columns,
    )
    ax = feature_importances.nlargest(10).plot.barh()
    ax.set_title('Feature Importances')
    plt.show()

# Confusion matrix of the selected model on the validation split,
# drawn as an annotated heatmap (integer cell counts)
y_pred = best_model.predict(X_val_scaled)
cm = confusion_matrix(y_val, y_pred)
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

# Per-class precision / recall / F1 summary for the same predictions
print("\nClassification Report:", classification_report(y_val, y_pred), sep="\n")

# Hyperparameter tuning example (XGBoost)
# Runs only when the model selected as best is an XGBClassifier.
if isinstance(best_model, XGBClassifier):
    # 3 * 2 * 2 = 12 parameter combinations, each scored with 5-fold CV
    param_grid = {
        'max_depth': [3, 5, 7],
        'learning_rate': [0.1, 0.01],
        'n_estimators': [100, 200]
    }

    grid_search = GridSearchCV(XGBClassifier(random_state=42), param_grid,
                              cv=5, scoring='roc_auc')
    grid_search.fit(X_train_scaled, y_train)

    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best ROC-AUC: {grid_search.best_score_:.4f}")

    # Do not discard the tuned estimator: score it on the same validation
    # split as the baseline and keep it as best_model only if it is
    # actually better there. best_estimator_ has already been refitted on
    # the full training split by GridSearchCV (refit=True is the default).
    tuned_model = grid_search.best_estimator_
    tuned_val_auc = roc_auc_score(
        y_val, tuned_model.predict_proba(X_val_scaled)[:, 1])
    print(f"Tuned validation ROC-AUC: {tuned_val_auc:.4f}")
    if tuned_val_auc > results[best_model_name]['roc_auc']:
        best_model = tuned_model

# Save the best model
# The model was fitted on standardised features, so the fitted scaler is
# saved next to it; without it the pickled model cannot be applied to raw
# feature values.
joblib.dump(best_model, 'best_titanic_model.pkl')
joblib.dump(scaler, 'titanic_scaler.pkl')
print("\nModel saved as 'best_titanic_model.pkl'")
print("Scaler saved as 'titanic_scaler.pkl'")

ModuleNotFoundError: No module named 'xgboost'

In [6]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.4-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.5/124.9 MB 3.4 MB/s eta 0:00:38
   ---------------------------------------- 1.3/124.9 MB 4.0 MB/s eta 0:00:32
    --------------------------------------- 2.1/124.9 MB 3.9 MB/s eta 0:00:32
   - -------------------------------------- 3.1/124.9 MB 3.8 MB/s eta 0:00:33
   - -------------------------------------- 4.2/124.9 MB 3.9 MB/s eta 0:00:31
   - -------------------------------------- 5.0/124.9 MB 4.1 MB/s eta 0:00:30
   - -------------------------------------- 6.0/124.9 MB 4.1 MB/s eta 0:00:29
   -- ------------------------------------- 6.8/124.9 MB 4.0 MB/s eta 0:00:30
   -- ------------------------------------- 7.6/124.9 MB 4.1 MB/s eta 0:00:29
   -- ------------------------------------- 8.4/124.9 MB 4.0 MB/s eta 0:00:30
 