In [3]:
# titanic_model.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import os

In [6]:
# Move one level up so that relative paths such as data/train.csv resolve.
# Guarded on the presence of the 'data' folder so that re-running this cell
# does not keep climbing the directory tree (a bare chdir('../') is not
# idempotent). NOTE(review): assumes the starting folder has no 'data'
# sub-directory of its own -- confirm against the repository layout.
if not os.path.isdir('data'):
    os.chdir('../')

In [7]:
# Display the current working directory so the effect of the preceding
# chdir cell can be checked before any relative path is read.
os.getcwd()

'/Users/akshika47/Documents/GitHub/mlops'

In [9]:
# Read both CSV files from the data/ folder (relative to the current working
# directory); train.csv is parsed first, then test.csv.
train_data, test_data = (
    pd.read_csv(os.path.join('data', csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

In [11]:


# Load data
def load_data(data_dir='data'):
    """Read the training and test CSV files into DataFrames.

    Parameters
    ----------
    data_dir : str or os.PathLike, default 'data'
        Directory that contains ``train.csv`` and ``test.csv``. A relative
        value is resolved against the current working directory; the default
        reproduces the previously hard-coded location.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_data, test_data)`` as parsed by ``pd.read_csv``.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when either file is missing.
    """
    train_data = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    test_data = pd.read_csv(os.path.join(data_dir, 'test.csv'))
    return train_data, test_data

# Preprocess data
def preprocess_data(train_data, test_data):
    """Drop identifier columns and build the (unfitted) feature preprocessor.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training frame. Must contain 'PassengerId', 'Name', 'Ticket',
        'Cabin' and 'Survived'; a missing column makes ``drop`` raise.
    test_data : pandas.DataFrame
        Test frame. Must contain 'PassengerId', 'Name', 'Ticket' and 'Cabin'.

    Returns
    -------
    tuple
        ``(preprocessor, X, y, test_data)`` where ``preprocessor`` is an
        unfitted ``ColumnTransformer``, ``X`` is the training frame without
        the dropped columns and without 'Survived', ``y`` is the 'Survived'
        column, and ``test_data`` is the test frame without the dropped
        columns. The input frames themselves are not modified.
    """
    # Columns removed from both frames before modelling
    unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
    train_features = train_data.drop(columns=unused_columns)
    test_features = test_data.drop(columns=unused_columns)

    # Separate the label from the predictors
    y = train_features['Survived']
    X = train_features.drop(columns='Survived')

    # Scaled columns: fill gaps with the median, then standardise
    scaled_columns = ['Age', 'Fare']
    scale_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]

    # One-hot columns: fill gaps with the most frequent value, then encode;
    # handle_unknown='ignore' keeps transform from failing on unseen values
    encoded_columns = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
    encode_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    preprocessor = ColumnTransformer(transformers=[
        ('num', Pipeline(steps=scale_steps), scaled_columns),
        ('cat', Pipeline(steps=encode_steps), encoded_columns),
    ])

    return preprocessor, X, y, test_features

# Train model
def train_model(preprocessor, X, y, test_size=0.2, random_state=42, n_estimators=100):
    """Fit a preprocessing + random-forest pipeline and print hold-out metrics.

    The data is split into a training part and a validation part; the
    pipeline is fitted on the training part only and scored on the
    validation part.

    Parameters
    ----------
    preprocessor : transformer
        Feature-preprocessing step placed in front of the classifier.
    X : pandas.DataFrame
        Feature frame.
    y : array-like
        Target labels aligned with ``X``.
    test_size : float or int, default 0.2
        Passed to ``train_test_split``; size of the validation hold-out.
    random_state : int, default 42
        Seed used for both the train/validation split and the forest.
    n_estimators : int, default 100
        Number of trees in the ``RandomForestClassifier``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The pipeline fitted on the training part of the split (it is not
        refitted on the full data).
    """
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=n_estimators,
                                              random_state=random_state))
    ])

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    model.fit(X_train, y_train)

    # Report performance on the held-out validation rows
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f'Validation Accuracy: {accuracy:.2f}')
    print('Confusion Matrix:')
    print(confusion_matrix(y_val, y_pred))
    print('Classification Report:')
    print(classification_report(y_val, y_pred))

    return model

# Predict on test data
def predict(model, test_data):
    """Return whatever ``model.predict`` yields for ``test_data``.

    Thin wrapper: ``model`` only needs to expose a ``predict`` method, and
    ``test_data`` is handed to it unchanged.
    """
    return model.predict(test_data)

# Main function
def main():
    """Run the full workflow: load, preprocess, train/evaluate, predict.

    Prints the validation metrics (via ``train_model``) followed by the
    predictions for the test frame. Returns nothing.
    """
    raw_train, raw_test = load_data()
    preprocessor, features, target, test_features = preprocess_data(raw_train, raw_test)
    fitted_model = train_model(preprocessor, features, target)
    predictions = predict(fitted_model, test_features)
    print(f'Predictions: {predictions}')

# Entry-point guard: main() runs only when __name__ is '__main__', i.e. when
# this code is executed directly rather than imported as a module.
if __name__ == '__main__':
    main()

Validation Accuracy: 0.82
Confusion Matrix:
[[91 14]
 [18 56]]
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       105
           1       0.80      0.76      0.78        74

    accuracy                           0.82       179
   macro avg       0.82      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179

Predictions: [0 0 0 1 0 0 0 0 1 0 0 0 1 0 1 1 0 1 0 1 1 0 1 0 1 0 1 1 1 0 0 0 0 0 1 1 0
 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 0 0
 1 0 0 1 0 1 1 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 0 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
 0 0 1 0 0 1 0 0 1 0 1 1 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 1 1 0 0 1 0 0 0 1 0 0 1 0 1 1 1 1 1 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 