<a href="https://colab.research.google.com/github/samer-glitch/Trustworthy-AI-Data-Pipeline-Framework/blob/main/8_Model_Training_and_Validation_With_and_Without_techniques.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Fetch the Titanic CSV and reduce it to complete, numeric rows:
# object-typed columns are discarded first, then any row that still
# contains a missing value is removed.
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
df = (
    pd.read_csv(url)
    .select_dtypes(exclude=['object'])
    .dropna()
)

# 'Survived' is the target; every remaining column is used as a feature.
y = df['Survived']
X = df.drop(columns='Survived')

# Seeded random forest and a shuffled, seeded 5-fold splitter.
model = RandomForestClassifier(random_state=42)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Score the model on each fold, then report per-fold and mean accuracy.
cv_scores = cross_val_score(estimator=model, X=X, y=y, scoring='accuracy', cv=kf)
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean CV accuracy: {cv_scores.mean()}")

Cross-validation scores: [0.67132867 0.71328671 0.6993007  0.74825175 0.74647887]
Mean CV accuracy: 0.7157293410814537


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Hold out 20% of the rows as a test set (seeded split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit a seeded random forest on the training rows and score it once on the
# held-out rows; there is no cross-validation in this cell.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = (y_pred == y_test).mean()  # fraction of predictions equal to the true label
print(f"Test accuracy without cross-validation: {accuracy}")

Test accuracy without cross-validation: 0.6643356643356644


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters: three forest sizes crossed with three depth limits.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10],
}

# Exhaustive search over the grid; each combination is scored by 5-fold
# cross-validated accuracy on the full X, y.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X, y)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")

Best parameters: {'max_depth': 5, 'n_estimators': 50}
Best cross-validation score: 0.7325322564759184


In [None]:
# Baseline for comparison with the grid search: a seeded random forest with
# default hyperparameters, fit and scored on the existing train/test split.
model = RandomForestClassifier(random_state=42)
model.fit(X=X_train, y=y_train)
y_pred = model.predict(X_test)

# Accuracy = share of test rows whose predicted label matches the true label.
accuracy = (y_pred == y_test).mean()
print(f"Model accuracy without hyperparameter tuning: {accuracy}")

Model accuracy without hyperparameter tuning: 0.6643356643356644


In [None]:
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_auc_score
from sklearn.model_selection import train_test_split

# Re-create the seeded 80/20 split and refit the existing `model` on the
# training portion (the estimator object itself comes from an earlier cell).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Confusion matrix of true labels against predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(f"Confusion Matrix:\n{cm}")

# Column 1 of predict_proba is used as the score for both the
# precision-recall curve points and the ROC-AUC. Only the ROC-AUC is
# printed; `precision` and `recall` are computed but not displayed here.
y_proba = model.predict_proba(X_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, y_proba)
roc_auc = roc_auc_score(y_test, y_proba)
print(f"ROC-AUC Score: {roc_auc}")

Confusion Matrix:
[[70 17]
 [31 25]]
ROC-AUC Score: 0.7277298850574713


In [None]:
# Accuracy alone, computed from the predictions and test labels left in
# scope by the preceding cell: the share of test rows predicted correctly.
is_correct = y_pred == y_test
accuracy = is_correct.mean()
print(f"Accuracy without thorough evaluation: {accuracy}")

Accuracy without thorough evaluation: 0.6643356643356644


In [None]:
from sklearn.preprocessing import OneHotEncoder

# Separate the target from the candidate features BEFORE any feature
# engineering. The previous version built X from `df` without removing
# 'Survived', so the label was fed to the model as a feature (target
# leakage) and the reported accuracy was a meaningless 1.0.
y = df['Survived']
feature_df = df.drop(columns='Survived')

# Handle categorical features using OneHotEncoder.
# NOTE(review): if `df` is still the frame produced by the loading cell, its
# object-typed columns were already removed there, so this selection is
# empty and the encoder contributes no columns -- confirm which frame this
# cell is meant to preprocess.
categorical_columns = feature_df.select_dtypes(include=['object']).columns
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_array = encoder.fit_transform(feature_df[categorical_columns])

# Convert encoded categorical features to a DataFrame that shares the
# original row index, so the concat below aligns row-for-row.
encoded_df = pd.DataFrame(
    encoded_array,
    index=feature_df.index,
    columns=encoder.get_feature_names_out(categorical_columns),
)

# Replace the original categorical columns with their encoded counterparts.
X = pd.concat([feature_df.drop(columns=categorical_columns), encoded_df], axis=1)

# Guard against the leak being reintroduced by a later edit.
assert 'Survived' not in X.columns, "target column leaked into the feature matrix"

# Train model on preprocessed data
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = (y_pred == y_test).mean()  # fraction of correct test predictions
print(f"Model Accuracy with Preprocessed Data: {accuracy}")

Model Accuracy with Preprocessed Data: 1.0


In [None]:
# Train model without preprocessing.
# The previous version reused X_train/X_test from the preprocessing cell, so
# it simply repeated the preprocessed run and the ValueError branch could
# never fire. Build a dedicated split straight from the raw CSV instead:
# no column filtering, no dropna, no encoding. Separate *_raw names are used
# so the preprocessed split from the earlier cell is not overwritten.
raw_df = pd.read_csv(url)
X_raw = raw_df.drop(columns='Survived')
y_raw = raw_df['Survived']
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X_raw, y_raw, test_size=0.2, random_state=42
)

model = RandomForestClassifier(random_state=42)
try:
    # Fitting on unprocessed columns is expected to be rejected by
    # scikit-learn with a ValueError when non-numeric values are present;
    # the error is caught and shown because the failure is the demonstration.
    model.fit(X_train_raw, y_train_raw)
    y_pred = model.predict(X_test_raw)
    accuracy = (y_pred == y_test_raw).mean()
    print(f"Model accuracy without preprocessing: {accuracy}")
except ValueError as e:
    print(f"Error during training without preprocessing: {e}")

Model accuracy without preprocessing: 1.0
