In [None]:
#Cross Validation

üéØ When to Use CV?
‚úÖ Anytime you're:

Testing many models

Tuning hyperparameters

Publishing a real ML project

In [5]:
import pandas as pd

df = pd.read_csv("titanic_cleaned.csv")
df = df.drop('Embarked_C',axis = 1)
#Define features and target
X= df.drop('Survived',axis = 1)
y = df['Survived']


In [None]:
#Why?

#Train Test Split gives result on 1 random sample but,
#Cross Validation (CV) tests model on multiple folds to make sure it generalizes well

#whats multiple folds????

| Benefit                      | Why it matters                                           |
| ---------------------------- | -------------------------------------------------------- |
| **More Reliable Scores**     | Avoids misleading results from a single train-test split |
| **Detects Overfitting**      | Shows if model performs well only on training folds      |
| **Works on Small Datasets**  | Makes the most of limited data                           |
| **Improves Model Selection** | Helps pick the best model based on average performance   |


In [7]:
from sklearn.model_selection import cross_val_score

#we will get accuracy on cross-validating Logistic Regression model
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter= 1000)

scores = cross_val_score(model , X , y, cv= 5) #cv = 5 folds cross validation

print('CV Scores:',scores)
print('Average CV Score', scores.mean())

#Why cv=5?
#Splits data into 5 parts ‚Üí trains on 4, tests on 1 ‚Üí repeats 5 times

#cv=5 is a safe, standard default

CV Scores: [0.78089888 0.78651685 0.78651685 0.76966292 0.81355932]
Average CV Score 0.7874309655303752


In [43]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()

scores = cross_val_score(model,X,y,cv=5)

print("CV Scores:",scores)
print("Average CV Score:",scores.mean())

CV Scores: [0.56179775 0.76966292 0.8258427  0.76404494 0.83050847]
Average CV Score: 0.7503713578366026


In [44]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()

scores = cross_val_score(model,X,y,cv=5)

print("CV Scores:",scores)
print("Average CV Score:",scores.mean())

CV Scores: [0.70224719 0.80337079 0.84831461 0.80898876 0.85875706]
Average CV Score: 0.8043356820923


üÜö Alternatives to cross_val_score?

| Option            | When to Use                                                          |
| ----------------- | -------------------------------------------------------------------- |
| `cross_val_score` | Quick accuracy check across folds                                    |
| `cross_validate`  | If you want multiple metrics (e.g., precision, recall)               |
| `StratifiedKFold` | If data is imbalanced and you want each fold to preserve class ratio |
| `ShuffleSplit`    | If data is not randomly distributed                                  |


In [13]:
from sklearn.model_selection import cross_validate
scores = cross_validate(model , X , y, cv= 5, return_train_score = True) #cv = 5 folds cross validation

print('CV Scores:',scores)
#print('Average CV Score', scores.mean())   does not have mean() attribute
print(scores['test_score'])   # Test scores
print(scores['train_score'])  # Training scores

#Why use cross_validate?
#Gives both train and test scores
#Can return fit times, scoring multiple metrics, etc.

#how does it returns pricison and recall?

CV Scores: {'fit_time': array([0.6504848 , 0.4265151 , 0.30633259, 0.56457138, 0.48725533]), 'score_time': array([0.00806141, 0.00615668, 0.0086143 , 0.00483155, 0.00511217]), 'test_score': array([0.78089888, 0.78651685, 0.78651685, 0.76966292, 0.81355932]), 'train_score': array([0.80590717, 0.80309423, 0.80028129, 0.8045007 , 0.79494382])}
[0.78089888 0.78651685 0.78651685 0.76966292 0.81355932]
[0.80590717 0.80309423 0.80028129 0.8045007  0.79494382]


In [16]:
#startified K fold (for classification)
#StratifiedKFold ensures that each fold has the same proportion of classes as the whole dataset (useful when classes are imbalanced).

from sklearn.model_selection import StratifiedKFold,cross_validate

skf = StratifiedKFold(n_splits = 5)

scores = cross_validate(model, X, y, cv = skf, return_train_score = True)

print(scores)
print(scores['test_score'])   # Test scores
print(scores['train_score']) 


#When to use ??

#our target (y) has unequal class counts
#Eg: 90% class A, 10% class B ‚Üí random split may miss class B

{'fit_time': array([0.52589631, 0.34317136, 0.26671958, 0.34599519, 0.27939725]), 'score_time': array([0.00463152, 0.00474763, 0.00550771, 0.00444198, 0.00734234]), 'test_score': array([0.78089888, 0.78651685, 0.78651685, 0.76966292, 0.81355932]), 'train_score': array([0.80590717, 0.80309423, 0.80028129, 0.8045007 , 0.79494382])}
[0.78089888 0.78651685 0.78651685 0.76966292 0.81355932]
[0.80590717 0.80309423 0.80028129 0.8045007  0.79494382]


In [20]:
 #KFold (basic cross-validation)
from sklearn.model_selection import KFold

kf = KFold(n_splits= 5 , shuffle = True, random_state = 42)
scores = cross_validate(model, X, y, cv = kf, return_train_score = True)
print(scores)
print(scores['test_score'])   # Test scores
print(scores['train_score']) 

#Question: significance of shufgfle in basic Kfold??

{'fit_time': array([1.25413299, 0.29655457, 0.32298446, 0.12730789, 0.39651418]), 'score_time': array([0.00474238, 0.00487208, 0.00444007, 0.00621581, 0.00520086]), 'test_score': array([0.78089888, 0.80898876, 0.80898876, 0.81460674, 0.75706215]), 'train_score': array([0.79746835, 0.80309423, 0.80590717, 0.79606188, 0.80337079])}
[0.78089888 0.80898876 0.80898876 0.81460674 0.75706215]
[0.79746835 0.80309423 0.80590717 0.79606188 0.80337079]


In [22]:
#ShuffleSplit (random train-test splits)
#Randomly splits the data n times
#Each time: train on 80%, test on 20%
#Good for very large datasets

from sklearn.model_selection import ShuffleSplit

ss = ShuffleSplit(n_splits = 5, test_size = 0.2 ,random_state = 42)
scores = cross_validate(model, X, y, cv = ss, return_train_score = True)
print(scores)
print(scores['test_score'])   # Test scores
print(scores['train_score'])

#Difference between K-fold and ShuffleSplit???

{'fit_time': array([0.5429287 , 0.15244985, 0.40941525, 0.20128846, 0.27089572]), 'score_time': array([0.00463319, 0.00460196, 0.00492239, 0.00449944, 0.00447416]), 'test_score': array([0.78089888, 0.80337079, 0.7752809 , 0.80898876, 0.71910112]), 'train_score': array([0.79746835, 0.80168776, 0.79887482, 0.80168776, 0.81997187])}
[0.78089888 0.80337079 0.7752809  0.80898876 0.71910112]
[0.79746835 0.80168776 0.79887482 0.80168776 0.81997187]


| Method            | Best For                         | Keeps class balance? | Custom train/test sizes? |
| ----------------- | -------------------------------- | -------------------- | ------------------------ |
| `cross_val_score` | Quick evaluation                 | ‚ùå (unless combined)  | ‚ùå                        |
| `cross_validate`  | Detailed results (train/test)    | ‚ùå                    | ‚ùå                        |
| `StratifiedKFold` | Classification (imbalanced data) | ‚úÖ                    | ‚ùå                        |
| `KFold`           | General purpose CV               | ‚ùå                    | ‚ùå                        |
| `ShuffleSplit`    | Custom train/test splits         | ‚ùå                    | ‚úÖ                        |


Hyperparameter Tuning

 Why?
Models like RandomForest have settings (hyperparameters) like max_depth, n_estimators, etc. Tuning = finding the best combo.

In [24]:
#lets achieve this HyperParameter Tuning using GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators':[100,200],
    'max_depth':[4, 6, 8],
    'min_samples_split':[2,5],
}

rf = RandomForestClassifier(random_state = 42)
grid_search = GridSearchCV(rf, param_grid, cv = 5 , scoring = 'accuracy')

grid_search.fit(X, y)

print("Best Params:", grid_search.best_params_)
print("Best CV Accuracy:", grid_search.best_score_)


#while defining these values in param_grid are these random trial and error entries???

Best Params: {'max_depth': 6, 'min_samples_split': 5, 'n_estimators': 200}
Best CV Accuracy: 0.8155716371484797


| Tool                      | When to Use                              |
| ------------------------- | ---------------------------------------- |
| `GridSearchCV`            | ‚úÖ Try all combos (exhaustive search)     |
| `RandomizedSearchCV`      | ‚úÖ Faster for large search spaces         |
| `Optuna`, `BayesSearchCV` | Advanced, smarter searching (future use) |


In [25]:
#RandomizedSearchCV (Faster tuning than GridSearch)
#What it does:
#Instead of trying every combination (like GridSearch), it randomly samples a few combinations from the parameter grid.
#Useful when your grid is large or expensive to compute.

In [27]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': [4, 6, 8, 10, None],
    'min_samples_split': [2, 5, 10]
}

rf = RandomForestClassifier(random_state = 42)

random_search = RandomizedSearchCV(
    rf,
    param_distributions = param_dist,
    n_iter = 10,     # Try 10 random combinations
    cv = 5,
    scoring = 'accuracy',
    random_state = 42
)

random_search.fit(X, y)

print("Best Params:", random_search.best_params_)
print("Best CV Accuracy:", random_search.best_score_)

Best Params: {'max_depth': 6, 'min_samples_split': 2, 'n_estimators': 260}
Best CV Accuracy: 0.8189551196597472


In [31]:
#Optuna
!pip install optuna


Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
Downloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna

   -------------------- ------------------- 1/2 [optuna]
   -------------------- ------------------- 1/2 [optuna]
   -------------------- ------------------- 1/2 [optuna]
   -------------------- ------------------- 1/2 [optuna]
   -------------------- ------------------- 1/2 [optuna]
   -------------------- ------------------- 1/2 [optuna]
   -------------------- ------------------- 1/2 [optuna]
   -------------------- ------------------- 1/2 [optuna]
   -------------------- ------------------- 1/2 [optuna]
   -------------------- ------------------- 1/2 [optuna]
   -------------------- ------------------- 1/2 [optuna]
   -------------------- ------------------- 1

In [33]:
import optuna
from sklearn.model_selection import cross_val_score


In [None]:
#optuna: The library doing the smart hyperparameter search.
#cross_val_score: To evaluate model accuracy using cross-validation.
#RandomForestClassifier: The model we‚Äôre tuning.

In [34]:
#Why:
#Optuna works by calling this function many times (once per trial).
#Each time, it passes a trial object that you use to sample parameters.
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    max_depth = trial.suggest_int('max_depth', 4, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    #Why:
    #We‚Äôre asking Optuna to try different values:
    #n_estimators: number of trees (between 100 and 500)
    #max_depth: depth of each tree (between 4 and 20)
    #min_samples_split: minimum samples to split a node (between 2 and 10)
    #trial.suggest_int(...) tells Optuna to choose an integer in this range.
    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=42
    )
    #Why:
    #Create a Random Forest model using the parameters sampled by Optuna.
    #We keep random_state=42 for reproducibility (same results every time).
    score = cross_val_score(clf, X, y, cv=5, scoring='accuracy').mean()
    #Why:
    #Perform 5-fold cross-validation on the current model.
    #cross_val_score() gives us 5 accuracy values ‚Äî we take the mean as the final score for this trial.
    return score
    #Why:
    #This is the score Optuna will use to decide:
    #‚ÄúWas this parameter combination good or bad?‚Äù
    #It tries to maximize this return value (in our case, accuracy).

study = optuna.create_study(direction='maximize')
#Why:Start a study (an optimization session).
#direction='maximize' because we want to maximize accuracy.

study.optimize(objective, n_trials=20)
#Why:
#Run 20 trials (i.e., try 20 different combinations).
#It will call the objective() function 20 times with different parameters.

print("Best trial:")
print(study.best_trial)
#Why:
#After all trials, print the best combination of parameters and the score.
    


    


[I 2025-06-29 12:44:15,021] A new study created in memory with name: no-name-74fea0f1-90ed-42c8-833b-860435d09d4e
[I 2025-06-29 12:44:27,364] Trial 0 finished with value: 0.8031993905922683 and parameters: {'n_estimators': 409, 'max_depth': 19, 'min_samples_split': 7}. Best is trial 0 with value: 0.8031993905922683.
[I 2025-06-29 12:44:42,178] Trial 1 finished with value: 0.8054338856090902 and parameters: {'n_estimators': 467, 'max_depth': 15, 'min_samples_split': 3}. Best is trial 1 with value: 0.8054338856090902.
[I 2025-06-29 12:44:44,140] Trial 2 finished with value: 0.8088110201231512 and parameters: {'n_estimators': 102, 'max_depth': 9, 'min_samples_split': 10}. Best is trial 2 with value: 0.8088110201231512.
[I 2025-06-29 12:44:49,948] Trial 3 finished with value: 0.8099473116231829 and parameters: {'n_estimators': 205, 'max_depth': 14, 'min_samples_split': 6}. Best is trial 3 with value: 0.8099473116231829.
[I 2025-06-29 12:45:01,288] Trial 4 finished with value: 0.80544658160

Best trial:
FrozenTrial(number=12, state=1, values=[0.8178378721513362], datetime_start=datetime.datetime(2025, 6, 29, 12, 46, 2, 86312), datetime_complete=datetime.datetime(2025, 6, 29, 12, 46, 8, 699535), params={'n_estimators': 250, 'max_depth': 6, 'min_samples_split': 5}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=500, log=False, low=100, step=1), 'max_depth': IntDistribution(high=20, log=False, low=4, step=1), 'min_samples_split': IntDistribution(high=10, log=False, low=2, step=1)}, trial_id=12, value=None)


In [None]:
#Now that we tried different tuning methods to find best parameters lets use our findings
#from GridSearchCV to find which features actually influenced the model's decision

In [None]:
#Feature Importance (Random Forest)
#When to Use Feature Importance:
#To remove useless features
#To explain the model
#To add to your GitHub README üí™

In [40]:
best_model = grid_search.best_estimator_
importances = best_model.feature_importances_
feature_names = X.columns

feature_df = pd.DataFrame({
    'Feature': feature_names,
     'Importance': importances
}).sort_values(by = 'Importance', ascending = False)

print(feature_df)

       Feature  Importance
2          Sex    0.431339
6         Fare    0.149163
1       Pclass    0.137537
3          Age    0.108673
0  PassengerId    0.075074
4        SibSp    0.043218
5        Parch    0.027595
8   Embarked_S    0.019656
7   Embarked_Q    0.007743


In [41]:
#Saving OUR BEST MODEL

import joblib

joblib.dump(best_model,'titanic_final_model.pkl')

['titanic_final_model.pkl']

In [42]:
#How to load a model

model = joblib.load('titanic_final_model.pkl')