In [1]:
#  Save Models in Scikit-Learn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np

# Toy training set: one row per customer. The columns appear to be
# [age, income] (the prediction example later in this notebook labels them so).
X_train = np.array([
    [25, 50000],
    [45, 80000],
    [35, 60000],
    [50, 95000],
])
# One binary class label per row, in the same order as the rows of X_train.
y_train = np.array([0, 1, 0, 1])

# Create the classifier with its default settings, then fit it on the toy data.
# The fitted estimator stays bound to `model` for the cells that follow.
model = LogisticRegression()
model.fit(X_train, y_train)

At this point, you have a trained model stored in the variable `model`. However, it exists only in your computer's memory while your program is running. To save it to disk, you'll use joblib.

In [2]:
import joblib

# Save the model to a file.
# joblib.dump() serializes the fitted estimator held in `model` and writes it
# to customer_prediction_model.pkl. It returns the list of file names it wrote;
# because the call is the last expression in the cell, that list is what the
# notebook shows as this cell's output.
joblib.dump(model, 'customer_prediction_model.pkl') 

['customer_prediction_model.pkl']

That's it! You've just saved your model to a file called customer_prediction_model.pkl. The .pkl extension stands for "pickle," which is the name of Python's serialization format. This file now contains everything your model learned during training.

In [3]:
# Loading Your Saved Model
import joblib

# Load the model from the file written by the save cell above.
# NOTE(review): joblib.load() unpickles the file, and unpickling can execute
# arbitrary code, so only load model files you created or otherwise trust.
loaded_model = joblib.load('customer_prediction_model.pkl')

# Now you can use it to make predictions without retraining.
# The nested list is a single row with the features in [age, income] order.
new_customer = [[30, 55000]]  # Age 30, income $55,000
prediction = loaded_model.predict(new_customer)
print(prediction)  # Prints a one-element array holding the class label, e.g. [0] or [1]

[0]


**Saving and loading models is a common workflow in machine learning, where you:**

1. Train a model once (which can be time-consuming)
2. Save it to disk so you don't have to retrain it every time
3. Load it later to make predictions on new data

**Note:** The `.pkl` file extension stands for "pickle", Python's built-in serialization format, which lets you store Python objects (like your trained model) on disk and reload them later.

In [4]:
# A Complete Example
# This cell trains a model, checks its accuracy, and saves it; loading and predicting follow in the next cell
from sklearn.datasets import load_iris  # is a built-in dataset in scikit-learn
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import joblib

# Load the bundled iris dataset and hold out 20% of the rows for evaluation.
# A fixed random_state keeps the split the same on every run.
# NOTE: this cell rebinds `y_train` and `model` from the first example.
iris = load_iris()
x_train, x_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
)

# Train the model: fit() returns the estimator itself, so construction and
# training are chained into a single statement.
model = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)

# Check its accuracy on the held-out rows.
accuracy = model.score(x_test, y_test)
print(f"Model accuracy: {accuracy}")

# Save the trained model to disk.
joblib.dump(model, 'iris_classifier.pkl')
print("Model saved successfully.")




Model accuracy: 1.0
Model saved successfully.


In [5]:
# Then, perhaps days later or in a different script entirely, you load and use the model:
import joblib
import numpy as np
# Load the saved model back from disk; no retraining is needed.
# NOTE(review): joblib.load() unpickles the file, which can run arbitrary code,
# so only load model files that come from a source you trust.
model = joblib.load('iris_classifier.pkl')

# Make a prediction for one flower. The single row must list its four
# measurements in the same column order as the iris data used for training.
new_flower = np.array([[5.1, 3.5, 1.5, 0.2]])
prediction = model.predict(new_flower)  # array holding one predicted class index
print(f"this flower is predicted to be type: {prediction[0]}")



this flower is predicted to be type: 0


## When you save a trained scikit-learn model using joblib, what exactly gets saved in the file?
All the learned parameters and patterns from training, such as coefficients, weights, or tree structures. 

Excellent! The saved file holds the entire fitted model object: its settings (hyperparameters) plus all the results of training — the weights, coefficients, tree structures, or whatever parameters the algorithm learned. This is why you can make predictions immediately after loading, without retraining.

In [6]:
# Create sample customer churn data for the example
import numpy as np

# Feature matrix: one row per customer, columns are [age, income].
X_train_churn = np.array([
    [25, 50000],
    [45, 80000],
    [35, 60000],
    [50, 95000],
    [28, 55000],
    [42, 75000],
    [38, 65000],
    [55, 100000],
    [30, 52000],
    [48, 85000],
    [33, 58000],
    [52, 92000],
])

# Target vector, aligned row-for-row with X_train_churn:
# 0 = stayed, 1 = churned.
y_train_churn = np.array([
    0, 1, 0, 1,
    0, 1, 0, 1,
    0, 1, 0, 1,
])

# Quick sanity check: both arrays must describe the same number of customers.
print(f"Customer churn data shape: X={X_train_churn.shape}, y={y_train_churn.shape}")
print("Data ready for training!")

Customer churn data shape: X=(12, 2), y=(12,)
Data ready for training!


In [7]:
# Saving file in different versions
import joblib
from sklearn.ensemble import RandomForestClassifier

# Train the first churn model on the sample customer data. The number of
# trees is left at the library default; random_state pins the forest's
# randomness so reruns produce the same model.
model = RandomForestClassifier(random_state=42).fit(X_train_churn, y_train_churn)

# Put the version number in the file name so that later versions are written
# next to this one instead of overwriting it.
version = 1
joblib.dump(
    model,
    f'customer_churn_model_v{version}.joblib',
)
print(f"Model version {version} saved successfully.")
print(f"Model trained on {len(X_train_churn)} customer records")

Model version 1 saved successfully.
Model trained on 12 customer records


In [10]:
# When we train a new version, just increment the version number
import joblib
from sklearn.ensemble import RandomForestClassifier

# Train version 2 with different hyperparameters (more trees). The training
# data is the same as for version 1, so only the model settings differ.
model_v2 = RandomForestClassifier(n_estimators=200, random_state=42)
model_v2.fit(X_train_churn, y_train_churn)

# Save the new version next to v1 by bumping the number in the file name.
version = 2
joblib.dump(model_v2, f'customer_churn_model_v{version}.joblib')
print(f"Model version {version} saved successfully.")

# Read v1's tree count from the saved v1 artifact instead of hard-coding it,
# so the comparison stays truthful if v1's settings (or the library default)
# ever change. The file is one this notebook wrote itself, so it is trusted.
model_v1 = joblib.load('customer_churn_model_v1.joblib')
print(f"Model v{version} trained with {model_v2.n_estimators} trees (vs {model_v1.n_estimators} in v1)")

Model version 2 saved successfully.
Model v2 trained with 200 trees (vs 100 in v1)
