In [43]:
# Why scikit-learn?
# - building machine learning models (classification, regression, clustering)
# - preprocessing data (scaling, encoding, splitting data)
# - evaluating models with tools such as accuracy, confusion matrices and cross-validation
# - building pipelines for ML workflows

In [12]:
from sklearn.datasets import load_digits

# Load the digits dataset and keep a small 40-sample window (indices 30..69).
digits = load_digits()
sample_window = slice(30, 70)  # single source of truth for which samples are used

target = digits.target[sample_window]
# Flatten every image into one feature row. The row count is derived from
# `target` so that changing `sample_window` can never desynchronise the reshape.
data = digits.images[sample_window].reshape((len(target), -1))

In [21]:
# Train/test split - hold part of the data back to estimate generalization performance on unseen samples
from sklearn.model_selection import train_test_split

# With shuffle=False the rows keep their original order, so the final 20% becomes the test set;
# with shuffling enabled the train and test rows would instead be chosen at random.
X_train, X_test, Y_train, Y_test = train_test_split(
    data,
    target,
    shuffle=False,
    test_size=0.2,
)

In [19]:
# Fit a logistic-regression classifier on the training split and score it on the held-out split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# fit() returns the estimator itself, so construction and training can be chained.
# NOTE(review): max_iter is presumably raised so the solver converges on the unscaled features - confirm.
model = LogisticRegression(max_iter=10000).fit(X_train, Y_train)

# Compare predictions on the test rows with their true labels
y_pred = model.predict(X_test)
accuracy = accuracy_score(Y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 87.50%


In [29]:
# Cross-validation - repeat the split so that the training and testing sets differ on every evaluation
from sklearn.model_selection import KFold, cross_val_score
import numpy as np

# Five folds taken in order (no shuffling); each fold is used once as the test set
cv = KFold(shuffle=False, n_splits=5)

# One accuracy value per fold, plus their average
test_scores = cross_val_score(model, data, y=target, cv=cv)
mean_accuracy = test_scores.mean()

print("Fold Accuracies:", test_scores)
print("Average Accuracy:", mean_accuracy)

Fold Accuracies: [0.75  0.75  1.    1.    0.875]
Average Accuracy: 0.875


In [33]:
# KFold is only one way to run cross-validation; ShuffleSplit is another example.
# ShuffleSplit - the test-set size is given by test_size, so the number of splits
# no longer determines the size of the train and test sets.
from sklearn.model_selection import ShuffleSplit

# Two random 80/20 partitions; random_state pins the shuffling so the scores are repeatable
scv = ShuffleSplit(test_size=0.2, n_splits=2, random_state=0)
stest_scores = cross_val_score(model, data, y=target, cv=scv)

print("Shuffle Accuracies:", stest_scores)

Shuffle Accuracies: [1. 1.]


In [41]:
# scikit-learn pipelines - chain preprocessing and modelling steps into one object to keep the code clean and prevent data leakage
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris

# Step 1: Load the data
# NOTE: this cell rebinds `data` (and, in step 2, `X_train` / `X_test`), which the digits
# cells earlier in the notebook also use. Re-run the digits cells before re-running
# the digits examples, otherwise they will see the iris objects instead.
data = load_iris()
X = data.data
y = data.target

# Step 2: Split the data (80% to train, 20% to test).
# random_state is fixed so the split - and therefore the predictions - are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Step 3: build the pipeline - scaling first, then the classifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression())
])

# Step 4: fit the pipeline (only the training rows are passed in, so the scaler never sees the test rows)
pipeline.fit(X_train, y_train)

# Step 5: make predictions on the held-out rows
predictions = pipeline.predict(X_test)

print("Predictions:", predictions)

Predictions: [0 1 2 0 2 1 1 1 1 2 1 0 0 2 1 2 0 1 2 1 0 0 2 2 0 1 2 1 1 1]


In [None]:
# Underfitting - happens when the model is too simple to learn the data well
    # Reason: Bias
    # It means the model makes strong assumptions and can't capture the true patterns well. ( high bias -> underfitting )
    # To solve this issue - use a more complex model, add more features, reduce regularization, train for longer.
# Overfitting - happens when the model learns the training data too well, including noise, and fails to generalize to new data.
    # Reason: Variance
    # It means the model is too sensitive to the training data and learns even from the noise. ( high variance -> overfitting )
    # To solve this issue - use a simpler model, add more training data, use regularization (l1, l2), drop noisy features.