# Scikit-Learn Commands — One per line with purpose
This notebook-style script lists commonly used scikit-learn commands, one per line, each with an inline comment describing its purpose. It follows the same `#%% md` and `#%%` cell structure as the other study notebooks.


# Setup: imports and sample data

In [None]:
import numpy as np  # array/numeric toolkit used by later cells
from sklearn import set_config  # global display/configuration switch
from sklearn.datasets import (  # bundled and synthetic toy datasets
    load_diabetes,
    load_iris,
    make_classification,
    make_regression,
)
from sklearn.model_selection import train_test_split  # hold-out splitting helper

# Shared example data reused by the cells below
iris = load_iris()  # Bunch holding the classic iris multiclass data
X_iris = iris.data  # iris feature matrix
y_iris = iris.target  # iris class labels

X_cls, y_cls = make_classification(
    n_samples=200, n_features=10, n_informative=5, random_state=42
)  # synthetic binary classification problem
X_reg, y_reg = make_regression(
    n_samples=200, n_features=10, noise=5.0, random_state=42
)  # synthetic regression problem

X_train, X_test, y_train, y_test = train_test_split(
    X_cls, y_cls, test_size=0.2, random_state=42
)  # hold out 20% of the classification data for testing


# Datasets: loaders, fetchers, and generators
# This section shows how to load small built-in datasets, fetch larger ones (may download), and generate synthetic datasets.

In [None]:
from sklearn.datasets import (  # loaders for datasets shipped with scikit-learn
    load_breast_cancer,
    load_diabetes as _load_diabetes,  # bound under a private alias within this cell
    load_digits,
    load_linnerud,
    load_wine,
)

wine = load_wine(as_frame=True)  # Wine data as a Bunch exposing .data/.target/.frame
wine.frame.head()  # first rows of the combined DataFrame (available because as_frame=True)

X_wine, y_wine = load_wine(return_X_y=True)  # same data returned directly as (features, target) arrays
breast = load_breast_cancer(as_frame=True)  # breast cancer data (binary classification)
digits = load_digits()  # 8x8 digit images (classification)
diabetes_bunch = _load_diabetes(as_frame=True)  # diabetes progression data (regression)
linnerud = load_linnerud(as_frame=True)  # small multivariate exercise dataset

# Bunch objects also carry metadata alongside the data
wine.DESCR  # full text description of the dataset
wine.feature_names  # names of the feature columns


# Dataset fetchers (may download data — commented out by default)
# Uncomment the following lines in an interactive environment with internet access.

In [None]:
# from sklearn.datasets import fetch_20newsgroups, fetch_20newsgroups_vectorized  # text datasets
# from sklearn.datasets import fetch_california_housing  # tabular regression dataset
# from sklearn.datasets import fetch_openml, get_data_home, clear_data_home  # OpenML interface and data cache utils

# fetch_20newsgroups(subset='train', categories=['sci.space', 'rec.sport.baseball'], remove=('headers', 'footers', 'quotes'))  # fetch subset of 20NG text
# fetch_20newsgroups_vectorized(subset='train')  # pre-vectorized (tf-idf) 20NG features
# fetch_california_housing(as_frame=True)  # California housing regression dataset as DataFrame
# fetch_openml(name='titanic', version=1, as_frame=True)  # fetch dataset by name from OpenML (requires internet)
# fetch_openml(data_id=61, as_frame=True)  # fetch dataset by ID from OpenML
# get_data_home()  # show local scikit-learn data cache directory
# clear_data_home()  # delete all cached datasets (use with caution)


# Synthetic dataset generators (quickly create toy data)

In [None]:
from sklearn.datasets import (  # generators for synthetic toy data
    make_blobs,
    make_circles,
    make_moons,
)

X_blob, y_blob = make_blobs(
    n_samples=200, centers=3, cluster_std=1.2, random_state=42
)  # three isotropic Gaussian blobs, handy for clustering demos
X_moon, y_moon = make_moons(
    n_samples=200, noise=0.2, random_state=42
)  # two interleaved half-circles (binary classification)
X_circ, y_circ = make_circles(
    n_samples=200, factor=0.5, noise=0.05, random_state=42
)  # one circle nested inside another (binary classification)


# Global configuration and inspection

In [None]:
# Render estimators and pipelines as diagrams where rich display is supported
set_config(display='diagram')


# Model selection: splitting and cross-validation

In [None]:
from sklearn.linear_model import LogisticRegression  # concrete estimator for the CV examples below
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold  # CV splitters
from sklearn.model_selection import cross_val_score, cross_validate  # cross-validation helpers
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV  # hyperparameter search
from sklearn.model_selection import StratifiedShuffleSplit, TimeSeriesSplit  # specialized splitters

# cross_val_score / cross_validate fit the estimator as soon as they are called,
# so a `None` placeholder raises; use a real classifier whose `C` parameter also
# matches the search grids below.
cv_estimator = LogisticRegression(max_iter=1000)  # shared estimator for the CV and search examples

KFold(n_splits=5, shuffle=True, random_state=42)  # K-fold CV splitter for general tasks
StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # preserves label distribution across folds
GroupKFold(n_splits=3)  # ensures samples with the same group are in the same fold
cross_val_score(estimator=cv_estimator, X=X_iris, y=y_iris, scoring=None, cv=5)  # per-fold CV scores (scoring=None uses the estimator's own score method)
cross_validate(estimator=cv_estimator, X=X_iris, y=y_iris, scoring=['accuracy', 'f1_macro'], cv=5, return_train_score=False)  # multiple metrics CV
GridSearchCV(estimator=cv_estimator, param_grid={'C': [0.1, 1, 10]}, cv=5, n_jobs=None)  # exhaustive search over param grid (not fitted here)
RandomizedSearchCV(estimator=cv_estimator, param_distributions={'C': [0.1, 1, 10]}, n_iter=5, cv=5, random_state=42)  # random hyperparam search (not fitted here)
StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)  # random stratified splits
TimeSeriesSplit(n_splits=5)  # splitter for time-ordered data


# Preprocessing: scaling, normalization, encoding, transforms

In [None]:
from sklearn.preprocessing import (  # scalers, encoders, feature builders, transformers
    FunctionTransformer,
    KBinsDiscretizer,
    MinMaxScaler,
    Normalizer,
    OneHotEncoder,
    OrdinalEncoder,
    PolynomialFeatures,
    PowerTransformer,
    QuantileTransformer,
    RobustScaler,
    StandardScaler,
)

StandardScaler()  # center each feature to zero mean and scale to unit variance
MinMaxScaler()  # rescale each feature into a fixed range ([0, 1] unless overridden)
RobustScaler()  # outlier-resistant scaling based on median and IQR
Normalizer(norm='l2')  # rescale every sample (row) to unit L2 norm
OneHotEncoder(handle_unknown='ignore', sparse_output=False)  # dense one-hot encoding of categorical columns
OrdinalEncoder()  # map categories to integer codes
PolynomialFeatures(degree=2, include_bias=False)  # add degree-2 polynomial and interaction terms
KBinsDiscretizer(n_bins=5, encode='onehot', strategy='quantile')  # bin continuous features into 5 quantile bins
FunctionTransformer(np.log1p, feature_names_out='one-to-one')  # expose a plain function (here log1p) as a transformer
PowerTransformer(method='yeo-johnson')  # power transform to stabilize variance / approach normality
QuantileTransformer(output_distribution='normal', random_state=42)  # map features onto a target distribution via quantiles


# Imputation for missing data

In [None]:
from sklearn.impute import (  # missing-value imputers
    KNNImputer,
    SimpleImputer,
)

SimpleImputer(strategy='mean')  # fill missing numeric entries with the column mean
KNNImputer(n_neighbors=5)  # fill missing entries from the 5 nearest neighbors


# Feature selection and dimensionality reduction

In [None]:
from sklearn.decomposition import NMF, PCA, TruncatedSVD  # matrix decomposition methods
from sklearn.feature_selection import (  # univariate and model-driven selectors
    RFE,
    RFECV,
    SelectFromModel,
    SelectKBest,
    VarianceThreshold,
    chi2,
    f_classif,
)

SelectKBest(score_func=f_classif, k=10)  # keep the 10 features with the best ANOVA F-score
SelectKBest(score_func=chi2, k=10)  # keep the 10 best features by chi-squared (features must be non-negative)
VarianceThreshold(threshold=0.0)  # drop features whose variance does not exceed the threshold
RFE(estimator=None, n_features_to_select=5)  # recursive feature elimination around a base estimator
RFECV(estimator=None, step=1, cv=5)  # RFE that picks the number of features by cross-validation
SelectFromModel(estimator=None, threshold='median')  # keep features by model importance/coefficients
PCA(n_components=2, random_state=42)  # principal component analysis down to 2 components
TruncatedSVD(n_components=2, random_state=42)  # SVD suited to sparse or non-centered data
NMF(n_components=2, init='random', random_state=42)  # non-negative matrix factorization


# Composition: pipelines and column-wise processing

In [None]:
from sklearn.compose import ColumnTransformer, make_column_selector  # per-column transformation tools
from sklearn.pipeline import Pipeline, make_pipeline  # estimator chaining

Pipeline(
    steps=[('scale', StandardScaler()), ('model', None)]
)  # named, ordered steps: preprocessing followed by a model slot
make_pipeline(StandardScaler())  # same idea with auto-generated step names
ColumnTransformer(
    transformers=[('num', StandardScaler(), make_column_selector(dtype_include=np.number))]
)  # route columns to transformers according to dtype
make_column_selector(pattern='^feat_')  # pick columns whose names match a regex


# Linear models (classification/regression)

In [None]:
from sklearn.linear_model import (  # core linear models plus online/large-scale learners
    ElasticNet,
    Lasso,
    LinearRegression,
    LogisticRegression,
    PassiveAggressiveClassifier,
    Perceptron,
    Ridge,
    SGDClassifier,
    SGDRegressor,
)

LogisticRegression(max_iter=1000, random_state=42)  # regularized classifier for binary or multinomial targets
LinearRegression()  # plain ordinary-least-squares regression
Ridge(alpha=1.0, random_state=42)  # regression with an L2 penalty
Lasso(alpha=0.001, random_state=42)  # regression with an L1 penalty (yields sparse coefficients)
ElasticNet(alpha=0.001, l1_ratio=0.5, random_state=42)  # regression mixing L1 and L2 penalties
SGDClassifier(loss='log_loss', max_iter=1000, random_state=42)  # linear classifier fitted by stochastic gradient descent
SGDRegressor(max_iter=1000, random_state=42)  # linear regressor fitted by stochastic gradient descent
Perceptron(max_iter=1000, random_state=42)  # linear classifier using the perceptron update rule
PassiveAggressiveClassifier(max_iter=1000, random_state=42)  # online large-margin linear classifier


# Support Vector Machines

In [None]:
from sklearn.svm import (  # support vector machine estimators
    SVC,
    SVR,
    LinearSVC,
    LinearSVR,
)

SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, random_state=42)  # RBF-kernel classifier with probability estimates enabled
SVR(kernel='rbf', C=1.0, gamma='scale')  # RBF-kernel regressor
LinearSVC(C=1.0, random_state=42)  # linear-kernel classifier (efficient with many features)
LinearSVR(C=1.0, random_state=42)  # linear-kernel regressor


# Tree-based models and ensembles

In [None]:
from sklearn.ensemble import (  # forests, extra-trees and gradient boosting
    ExtraTreesClassifier,
    ExtraTreesRegressor,
    GradientBoostingClassifier,
    GradientBoostingRegressor,
    HistGradientBoostingClassifier,
    HistGradientBoostingRegressor,
    RandomForestClassifier,
    RandomForestRegressor,
)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor  # single decision trees

DecisionTreeClassifier(random_state=42)  # single tree for non-linear classification
DecisionTreeRegressor(random_state=42)  # single tree for non-linear regression
RandomForestClassifier(n_estimators=200, random_state=42)  # 200-tree forest for classification
RandomForestRegressor(n_estimators=200, random_state=42)  # 200-tree forest for regression
GradientBoostingClassifier(random_state=42)  # boosts weak learners one after another (classification)
GradientBoostingRegressor(random_state=42)  # boosts weak learners one after another (regression)
ExtraTreesClassifier(n_estimators=200, random_state=42)  # extremely randomized trees for classification
ExtraTreesRegressor(n_estimators=200, random_state=42)  # extremely randomized trees for regression
HistGradientBoostingClassifier(random_state=42)  # histogram-based (fast) gradient boosting classifier
HistGradientBoostingRegressor(random_state=42)  # histogram-based (fast) gradient boosting regressor


# Neighbors and Naive Bayes

In [None]:
from sklearn.naive_bayes import (  # naive Bayes variants
    BernoulliNB,
    GaussianNB,
    MultinomialNB,
)
from sklearn.neighbors import (  # nearest-neighbor estimators
    KNeighborsClassifier,
    KNeighborsRegressor,
    NearestNeighbors,
)

KNeighborsClassifier(n_neighbors=5)  # classify from the 5 nearest neighbors
KNeighborsRegressor(n_neighbors=5)  # regress from the 5 nearest neighbors
NearestNeighbors(n_neighbors=5)  # unsupervised neighbor lookup
GaussianNB()  # naive Bayes for continuous features (Gaussian likelihood)
MultinomialNB(alpha=1.0)  # naive Bayes for count features such as text term counts
BernoulliNB(alpha=1.0)  # naive Bayes for binary/boolean features


# Clustering

In [None]:
from sklearn.cluster import (  # clustering algorithms
    AgglomerativeClustering,
    DBSCAN,
    KMeans,
    SpectralClustering,
)

KMeans(n_clusters=3, n_init='auto', random_state=42)  # k-means partitioning into 3 clusters
DBSCAN(eps=0.5, min_samples=5)  # density-based clustering; handles arbitrarily shaped clusters
AgglomerativeClustering(n_clusters=3, linkage='ward')  # hierarchical clustering with Ward linkage
SpectralClustering(n_clusters=3, random_state=42)  # graph-based clustering on a spectral embedding


# Manifold learning and embedding

In [None]:
from sklearn.manifold import (  # non-linear embedding methods
    Isomap,
    TSNE,
)

TSNE(n_components=2, perplexity=30, random_state=42)  # 2-D t-SNE embedding for visualization (slow on large data)
Isomap(n_neighbors=5, n_components=2)  # isometric mapping for non-linear dimensionality reduction


# Calibration, probability and decision functions

In [None]:
from sklearn.calibration import CalibratedClassifierCV  # probability calibration
from sklearn.preprocessing import Binarizer  # thresholding transformer

# The keyword is `estimator`: scikit-learn 1.2 renamed it from `base_estimator`,
# and the old name was later removed, so passing it fails at construction on
# current releases.
CalibratedClassifierCV(estimator=None, cv=5, method='isotonic')  # calibrate classifier probabilities (None lets scikit-learn use its default classifier)
Binarizer(threshold=0.0)  # binarize numeric features using a threshold


# Metrics: classification, regression, clustering

In [None]:
from sklearn.metrics import (  # classification, ranking, regression and clustering metrics
    accuracy_score,
    average_precision_score,
    calinski_harabasz_score,
    classification_report,
    confusion_matrix,
    davies_bouldin_score,
    f1_score,
    mean_absolute_error,
    mean_squared_error,
    precision_recall_curve,
    precision_score,
    r2_score,
    recall_score,
    roc_auc_score,
    roc_curve,
    silhouette_score,
    davies_bouldin_score,
)

# Classification metrics (labels are compared with themselves purely to show the call shape)
accuracy_score(y_test, y_test)  # fraction of predictions that are correct
precision_score(y_test, y_test, average='binary')  # precision of the positive class (binary case)
recall_score(y_test, y_test, average='binary')  # recall of the positive class (binary case)
f1_score(y_test, y_test, average='binary')  # harmonic mean of precision and recall
roc_auc_score(y_test, y_test)  # area under the ROC curve (pass scores/probabilities in real use)
roc_curve(y_test, y_test)  # ROC points: FPR, TPR, thresholds
precision_recall_curve(y_test, y_test)  # PR points: precision, recall, thresholds
average_precision_score(y_test, y_test)  # average precision, a summary of the PR curve
confusion_matrix(y_test, y_test)  # counts of true vs predicted labels
classification_report(y_test, y_test)  # per-class precision/recall/F1 as text

# Regression metrics
mean_squared_error(y_reg, y_reg)  # mean squared error
mean_absolute_error(y_reg, y_reg)  # mean absolute error
r2_score(y_reg, y_reg)  # coefficient of determination R^2

# Clustering metrics (each takes the data plus a label per sample)
silhouette_score(X_iris, iris.target)  # cohesion vs separation of the labelled groups
calinski_harabasz_score(X_iris, iris.target)  # variance ratio criterion
davies_bouldin_score(X_iris, iris.target)  # average similarity between clusters


# Model fitting, predicting, scoring (example pattern)

In [None]:
from sklearn.linear_model import LogisticRegression  # estimator used for this walkthrough

clf = LogisticRegression(max_iter=1000, random_state=42)  # create the classifier
clf.fit(X_train, y_train)  # learn from the training split
clf.predict(X_test)  # class labels for the test split
clf.predict_proba(X_test)  # per-class probabilities for the test split
clf.score(X_test, y_test)  # mean accuracy on the test split
clf.get_params()  # current hyperparameters as a dict
clf.set_params(C=0.5)  # change hyperparameters in place


# Inspection and explainability helpers

In [None]:
from sklearn.inspection import (  # model inspection helpers
    PartialDependenceDisplay,  # partial dependence plots
    permutation_importance,  # model-agnostic feature importance
)

permutation_importance(
    clf, X_test, y_test, n_repeats=5, random_state=42
)  # importance of each feature measured by permuting it (5 repeats)
# PartialDependenceDisplay.from_estimator(clf, X_test, features=[0, 1])  # plot PDP for features (uncomment in notebooks)


# Text and feature extraction basics

In [None]:
from sklearn.feature_extraction import DictVectorizer  # dicts -> feature matrix
from sklearn.feature_extraction.text import (  # text -> feature matrix
    CountVectorizer,
    TfidfVectorizer,
)

CountVectorizer(max_features=5000)  # token-count features from a text corpus (vocabulary capped at 5000)
TfidfVectorizer(max_features=5000)  # TF-IDF features from a text corpus (vocabulary capped at 5000)
DictVectorizer(sparse=True)  # feature matrix from a list of dict-like mappings


# Pipeline: typical end-to-end example (pattern)

In [None]:
from sklearn.pipeline import make_pipeline  # builds a Pipeline with auto-named steps
from sklearn.preprocessing import StandardScaler  # preprocessing step
from sklearn.svm import SVC  # final estimator

make_pipeline(
    StandardScaler(),
    SVC(probability=True, random_state=42),
)  # scaling chained into an SVM classifier


# ColumnTransformer: heterogeneous data processing (pattern)

In [None]:
import pandas as pd  # supplies named columns for the example
from sklearn.compose import ColumnTransformer  # per-column transformer
from sklearn.preprocessing import OneHotEncoder  # categorical encoder

# Tiny frame mixing one numeric and one categorical column
df_example = pd.DataFrame({'num1': [0.5, 1.0, 1.5], 'cat1': ['a', 'b', 'a']})

ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['num1']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['cat1']),
    ]
)  # scale the numeric column and one-hot encode the categorical one


# Clustering usage pattern (fit/predict-like)

In [None]:
from sklearn.cluster import KMeans  # estimator used for this walkthrough

km = KMeans(n_clusters=3, n_init='auto', random_state=42)  # create the k-means model
km.fit(X_iris)  # estimate cluster centers from the iris features
km.labels_  # cluster index assigned to each training sample
km.cluster_centers_  # coordinates of the fitted centers


# Persistence (save/load models) — commented to avoid I/O side effects

In [None]:
# import joblib  # persistence utility for models
# joblib.dump(clf, 'model.joblib')  # save trained model to disk
# joblib.load('model.joblib')  # load model from disk


# Utilities and miscellaneous

In [None]:
from sklearn.utils import (  # array utilities
    resample,
    shuffle,
)
from sklearn.utils.estimator_checks import check_estimator  # estimator API conformance checks (advanced)

shuffle(X_cls, y_cls, random_state=42)  # permute both arrays with the same ordering
resample(X_cls, n_samples=100, replace=True, random_state=42)  # bootstrap: draw 100 rows with replacement
# check_estimator(LogisticRegression())  # run scikit-learn estimator checks (slow; uncomment for advanced use)


# Time series and groups (splitting utilities examples)

In [None]:
from sklearn.model_selection import (  # splitters that respect group membership
    GroupShuffleSplit,
    LeaveOneGroupOut,
)

GroupShuffleSplit(n_splits=3, test_size=0.2, random_state=42)  # randomized splits that respect group boundaries
LeaveOneGroupOut()  # leave-one-group-out cross-validation splitter


# Notes
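1. Some constructors take an estimator; the examples pass `None` as a placeholder, which works only because scikit-learn constructors defer validation until `fit`. Functions that run immediately, such as `cross_val_score` and `cross_validate`, need a real estimator.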
2. Plotting and disk I/O are commented to keep this script safe to run end-to-end.
3. Replace the example dataset variables (e.g. `X_iris`, `y_iris`, `X_cls`, `y_cls`) with your own data and uncomment relevant lines in a Jupyter environment for interactive exploration.
4. Dataset fetchers that require network access (`fetch_*`, `fetch_openml`) are commented out to avoid downloads; uncomment when online.
