# Scikit-Learn Notebook

In [17]:
!pip install scikit-learn



In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder,MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.datasets import make_classification, make_regression
from sklearn.metrics import accuracy_score, mean_squared_error

## 1) Estimator API workflow

In [2]:
# Synthetic classification data: 300 samples, 4 features (3 informative, none redundant).
# The fixed seed makes the generated arrays identical on every run.
X_cls, y_cls = make_classification(
    n_samples=300,
    n_features=4,
    n_informative=3,
    n_redundant=0,
    random_state=42,
)
(X_cls.shape, y_cls.shape)

((300, 4), (300,))

In [3]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_cls,
    y_cls,
    test_size=0.2,
    random_state=42,
)
(X_train.shape, X_test.shape, y_train.shape)

((240, 4), (60, 4), (240,))

In [4]:
# fit() returns the estimator itself, so construction and training can be chained.
clf = LogisticRegression(max_iter=500).fit(X_train, y_train)
clf

0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. 
seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


In [5]:
# Predict labels for the held-out split and score them against the true labels.
preds = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=preds)

0.9

## 2) Preprocessing: scaling

In [6]:
# Per-feature mean of the raw (unscaled) training data, for comparison after scaling.
np.mean(X_train, axis=0)

array([-0.51795232,  0.11246622,  0.06891482,  0.51664903])

In [7]:
# Learn the scaling parameters from the training split only, then apply them to it.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [8]:
# Reuse the scaler fitted on the training split; the test split is only transformed, never fitted.
X_test_scaled = scaler.transform(X_test)
np.mean(X_train_scaled, axis=0)

array([0.46210489, 0.57945245, 0.55549898, 0.56072348])

## 3) Encoding categorical + numeric together

In [9]:
# Column groups; each group is sent to its own preprocessing step further down.
numeric_cols = ["num1", "num2"]
categorical_cols = ["color"]

# Tiny toy frame mixing two numeric features, one categorical feature and a 0/1 target.
df_mix = pd.DataFrame(
    {
        "num1": [1.0, 2.5, 3.2, 0.7],
        "num2": [10, 20, 10, 30],
        "color": ["red", "blue", "red", "green"],
        "label": [0, 1, 0, 1],
    }
)

# Features are the numeric columns followed by the categorical one; the target is "label".
X = df_mix[numeric_cols + categorical_cols]
y = df_mix["label"]


In [10]:
# One small pipeline per column type: standardize numerics, one-hot encode categoricals.
numeric_pipe = Pipeline(steps=[("scaler", StandardScaler())])
categorical_pipe = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

In [11]:
# Route each column group through its own sub-pipeline.
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, numeric_cols),
        ("cat", categorical_pipe, categorical_cols),
    ]
)

# Preprocessing and the classifier chained into a single estimator.
clf_pipe = Pipeline(
    steps=[
        ("prep", preprocess),
        ("model", LogisticRegression(max_iter=500)),
    ]
)

In [12]:
# Fit and predict on the same four rows: this demonstrates the API, not generalization.
clf_pipe.fit(X, y).predict(X)

array([0, 1, 0, 1])

## 4) Regression example

In [13]:
# Synthetic regression data: 200 samples, 3 features, noise level 5.0, fixed seed.
X_reg, y_reg = make_regression(
    n_samples=200,
    n_features=3,
    noise=5.0,
    random_state=42,
)

# 80/20 train/test split with a fixed seed.
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)


In [14]:
# Linear regression with default settings, trained on the regression training split.
reg = LinearRegression()
reg.fit(X=Xr_train, y=yr_train)

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [15]:
reg_preds = reg.predict(Xr_test)
# Square root of the mean squared error on the test split, i.e. the RMSE.
mean_squared_error(y_true=yr_test, y_pred=reg_preds) ** 0.5

5.758118776795974

## 5) Cross-validation

In [16]:
# 5-fold cross-validation of a fresh LogisticRegression over the full classification dataset.
cv_scores = cross_val_score(
    estimator=LogisticRegression(max_iter=500),
    X=X_cls,
    y=y_cls,
    cv=5,
)
print(cv_scores)
cv_scores.mean()

[0.83333333 0.86666667 0.8        0.88333333 0.85      ]


np.float64(0.8466666666666667)

## 6) GridSearchCV: Hyperparameter Tuning

In [17]:
# Hyperparameter grid for LogisticRegression: 4 x 2 x 2 = 16 candidate settings.
# NOTE(review): the estimator docs shown earlier in this notebook mark `penalty` as
# deprecated in favour of `l1_ratio`; the key is kept because the results table below
# reads the `param_penalty` column.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l2', 'l1'],
    'max_iter': [100, 500]
}

# liblinear uses random_state to shuffle the data, so the seed is pinned here to
# make the search reproducible from one run to the next.
grid_search = GridSearchCV(
    estimator=LogisticRegression(solver='liblinear', random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy'
)

# Fit on the training split only; each candidate is scored with 5-fold CV accuracy.
grid_search.fit(X_train, y_train)

# Display best parameters and score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best CV Score: {grid_search.best_score_:.4f}")

Best Parameters: {'C': 0.1, 'max_iter': 100, 'penalty': 'l1'}
Best CV Score: 0.8333




In [18]:
# Get best model and make predictions
# Take the best estimator found by the search and evaluate it on the held-out test split.
best_model = grid_search.best_estimator_
best_preds = best_model.predict(X=X_test)
print(f"Test Accuracy with Best Model: {accuracy_score(y_test, best_preds):.4f}")

Test Accuracy with Best Model: 0.9167


In [19]:
# View all results as DataFrame
# Full cross-validation log of the search, one row per parameter combination.
results_df = pd.DataFrame(grid_search.cv_results_)

# Five best combinations, ranked by mean cross-validated score.
(
    results_df[["param_C", "param_penalty", "param_max_iter", "mean_test_score"]]
    .sort_values(by="mean_test_score", ascending=False)
    .head(5)
)

Unnamed: 0,param_C,param_penalty,param_max_iter,mean_test_score
1,0.1,l1,100,0.833333
3,0.1,l1,500,0.833333
5,1.0,l1,100,0.829167
7,1.0,l1,500,0.829167
0,0.1,l2,100,0.820833
