In [1]:
import numpy as np
import pandas as pd

In [2]:
df=pd.read_csv("covid_toy.csv")
df.head(3)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No


In [5]:
df.isnull().sum()

age           0
gender        0
fever        10
cough         0
city          0
has_covid     0
dtype: int64

In [7]:
from sklearn.impute import SimpleImputer
si=SimpleImputer()
df["fever"]=si.fit_transform(df[["fever"]])


In [8]:
df.isnull().sum()

age          0
gender       0
fever        0
cough        0
city         0
has_covid    0
dtype: int64

In [11]:
import pandas as pd
df = pd.get_dummies(df, columns=["gender", "cough", "city"],drop_first=True)

In [13]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
df["has_covid"]=le.fit_transform(df["has_covid"])

In [15]:
df.head(3)

Unnamed: 0,age,fever,has_covid,gender_Male,cough_Strong,city_Delhi,city_Kolkata,city_Mumbai
0,60,103.0,0,True,False,False,True,False
1,27,100.0,1,True,False,True,False,False
2,42,101.0,0,True,False,True,False,False


In [17]:
x=df.drop(columns="has_covid")
y=df["has_covid"]

In [19]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [23]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()
lr.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
y_pred=lr.predict(x_test)

In [31]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.55

In [33]:
param_grid = [
    {'penalty':['l1','l2','elasticnet','none'],
    'C' : np.logspace(-4,4,20),
    'solver': ['lbfgs','newton-cg','liblinear','sag','saga'],
    'max_iter'  : [10,50,100,150,500,1000,2000,3000]
}
]

#param_grid is just a dictionary of hyperparameters you want GridSearchCV to test.
# 🔹 1. penalty
# Controls regularization type (to prevent overfitting).
# Options:
# 'l1' → Lasso (forces some coefficients to become 0 → feature selection).
# 'l2' → Ridge (shrinks coefficients but doesn’t make them zero).
# 'elasticnet' → Combination of L1 and L2.
# 'none' → No regularization.
# When to tune: If model is overfitting or dataset is high-dimensional.

# You include all penalty options in param_grid because:
# You don’t know beforehand which regularization will give the best performance.
# GridSearchCV will try all valid combinations of penalty + solver + C and find the best model.
#param_grid is just a common convention name


# 2. C
# Inverse of regularization strength (think of it as “tolerance”).
# Small C → Strong regularization → simpler model.
# Large C → Weak regularization → more complex model.
# When to tune: Always, since it directly affects bias-variance tradeoff.

# C=1/lambda
# λ (lambda) = regularization strength
# C = inverse of regularization strength

# 3. solver
# Optimization algorithm used to fit Logistic Regression.
# When to tune: If your dataset is large/multiclass or if you use specific penalties.
# | **Solver**    | **Best For**                              | **Penalty Support**      | **Notes**                                          |
# | ------------- | ----------------------------------------- | ------------------------ | -------------------------------------------------- |
# | **liblinear** | Small datasets, binary/multiclass (OvR)   | L1, L2                   | Slower for large data, but good for L1.            |
# | **lbfgs**     | Larger datasets, multiclass (multinomial) | L2, None                 | Default solver, very stable.                       |
# | **newton-cg** | Large datasets, multiclass (multinomial)  | L2, None                 | Similar to lbfgs, but can be slower.               |
# | **sag**       | Very large datasets with many features    | L2, None                 | Faster than lbfgs on big data, needs data scaling. |
# | **saga**      | Large datasets, supports multinomial      | L1, L2, ElasticNet, None | Most flexible, supports all penalties.             |


# 4. max_iter:
# max_iter sets the maximum number of iterations the optimization algorithm (solver) will run to find the best coefficients.
# Sometimes the model doesn’t converge with default (100).
# Increase it (500, 1000, etc.) when you see a “convergence warning”.



# 5. class_weight 
# class_weight: tells the model how much importance (weight) to give each class while training.
# Deals with imbalanced datasets.
# When to tune: If one class dominates (e.g., 95% No, 5% Yes).
# None (default):
# → All classes are treated equally.
# 'balanced':
# → Model automatically gives more weight to minority classes and less weight to majority classes, based on class frequencies.

In [39]:
from sklearn.model_selection import GridSearchCV
gridsearch = GridSearchCV(lr,param_grid = param_grid, cv = 9, verbose=True,n_jobs=-1)
gridsearch

# verbose=True
# Verbose means: “show me progress messages while running.”
# Higher numbers (verbose=2,3…) → more detailed logs.
# True is same as 1 (basic progress info).

# n_jobs=-1
# n_jobs = how many CPU cores to use for running in parallel.
# -1 means use all available cores → faster training.
# Example: If your laptop has 8 cores, it will use all 8 at once.

# n_jobs=1
# Means use only 1 CPU core for the task.
# safer to use for small machine, uses less resource

In [41]:
bestgridsearch = gridsearch.fit(x,y)
bestgridsearch.best_estimator_

# Why your sir used fit(X, y) instead of fit(X_train, y_train)
# He probably used the entire dataset (X, y) just to demonstrate GridSearchCV.
# Using the full dataset means the model sees all the data during hyperparameter tuning.
# That’s okay for teaching/demo purposes, but in real practice it’s risky → because the model may overfit (learn patterns from test data it should not see).

# Correct practice
# Always split into X_train, X_test, y_train, y_test
# Use only training set for fit()
# bestgridsearch = gridsearch.fit(X_train, y_train)



# It is an attribute of all hyperparameter search objects in scikit-learn, including:
# GridSearchCV
# RandomizedSearchCV
# HalvingGridSearchCV
# HalvingRandomSearchCV

# .best_estimator_ → gives you the final tuned model with the best hyperparameters.

Fitting 9 folds for each of 3200 candidates, totalling 28800 fits


18720 fits failed out of a total of 28800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1440 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\shaba\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\shaba\anaconda3\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\shaba\anaconda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1172, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [47]:
print(f'Accuracy : {bestgridsearch.score(x,y):.3f}')

Accuracy : 0.580


In [None]:
# what is precision,recall,f1_score,confusion_matrix... in which scenario we have to use which type of classification performance evaluation metrics.
# why logistic regression algorithm is considered as logistic regression whenever it is used for classification?