In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# test classification dataset
from collections import Counter
from sklearn.datasets import make_classification

from numpy import mean
from numpy import std
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold

In [2]:
from sklearn.datasets import load_iris
# Registry of fitted estimators keyed by a short label; later cells add to it.
models = {}
# Bundled iris dataset (features in `.data`, class labels in `.target`).
iris = load_iris()

In [3]:


# Target: the class label of every sample.
y = iris.target
# Features: keep only the first two columns of the feature matrix.
X = iris.data[:, :2]

In [4]:
# Hold out 25% of the samples for testing; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [5]:
# Multinomial (softmax) logistic regression fitted with the SAGA solver.
# SAGA shuffles the data while optimising, so random_state is pinned to keep
# the fitted coefficients -- and every score derived from them -- reproducible.
multinomial_model = LogisticRegression(
    multi_class='multinomial',
    solver='saga',
    random_state=42,
)

In [6]:
# Fit the model to the training portion of the split (75% of the samples).
multinomial_model.fit(X_train, y_train)



In [7]:
# Evaluate the model on the held-out testing data and report its accuracy.
score = multinomial_model.score(X_test, y_test)
# NOTE(review): registration in `models` is disabled, so this estimator is not
# part of the later compare_models() chart -- confirm that is intended.
#models['multinomial-simple'] = multinomial_model
print('The accuracy of the model is:', score)

The accuracy of the model is: 0.868421052631579


In [8]:
# Predict the class of one unseen sample. The sample is written directly as a
# single-row 2-D array (1 sample x 2 features), which is the shape predict()
# is given here.
# NOTE(review): the model was trained on the first two iris columns, in the
# order [sepal length, sepal width]; 3.5 / 5.0 look swapped compared with
# typical rows (e.g. 5.1, 3.5) -- confirm the intended order.
new_data = np.array([[3.5, 5.0]])
prediction = multinomial_model.predict(new_data)
print('The predicted class is:', prediction)

The predicted class is: [0]


In [9]:
# Tabular view of the full dataset: one column per feature plus the integer
# class label in `species`.
iris_data = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    species=iris.target
)



In [10]:
# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Score `model` on (X, y) with repeated stratified k-fold cross-validation.

    The data is split into 10 stratified folds and the whole procedure is
    repeated 3 times with a fixed seed (random_state=1), using all available
    cores (n_jobs=-1).

    Args:
        model: Estimator to evaluate.
        X: Feature matrix.
        y: Target labels.

    Returns:
        The accuracy scores produced by cross_val_score, one per fold.
    """
    splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(model, X, y, scoring='accuracy', cv=splitter, n_jobs=-1)

In [11]:
import plotly.graph_objects as go
def compare_models(models, X, y):
    """Cross-validate every model and chart the mean accuracy of each.

    Args:
        models: Mapping of display name -> estimator.
        X: Feature matrix passed to evaluate_model.
        y: Target labels passed to evaluate_model.

    Returns:
        A plotly Figure holding one bar per model; the list of mean scores
        is also printed.
    """
    names = list(models.keys())

    # Mean cross-validated accuracy per model, in the mapping's own order.
    scores = []
    for estimator in models.values():
        fold_scores = evaluate_model(estimator, X, y)
        scores.append(mean(fold_scores))
    print(scores)

    bars = go.Bar(x = names, y = scores, name="Scores")
    return go.Figure([bars])

In [12]:
# Show the assembled frame; the rendered output elides the middle rows.
iris_data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [13]:
# Column names used to select the model inputs and the label from `iris_data`.
features = iris.feature_names
target = 'species'

# Distinct class labels present in the data. Kept as the LAST expression so the
# notebook actually renders it -- a bare expression earlier in a cell is
# evaluated and then silently discarded.
iris_data[target].unique()

In [14]:
# Label column and feature columns, both selected from `iris_data` by name.
y_iris = iris_data.loc[:, target]
X_iris = iris_data.loc[:, features]

In [15]:

from sklearn import datasets,metrics
# Baseline: LogisticRegression with default settings, fitted on the two-feature
# data and then asked to predict that same data (training-set predictions).
lr_iris = LogisticRegression().fit(X, y)
models['simple'] = lr_iris
y_pred = lr_iris.predict(X)
print(y_pred)



[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 2 1 2 1 2 1 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1
 2 2 2 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 1 2 2 2 2
 2 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 1 2 2 2 2 2 2 1 2 2 2 1 2 2 2 1 2 2 2 1 2
 2 1]


In [16]:
# Training accuracy of the baseline model. accuracy_score's signature is
# (y_true, y_pred), so the ground-truth labels go first.
print(metrics.accuracy_score(y, y_pred))

0.82


In [17]:
def calc_accuracy(y_pred,y):
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_pred: Predicted labels (any array-like).
        y: True labels, holding the same number of entries as `y_pred`.

    Returns:
        The accuracy in [0, 1], or NaN when both inputs are empty.

    Raises:
        ValueError: If the inputs hold a different number of labels.
    """
    # Flatten so (n,) and (n, 1) label arrays are compared element by element
    # instead of being broadcast against each other.
    predicted = np.asarray(y_pred).ravel()
    actual = np.asarray(y).ravel()

    # Refuse mismatched inputs rather than scoring a truncated pairing.
    if predicted.size != actual.size:
        raise ValueError(
            "y_pred and y must contain the same number of labels "
            "(got %d and %d)" % (predicted.size, actual.size)
        )

    # No labels means the accuracy is undefined.
    if actual.size == 0:
        return float('nan')

    return np.mean(predicted == actual)
# Cross-check: the hand-rolled accuracy for the same `y_pred` / `y` pair.
print(calc_accuracy(y_pred,y))

0.82


In [18]:
# One-vs-rest variant of the classifier (multi_class='ovr').
ovr_iris = LogisticRegression(multi_class='ovr')
ovr_iris = ovr_iris.fit(X, y)
y_pred_ovr = ovr_iris.predict(X)
models['ovr'] = ovr_iris
# accuracy_score expects (y_true, y_pred): ground truth first.
print(metrics.accuracy_score(y, y_pred_ovr))

0.8066666666666666




In [19]:
# Cross-check: the hand-rolled accuracy for the one-vs-rest predictions.
print(calc_accuracy(y_pred_ovr,y))

0.8066666666666666


In [20]:
# Multinomial model fitted with the newton-cg solver.
multi_iris = LogisticRegression(multi_class='multinomial',solver='newton-cg')
multi_iris = multi_iris.fit(X, y)
y_pred_multi = multi_iris.predict(X)
models['newton-cg'] = multi_iris
# accuracy_score expects (y_true, y_pred): ground truth first.
print(metrics.accuracy_score(y, y_pred_multi))


0.82




In [21]:
# Multinomial model fitted with another solver: lbfgs.

multi_lbfgs_iris = LogisticRegression(multi_class='multinomial',solver='lbfgs')
multi_lbfgs_iris = multi_lbfgs_iris.fit(X, y)
y_pred_multi_2 = multi_lbfgs_iris.predict(X)
models['lbfgs'] = multi_lbfgs_iris
# accuracy_score expects (y_true, y_pred): ground truth first.
print(metrics.accuracy_score(y, y_pred_multi_2))

0.82




In [22]:
# Another solver with multinomial: sag (not saga -- the model is registered
# under the 'sag' key and built with solver='sag').
# sag shuffles the data while optimising, so random_state is pinned to make
# the reported accuracy reproducible.
multi_sag_iris = LogisticRegression(
    multi_class='multinomial',
    solver='sag',
    random_state=42,
)
multi_sag_iris = multi_sag_iris.fit(X, y)
y_pred_multi_sag = multi_sag_iris.predict(X)
models['sag'] = multi_sag_iris
# accuracy_score expects (y_true, y_pred): ground truth first.
print(metrics.accuracy_score(y, y_pred_multi_sag))



0.8133333333333334




In [23]:
# Sanity check: how many labels are in `y`.
y.shape

(150,)

In [24]:
# Cross-validate every registered model and chart the mean accuracy per model.
fig = compare_models(models, X, y)
# Fix the y-axis to [0, 1] and label it.
fig.update_yaxes(range=[0,1], title="scores")

[0.8133333333333334, 0.8000000000000002, 0.8133333333333334, 0.8133333333333334, 0.8022222222222223]




Next we use GridSearchCV: it tries every combination of hyperparameter values in a grid, scores each combination with k-fold cross-validation, and reports the combination with the best mean score.


In [25]:
from sklearn.model_selection import GridSearchCV
import time

# Candidate values for each hyperparameter explored by the grid search.
penalty = ['l1', 'l2']
max_iter = [80, 100, 140]
# Five evenly spaced values from 0.1 to 1.0 inclusive.
C = np.linspace(0.1, 1.0, 5)



In [26]:
# Every combination of these values is scored with 5-fold cross-validation.
param_grid = dict(max_iter=max_iter, C=C, penalty=penalty)

# SAGA shuffles the data while optimising, so random_state is pinned; without
# it the winning parameter combination can differ from run to run.
# NOTE(review): the recorded run emitted "max_iter was reached" warnings, i.e.
# these iteration budgets were too small for the solver to converge on this
# data -- consider standardising the features or widening `max_iter`.
lr_iris = LogisticRegression(
    multi_class='multinomial',
    solver='saga',
    random_state=42,
)

grid = GridSearchCV(estimator=lr_iris, param_grid=param_grid, cv = 5)

grid_result = grid.fit(X, y)






The max_iter was reached which means the coef_ did not converge




The max_iter was reached which means the coef_ did not converge




The max_iter was reached which means the coef_ did not converge




The max_iter was reached which means the coef_ did not converge




The max_iter was reached which means the coef_ did not converge




The max_iter was reached which means the coef_ did not converge




The max_iter was reached which means the coef_ did not converge




The max_iter was reached which means the coef_ did not converge




The max_iter was reached which means the coef_ did not converge




The max_iter was reached which means the coef_ did not converge




The max_iter was reached which means the coef_ did not converge




The max_iter was reached which means the coef_ did not converge




The max_iter was reached which means the coef_ did not converge




The max_iter was reached which means the coef_ did not converge




The max_iter was reached which means the coef

In [27]:
# Summarize results: the best score found by the grid search and the
# parameter combination that achieved it.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.833333 using {'C': 0.55, 'max_iter': 80, 'penalty': 'l1'}


In [28]:
# Refit with the parameters the grid search actually selected. Hard-coded
# values drift out of sync with the search: the recorded run reported
# C=0.55, max_iter=80, penalty='l1', not C=0.325, max_iter=100, penalty='l2'.
best_iris = LogisticRegression(
    multi_class='multinomial',
    solver='saga',
    **grid_result.best_params_,
)
best_iris = best_iris.fit(X, y)
y_pred_best = best_iris.predict(X)






The max_iter was reached which means the coef_ did not converge



In [29]:
# Training accuracy of the tuned model. accuracy_score's signature is
# (y_true, y_pred), so the ground-truth labels go first.
print(metrics.accuracy_score(y, y_pred_best))

0.82


In [30]:
#what is the log loss function and how does it work?
from sklearn.metrics import log_loss

# Ground-truth class labels for two samples.
y_true = [1, 0]

# Predicted probabilities, one value per sample.
y_pred = [0.6, 0.4]

# Cross-entropy (log) loss of the predictions against the labels.
loss = log_loss(y_true, y_pred)

# Show the resulting loss value.
print(loss)

0.5108256237659907


In [31]:
# how does the cross entropy loss work? 
import numpy as np

def cross_entropy_loss(y_true, y_pred):
  """Compute the cross-entropy between targets and predicted probabilities.

  Intermediate values are printed so each step of the calculation is visible.

  Args:
    y_true: Target weights (e.g. a one-hot vector), same shape as y_pred.
    y_pred: Predicted probabilities; clipped to [1e-15, 1 - 1e-15] first.

  Returns:
    -sum(y_true * log(y_pred)) evaluated on the clipped predictions.
  """
  eps = 1e-15

  # Keep every probability strictly inside (0, 1) so log() stays finite.
  clipped = np.clip(y_pred, eps, 1 - eps)
  print(clipped)

  log_probs = np.log(clipped)
  print("log of pred y: ", log_probs)

  # Entries whose target weight is 0 drop out of the sum.
  weighted = y_true * log_probs
  print("y_true * log(y_pred)", weighted)

  return -np.sum(weighted)

# Example usage: one target vector and one vector of predicted probabilities.

y_true = np.array([1, 0, 0])  # only the first entry is non-zero
y_pred = np.array([0.700001, 0.200000000001, 0.10000009])

loss = cross_entropy_loss(y_true, y_pred)

# Only the first term survives the multiplication by y_true, so the printed
# loss equals -log(0.700001).
print(loss)


[0.700001   0.2        0.10000009]
log of pred y:  [-0.35667352 -1.60943791 -2.30258419]
y_true * log(y_pred) [-0.35667352 -0.         -0.        ]
0.35667351536832426
