# Logistic Regression

In [2]:
# Load the example data set: New York restaurants and their Michelin-guide status.

import pandas as pd

# Target column: InMichelin, a 0/1 flag for whether or not a restaurant is in the Michelin guide.
data = pd.read_csv(
    "http://gattonweb.uky.edu/sheather/book/docs/datasets/MichelinNY.csv",
    encoding="latin_1",
)
print(data.head())

# Prepare for the train/test split: discard the Restaurant Name column,
# then separate the target (y) from the remaining feature columns (X).
data = data.loc[:, [col for col in data.columns if col != 'Restaurant Name']]
y = data['InMichelin']
X = data.loc[:, [col for col in data.columns if col != 'InMichelin']]

   InMichelin Restaurant Name  Food  Decor  Service  Price
0           0  14 Wall Street    19     20       19     50
1           0             212    17     17       16     43
2           0        26 Seats    23     17       21     35
3           1              44    19     23       16     52
4           0               A    23     12       19     24


In [3]:
# Set up training and test data, then fit an unpenalized logistic regression.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Note: random_state fixes the shuffle, so the same split is generated each time the example runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42)

# Start with a non-penalized logit; 'l1' and 'l2' are the other penalty options.
# Pass penalty=None rather than the string 'none': the string form was deprecated in
# scikit-learn 1.2 and later removed, so it raises an error on current releases.
logreg = LogisticRegression(penalty=None).fit(X_train, y_train)

print("logreg.coef_: {}".format(logreg.coef_))


# score() reports mean accuracy on the given data and labels.
print("Training set score: {:.3f}".format(logreg.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(X_test, y_test)))
# Note: when experimenting with models (fitting different algorithms) you would use
# cross validation, either with GridSearchCV or one model at a time.

predicted_vals = logreg.predict(X_test)  # predicted class labels for the test set
print("logreg.predict: {}".format(predicted_vals))
# Note: sklearn's predict shows predicted labels (based on a threshold of .5), not predicted probabilities.
# Use the predict_proba method to generate predicted probabilities.

logreg.coef_: [[ 0.38181614  0.07433425 -0.15691054  0.08189853]]
Training set score: 0.797
Test set score: 0.780
logreg.predict: [0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0
 1 1 1 1]




In [None]:
# Display the current logreg estimator object.
logreg

# For more information on the estimator, run `LogisticRegression?` (or `help(LogisticRegression)`) in a cell.

## Logistic Regression in statsmodels package

In [4]:
import statsmodels.api as sm

# Reminder: the column of 1's for the intercept still has to be added explicitly.
X_train_new = sm.add_constant(X_train)

# statsmodels takes the response (endog) first and the design matrix (exog) second.
# A Generalized Linear Model with the binomial family gives the logistic regression.
model = sm.GLM(
    endog=y_train,
    exog=X_train_new,
    family=sm.families.Binomial(),
).fit()

model.summary()


0,1,2,3
Dep. Variable:,InMichelin,No. Observations:,123.0
Model:,GLM,Df Residuals:,118.0
Model Family:,Binomial,Df Model:,4.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-57.266
Date:,"Tue, 03 Oct 2023",Deviance:,114.53
Time:,17:32:25,Pearson chi2:,254.0
No. Iterations:,6,Pseudo R-squ. (CS):,0.3534
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-10.6490,2.588,-4.115,0.000,-15.722,-5.576
Food,0.3818,0.148,2.572,0.010,0.091,0.673
Decor,0.0743,0.103,0.720,0.471,-0.128,0.277
Service,-0.1569,0.147,-1.070,0.285,-0.444,0.131
Price,0.0819,0.036,2.269,0.023,0.011,0.153


## Logistic Regression with constraints on size of coefficients

In [None]:
# C is a tuning parameter we can find using a grid search. Smaller C constrains the Betas more.
# Lowering C makes the coefficients smaller (shrinks them toward 0); raising C lets them grow
# back toward the unpenalized estimates.
# Note: L2 (Ridge) will shrink coefficients down, never reaching 0. L1 (Lasso) has the potential to zero out coefficients.

# C=100 (a weak penalty): compare coefs to the regular model above.
logreg = LogisticRegression(C=100, penalty='l2').fit(X_train, y_train)

print("logreg .coef_: {}".format(logreg.coef_))


print("Training set score: {:.3f}".format(logreg.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(X_test, y_test)))


predicted_vals = logreg.predict(X_test)  # predicted class labels for the test set
print("logreg.predict: {}".format(predicted_vals))

logreg .coef_: [[ 0.38171368  0.07433904 -0.15682846  0.08189077]]
Training set score: 0.797
Test set score: 0.780
logreg.predict: [0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0
 1 1 1 1]


In [None]:
# Next setting: C=1. Compare the coefficients with the models above.
logreg = LogisticRegression(penalty='l2', C=1)
logreg.fit(X_train, y_train)

print(f"logreg .coef_: {logreg.coef_}")

# Accuracy on the data the model was fit to versus the held-out data.
print(f"Training set score: {logreg.score(X_train, y_train):.3f}")
print(f"Test set score: {logreg.score(X_test, y_test):.3f}")

# Predicted class labels for the test rows.
predicted_vals = logreg.predict(X_test)
print(f"logreg.predict: {predicted_vals}")

logreg .coef_: [[ 0.37187726  0.07490079 -0.14897911  0.08113593]]
Training set score: 0.797
Test set score: 0.780
logreg.predict: [0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0
 1 1 1 1]


In [None]:

# Make C even smaller: C=.0001. Compare the coefficients with the models above.
# Question to consider: does the model's prediction power get better or worse?

logreg = LogisticRegression(penalty='l2', C=.0001)
logreg.fit(X_train, y_train)

print("logreg .coef_:", logreg.coef_)

# Scores are formatted to three decimals, as in the earlier cells.
print("Training set score:", format(logreg.score(X_train, y_train), ".3f"))
print("Test set score:", format(logreg.score(X_test, y_test), ".3f"))

# Predicted class labels for the test rows.
predicted_vals = logreg.predict(X_test)
print("logreg.predict:", predicted_vals)

logreg .coef_: [[0.00549429 0.00672568 0.00502413 0.02866617]]
Training set score: 0.699
Test set score: 0.732
logreg.predict: [0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0
 1 1 0 0]


In [None]:
# What if we want to use an L1 penalty instead? Change penalty to 'l1' and solver to 'liblinear'.

# Does the model's prediction power get better or worse? Do any coefficients shrink to 0 and drop out of the model?
# Note: this can be helpful to understand feature importance for additional research.
# Solvers are used to optimize the parameters of the model. liblinear is a commonly used solver that handles both L1 and L2.

logreg = LogisticRegression(C=.01, penalty='l1', solver='liblinear').fit(X_train, y_train)

print("logreg .coef_: {}".format(logreg.coef_))


print("Training set score: {:.3f}".format(logreg.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(X_test, y_test)))


# This cell asks for predicted probabilities (one row per test observation, one column per class)
# instead of hard labels, so the printed label names predict_proba rather than predict.
predicted_vals = logreg.predict_proba(X_test)
print("logreg.predict_proba: {}".format(predicted_vals))

logreg .coef_: [[-0.02290063  0.          0.          0.00967784]]
Training set score: 0.699
Test set score: 0.732
logreg.predict: [[0.50869302 0.49130698]
 [0.52166386 0.47833614]
 [0.50562721 0.49437279]
 [0.49796135 0.50203865]
 [0.4941774  0.5058226 ]
 [0.56184299 0.43815701]
 [0.50474107 0.49525893]
 [0.52737519 0.47262481]
 [0.49353051 0.50646949]
 [0.56596642 0.43403358]
 [0.52584631 0.47415369]
 [0.50562721 0.49437279]
 [0.529142   0.470858  ]
 [0.4858678  0.5141322 ]
 [0.44997926 0.55002074]
 [0.50869302 0.49130698]
 [0.50143551 0.49856449]
 [0.56097012 0.43902988]
 [0.49990229 0.50009771]
 [0.48256541 0.51743459]
 [0.53243515 0.46756485]
 [0.52254837 0.47745163]
 [0.43367605 0.56632395]
 [0.51288298 0.48711702]
 [0.50627411 0.49372589]
 [0.49659665 0.50340335]
 [0.49990229 0.50009771]
 [0.51046484 0.48953516]
 [0.51683215 0.48316785]
 [0.4858678  0.5141322 ]
 [0.51618587 0.48381413]
 [0.4858678  0.5141322 ]
 [0.47991044 0.52008956]
 [0.53331756 0.46668244]
 [0.54685961 0.4531

## Multiclass models (Multinomial model) - when the dependent variable has more than two categories

In [17]:
from sklearn.datasets import load_iris
import numpy as np

# Three categories for the dependent variable - different iris flower types: setosa, versicolor, virginica
iris = load_iris()

# NOTE: this rebinds X and y, which earlier cells used for the Michelin features and target.
# Re-run the earlier cells before going back to the binary examples.
X, y = iris.data, iris.target

print(iris.feature_names)  # X variable names
print(X[0:5])  # first five rows of data

print(iris.target_names)  # target categories
print(np.unique(y))  # target values

# print(iris.keys())  # load_iris gives you a dictionary-like object, not a dataframe


dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])


In [None]:
# Fit a multinomial logit. Note the three argument changes to LogisticRegression():
#   multi_class="multinomial" - request the multinomial model
#   solver="lbfgs"            - algorithm used for iterating on the multinomial fit
#   max_iter=10000            - more iterations, to make sure the best coefficients are found
# The model is fit to the entire dataset (X, y) here, but you should always use some form of
# cross validation to find the best model.
# NOTE(review): newer scikit-learn releases may deprecate the multi_class argument - confirm against the installed version.
logreg = LogisticRegression(
    multi_class="multinomial",
    solver="lbfgs",
    max_iter=10000,
).fit(X, y)

In [None]:
# Print the predicted category for each row. The predicted probabilities come from running the
# model's scores through the softmax function. Normally you would predict on new X data; the
# training X is reused here only for convenience.
print(logreg.predict(X))

[0 0 0 1 0 0 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 1 1 1
 1 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 1 0 0 0 0 1 1 0 1 1 0 1 0
 0 0 1 1 1 0 0 0 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 0 0 1 0 1 0 0 0 0
 1 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 0 1 1 0 0 1
 1 0 1 1 0 1 0 0 0 0 0 1 1 1 0 1]


### Softmax

In [None]:
# Softmax converts raw values into predicted probabilities. Worked example with the values 1, 3 and 2.
# For now softmax helps extend logit to the multinomial case; it returns toward the end of the
# class for neural networks.
from math import exp

# Shared denominator: the sum of the exponentiated values.
exp_total = exp(1) + exp(3) + exp(2)

# Each probability is that value's exponential divided by the shared denominator.
p1, p2, p3 = (exp(value) / exp_total for value in (1, 3, 2))

# report probabilities
print(p1, p2, p3)
# report sum of probabilities
print(p1 + p2 + p3)

0.09003057317038046 0.6652409557748219 0.24472847105479767
1.0


# Extra Practice for Logit Regression
## Theory Questions 1-5
## Coding Questions 6-9

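### Question 1:
What is logistic regression, and how does it differ from linear regression?

- Logistic regression models the probability of an outcome, so its predictions lie between 0 and 1, while linear regression predicts an unbounded numeric value.
- Logistic regression is used for categorical outcomes, most commonly binary ones (classification!), whereas linear regression is used for continuous outcomes.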

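### Question 2:
Describe the process of splitting a dataset into training and testing sets for logistic regression.

1. Define X (the features) and y (the dependent variable).
2. Split them into training and test sets, e.g. 80-20 (the ratio is up to you; scikit-learn's `train_test_split` defaults to 75-25).
3. Fit the logistic regression model on the training data, then evaluate it on the held-out test data.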

### Question 3:
What is regularization in logistic regression, and why is it important?

### Question 4:
Walk me through the process of fitting a logistic regression model in Python using the scikit-learn library.

### Question 5:
Explain the process of hyperparameter tuning for a logistic regression model.

### Questions 6-9:
In this series of questions you will be asked to find the optimal hyperparameters for a logistic regression model on a dataset of your choosing, including the regularization strength (C) and the penalty type (L1 or L2). Each question tests a different part of what makes up the optimal logistic regression model for this dataset. In my questions and answers I chose to use the Iris dataset.

### Question 6:
Tune the regularization strength (C) of the model. Hint: list out the possible values of C, such as [0.001, 0.01, 0.1, 1, ...]. Make sure to import GridSearchCV.

### Question 7:
Now set up a hyperparameter grid where you explore the two penalty types for the model (L1 and L2) along with the liblinear solver.

### Question 8:
Now set up and execute a grid search for the best hyperparameters, fit the best model, and retrieve the optimal hyperparameters using scikit-learn.

### Question 9:
Print the results stating which is the best logistic regression model, the training set score, the test set score, and the best logistic regression model's predictions.