# Logistic Regression

In [None]:
# Load the example data: Michelin guide status for New York restaurants.
# Target column: InMichelin -- whether or not a restaurant is in the Michelin guide.

import pandas as pd

# Keep the raw download under its own name so re-running this cell never
# starts from an already-trimmed frame.
michelin_raw = pd.read_csv(
    "http://gattonweb.uky.edu/sheather/book/docs/datasets/MichelinNY.csv",
    encoding="latin_1",
)

# Set up for the train/test split: drop the restaurant-name column so only the
# remaining columns are used, then separate the target from the features.
data = michelin_raw.drop(columns=["Restaurant Name"])
y = data["InMichelin"]
X = data.drop(columns=["InMichelin"])

# A notebook only renders a cell's final expression, so the preview goes last
# (a bare `.head()` in the middle of a cell is silently discarded).
michelin_raw.head()

In [None]:
# Hold out a test set, then fit an (effectively) unpenalized logistic regression.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Fixing random_state makes the split identical on every run of this example.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Note: sklearn's LogisticRegression is L2-regularized by default (C=1.0), and
# C is the inverse of the regularization strength. Making C really high makes
# the penalty negligible, so the model is effectively a plain logistic regression.
logreg = LogisticRegression(C=1e90)
logreg.fit(X_train, y_train)

print(f"logreg .coef_: {logreg.coef_}")

train_score = logreg.score(X_train, y_train)
test_score = logreg.score(X_test, y_test)
print(f"Training set score: {train_score:.3f}")
print(f"Test set score: {test_score:.3f}")

# predicted_vals holds the predicted class label for each row of X_test
predicted_vals = logreg.predict(X_test)
print(f"logreg.predict: {predicted_vals}")



logreg .coef_: [[ 0.38181614  0.07433425 -0.15691054  0.08189853]]
Training set score: 0.797
Test set score: 0.780
logreg.predict: [0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0
 1 1 1 1]


In [None]:
# Echo the most recently fitted estimator; its repr lists the hyperparameters in effect.
logreg

# Tip: run ?LogisticRegression in a cell for more information on each argument.

LogisticRegression(C=1e+90, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Logistic Regression in statsmodels package

In [None]:
# Fit the logistic regression as a binomial GLM in statsmodels; the summary
# reports a standard error, z statistic and p-value for every coefficient.
import statsmodels.api as sm

# add_constant supplies the intercept column (shown as "const" in the summary).
X_train_new = sm.add_constant(X_train)

binomial_glm = sm.GLM(
    y_train,
    X_train_new,
    family=sm.families.Binomial(),
)
model = binomial_glm.fit()

model.summary()


  import pandas.util.testing as tm


0,1,2,3
Dep. Variable:,InMichelin,No. Observations:,123.0
Model:,GLM,Df Residuals:,118.0
Model Family:,Binomial,Df Model:,4.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-57.266
Date:,"Thu, 11 Feb 2021",Deviance:,114.53
Time:,14:26:08,Pearson chi2:,254.0
No. Iterations:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-10.6490,2.588,-4.115,0.000,-15.722,-5.576
Food,0.3818,0.148,2.572,0.010,0.091,0.673
Decor,0.0743,0.103,0.720,0.471,-0.128,0.277
Service,-0.1569,0.147,-1.070,0.285,-0.444,0.131
Price,0.0819,0.036,2.269,0.023,0.011,0.153


## Logistic Regression with constraints on size of coefficients

In [None]:
# Smaller C will constrain the betas more. It is a tuning parameter that can be
# chosen with a grid search.

# C=100: compare the coefficients to the regular model above.
logreg = LogisticRegression(C=100)
logreg.fit(X_train, y_train)

print(f"logreg .coef_: {logreg.coef_}")

train_score = logreg.score(X_train, y_train)
test_score = logreg.score(X_test, y_test)
print(f"Training set score: {train_score:.3f}")
print(f"Test set score: {test_score:.3f}")

# predicted_vals holds the predicted class label for each row of X_test
predicted_vals = logreg.predict(X_test)
print(f"logreg.predict: {predicted_vals}")

logreg .coef_: [[ 0.38171368  0.07433904 -0.15682846  0.08189077]]
Training set score: 0.797
Test set score: 0.780
logreg.predict: [0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0
 1 1 1 1]


In [None]:

# C=.001 (a much smaller C, so a tighter constraint): compare coefs to above models.
logreg = LogisticRegression(C=.001)
logreg.fit(X_train, y_train)

print(f"logreg .coef_: {logreg.coef_}")

train_score = logreg.score(X_train, y_train)
test_score = logreg.score(X_test, y_test)
print(f"Training set score: {train_score:.3f}")
print(f"Test set score: {test_score:.3f}")

# predicted_vals holds the predicted class label for each row of X_test
predicted_vals = logreg.predict(X_test)
print(f"logreg.predict: {predicted_vals}")

logreg .coef_: [[0.02661388 0.02414286 0.01347687 0.06925635]]
Training set score: 0.748
Test set score: 0.780
logreg.predict: [0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0
 1 1 1 1]


In [None]:
# Echo the most recently fitted estimator so its repr shows which C value is in effect.
logreg

LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:

# C=.01: compare coefs to above models.
# Question: does the model's prediction power get better or worse?
logreg = LogisticRegression(C=.01)
logreg.fit(X_train, y_train)

print(f"logreg .coef_: {logreg.coef_}")

train_score = logreg.score(X_train, y_train)
test_score = logreg.score(X_test, y_test)
print(f"Training set score: {train_score:.3f}")
print(f"Test set score: {test_score:.3f}")

# predicted_vals holds the predicted class label for each row of X_test
predicted_vals = logreg.predict(X_test)
print(f"logreg.predict: {predicted_vals}")

logreg .coef_: [[0.12347623 0.06179923 0.00296359 0.07729161]]
Training set score: 0.797
Test set score: 0.780
logreg.predict: [0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0 0 0
 1 1 1 1]


### We can also change the default in the penalty argument from penalty='l2' to penalty='l1' to adjust our regularization constraints:

?LogisticRegression()

#  Challenge:  How would you use GridSearchCV to tune the C parameter?


In [None]:
# Reference example from our knn tutorial: tuning k with GridSearchCV.

from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# The grid maps the knn parameter name 'n_neighbors' to the k values to try;
# np.arange(1, 15, 2) yields the odd values 1, 3, ..., 13.
candidate_k = np.arange(1, 15, 2)
param_grid = {'n_neighbors': candidate_k}

# cv=10 requests 10 cross-validation folds for every candidate k
grid = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv=10)

# The search object is a meta model: fit, score and predict are called on it directly.
grid.fit(X_train, y_train)

# best_score_ and best_params_ hold the winning CV score and parameter setting
print(f"best mean cross-validation score: {grid.best_score_:.3f}")
print(f"best parameters: {grid.best_params_}")
print(f"test-set score: {grid.score(X_test, y_test):.3f}")


best mean cross-validation score: 0.813
best parameters: {'n_neighbors': 13}
test-set score: 0.756


## Multiclass models (Multinomial model)

In [None]:
# Load the iris data, whose target has three classes, for the multiclass example.
from sklearn.datasets import load_iris
import numpy as np

iris = load_iris()

# NOTE: this rebinds X and y, which earlier cells used for the Michelin data.
# Re-run the notebook from the top before going back to those cells.
X, y = iris.data, iris.target

print(iris.feature_names)  # X variable names
print(X[0:5])  # first five rows of data

print(iris.target_names)  # target categories
print(np.unique(y))  # target values



['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
['setosa' 'versicolor' 'virginica']
[0 1 2]


In [None]:
# Note the argument changes to LogisticRegression(): multi_class="multinomial"
# with the lbfgs solver, and a huge C so the penalty is negligible.
logreg = LogisticRegression(
    C=1e90,
    multi_class="multinomial",
    solver="lbfgs",
)
logreg.fit(X, y)

In [None]:
# The multinomial model uses the softmax function to predict classes. For
# brevity the predictions are made on the same X the model was fit on rather
# than on new data, so this is not an out-of-sample check.
fitted_predictions = logreg.predict(X)
print(fitted_predictions)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
