# Logistic Regression Code Appendix

Resources: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

Python Code:

``` Python
# Import dependency
from sklearn.linear_model import LogisticRegression

# Create the logistic regression object
log = LogisticRegression()

# Train the logistic regression model
clf = log.fit(X, y)

# Predict the target class based on p > 0.5 criteria
clf.predict(X)

# Predict the probability with the training data set
clf.predict_proba(X)

# Calculate the model fit
clf.score(X, y)
```

In [21]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [22]:
# Loading the fake pizza dataset from the web
pizza = pd.read_csv('https://jaredlander.com/data/Fake%20Pizza%20Data.csv')
pizza.head()

Unnamed: 0,Rating,CostPerSlice,HeatSource,BrickOven,Neighborhood
0,0.03,1.75,Gas,False,LittleItaly
1,4.89,2.75,Coal,True,SoHo
2,4.73,4.0,Wood,True,LittleItaly
3,0.13,1.75,Gas,False,LittleItaly
4,2.45,2.25,Wood,True,Chinatown


In [23]:
pizza.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201 entries, 0 to 200
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rating        200 non-null    float64
 1   CostPerSlice  200 non-null    float64
 2   HeatSource    200 non-null    object 
 3   BrickOven     200 non-null    object 
 4   Neighborhood  200 non-null    object 
dtypes: float64(2), object(3)
memory usage: 8.0+ KB


In [24]:
le = LabelEncoder()
pizza['BrickOven'] = le.fit_transform(pizza['BrickOven'])

In [25]:
pizza.dropna(subset = ['Rating'], axis = 0, inplace = True)

In [26]:
y = pizza['BrickOven']
X = pizza[['Rating', 'HeatSource', 'CostPerSlice']]

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [28]:
ohe = OneHotEncoder(handle_unknown = 'error', drop = 'first', sparse = False)
hs_col = ['HeatSource']
heat_source_train = ohe.fit_transform(X_train[hs_col])
heat_source_test = ohe.transform(X_test[hs_col])

columns = ohe.get_feature_names()


heat_source_train = pd.DataFrame(heat_source_train, columns = columns)
heat_source_test = pd.DataFrame(heat_source_test, columns = columns)

X_train = pd.concat([X_train.reset_index(), heat_source_train], axis = 1)

X_test = pd.concat([X_test.reset_index(), heat_source_test], axis = 1)

In [29]:
X_train = X_train.drop(columns = ['HeatSource'])
X_test = X_test.drop(columns = ['HeatSource'])

In [30]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [31]:
X_test

array([[-0.06884719, -0.35134088, -0.38438693,  0.52915026, -0.37796447],
       [-1.4204484 ,  0.79871562, -0.68035376,  0.52915026, -0.37796447],
       [-1.16702317,  0.82636121,  1.68738092,  0.52915026, -0.37796447],
       [ 0.99553877,  0.82083209,  2.57528143,  0.52915026, -0.37796447],
       [ 0.48868831,  0.21815825, -0.9763206 ,  0.52915026, -0.37796447],
       [ 0.26905312, -0.30157882, -1.56825427,  0.52915026, -0.37796447],
       [-0.50811758,  0.01911001, -0.9763206 , -1.88982237,  2.64575131],
       [ 1.19827895, -1.60092149,  0.68109368, -1.88982237,  2.64575131],
       [ 1.26585901,  0.0467556 ,  0.20754674,  0.52915026, -0.37796447],
       [-0.91359794, -1.51245561, -1.27228744,  0.52915026, -0.37796447],
       [-0.55880263,  0.03569737,  0.50351358,  0.52915026, -0.37796447],
       [ 1.40101913,  0.33979884,  1.09544725,  0.52915026, -0.37796447],
       [ 1.11380387,  0.39509002,  0.20754674,  0.52915026, -0.37796447],
       [-0.35606244,  0.31768237, -0.0

In [32]:
# # Define y
# y_train = 
# y_test =

# # Define X
# X = pizza.drop(columns=['BrickOven', 'Neighborhood'], axis=1)
# heat_dummies = pd.DataFrame(pd.get_dummies(pizza['HeatSource'], drop_first=True))
# X = pd.concat([X, heat_dummies], axis = 1)
# X = X.drop(columns = ['HeatSource'])

In [33]:
# Create the logistic regression object
log = LogisticRegression()

In [34]:
# Train the model
clf = log.fit(X_train, y_train)

In [35]:
# Predict the class of the target
clf.predict(X_test)

array([0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0])

In [36]:
# Predict the probability of the target
clf.predict_proba(X_test)

array([[0.8674775 , 0.1325225 ],
       [0.92558837, 0.07441163],
       [0.84548906, 0.15451094],
       [0.84748707, 0.15251293],
       [0.92826175, 0.07173825],
       [0.91727439, 0.08272561],
       [0.00821332, 0.99178668],
       [0.00204347, 0.99795653],
       [0.89415226, 0.10584774],
       [0.78838554, 0.21161446],
       [0.84840742, 0.15159258],
       [0.882548  , 0.117452  ],
       [0.91185753, 0.08814247],
       [0.89649101, 0.10350899],
       [0.92145697, 0.07854303],
       [0.95267405, 0.04732595],
       [0.80716317, 0.19283683],
       [0.94583824, 0.05416176],
       [0.72379701, 0.27620299],
       [0.9265967 , 0.0734033 ],
       [0.92367942, 0.07632058],
       [0.06444963, 0.93555037],
       [0.93361481, 0.06638519],
       [0.70974236, 0.29025764],
       [0.06034836, 0.93965164],
       [0.91600436, 0.08399564],
       [0.8646084 , 0.1353916 ],
       [0.75009884, 0.24990116],
       [0.76524279, 0.23475721],
       [0.79537983, 0.20462017],
       [0.

In [37]:
clf.score(X_test, y_test)

0.8

In [38]:
log_loss(y_test, clf.predict_proba(X_test))

0.482837564776135

In [39]:
# Create the logistic regression object


In [40]:
# Train the model


In [41]:
# Predict the class of the target


In [42]:
# Predict the probability of the target
