In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

In [2]:
df = pd.read_csv('mushroom_ft_engineer.csv', index_col = 0)
df2 = pd.read_csv('le_mushroom_ft_engineer.csv', index_col = 0)

In [3]:
y = df2['class']
X = df2.drop(['class', 'spore-print-color', 'veil-type'] , axis = 1)

# Split X and y with even class distributions
# stratify to create randomness in your data but one that also has the same propoertoins across train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42, stratify=y)

In [5]:
#Scale the data
ss = StandardScaler()
ss.fit(X_train)
X_train = pd.DataFrame(ss.transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(ss.transform(X_test), columns=X_test.columns)

In [6]:
log = LogisticRegression(random_state=42)
print(log.fit(X_train, y_train))
log.score(X_test, y_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)




0.9987692307692307

In [8]:
#Now with cross validation search
val = cross_val_score(LogisticRegression(random_state=42),X_train,y_train,cv=5)
val.mean()



0.9989234314977065

### Log_reg w Grid

In [25]:
log_param_grid = {
    "max_iter": [10, 50, 100],
    "penalty": ['l2'],
    "class_weight": ['balanced'],
    "solver": ['lbfgs', 'saga']
}


log_grid = LogisticRegression()
gridsearch = GridSearchCV(log_grid, log_param_grid, cv=3, return_train_score=True, n_jobs=-1, verbose=-1)

gridsearch.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   42.2s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'class_weight': ['balanced'],
                         'max_iter': [10, 50, 100], 'penalty': ['l2'],
                         'solver': ['lbfgs', 'saga']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=None, verbose=-1)

In [26]:
print("Testing Accuracy: {:.4}%".format(gridsearch.best_score_ * 100))
print("")
print("Optimal Parameters: {}".format(gridsearch.best_params_))

# Actual model object fit with those best parameters
# Shows default parameters that we did not specify
print("Best Model: {}".format(gridsearch.best_estimator_))

Testing Accuracy: 99.92%

Optimal Parameters: {'class_weight': 'balanced', 'max_iter': 50, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Model: LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=50, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


In [27]:
gridsearch.score(X_test, y_test)

0.9987692307692307

In [28]:
# logistic regression
log = LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=50, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

print(log.fit(X_train, y_train))

log.score(X_test, y_test)


#Now with cross validation search
val = cross_val_score(LogisticRegression(random_state=42),X_train,y_train,cv=5)
val.mean()

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=50, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)




0.9989234314977065

In [31]:
list(zip(X.columns, log.coef_[0]))

[('cap-shape', -0.16377591809764602),
 ('cap-surface', -0.15687228990206212),
 ('cap-color', 0.23286869742365826),
 ('bruises', -0.8701208167145624),
 ('odor', 0.6493703013109933),
 ('gill-attachment', 0.13468551115642408),
 ('gill-spacing', -1.5196357909539586),
 ('gill-size', 2.353178512919073),
 ('gill-color', 0.8753421696539747),
 ('stalk-shape', -0.8766148635322691),
 ('stalk-root', -0.18064279461554245),
 ('stalk-surface-above-ring', -0.6176861532066152),
 ('stalk-surface-below-ring', 0.2079228383341673),
 ('stalk-color-above-ring', -0.5029975925218279),
 ('stalk-color-below-ring', -0.08001155169131152),
 ('veil-color', 0.6152655887496965),
 ('ring-number', -1.5264788856046099),
 ('ring-type', -0.9191041570561778),
 ('population', -1.639771396857332),
 ('habitat', 0.3933385916418122),
 ('human_interference', -0.24043275439946032),
 ('offensive_odor', -6.8054391114374635),
 ('c_molybdites', 1.722546625377947),
 ('non_toxic_spore_color', -0.1266010372954467),
 ('prob_tox', 0.0),
 (