In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

In [2]:
df = pd.read_csv('mushroom_ft_engineer.csv', index_col = 0)
df2 = pd.read_csv('le_mushroom_ft_engineer.csv', index_col = 0)

In [3]:
y = df2['class']
X = df2.drop(['class', 'spore-print-color', 'veil-type'] , axis = 1)

# Split X and y with even class distributions
# stratify to create randomness in your data but one that also has the same propoertoins across train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42, stratify=y)

In [5]:
#Scale the data
ss = StandardScaler()
ss.fit(X_train)
X_train = pd.DataFrame(ss.transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(ss.transform(X_test), columns=X_test.columns)

In [6]:
log = LogisticRegression(random_state=42)
print(log.fit(X_train, y_train))
log.score(X_test, y_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)




0.9987692307692307

In [7]:
X_train

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,ring-type,population,habitat,human_interference,offensive_odor,c_molybdites,non_toxic_spore_color,prob_tox,prob_ed,no_rings
0,0.567684,0.221589,1.359774,0.844107,-0.394699,0.159888,-0.437225,-0.662169,-0.229030,-1.141838,...,-0.164559,1.081015,-1.271931,-0.511802,-1.068010,-0.09734,-0.151064,0.0,0.29798,-0.06459
1,-0.543240,1.066229,0.196567,-1.184684,0.617793,0.159888,-0.437225,-0.662169,1.270230,-1.141838,...,-1.276853,-2.117126,0.699610,1.953879,0.936321,-0.09734,-0.151064,0.0,0.29798,-0.06459
2,-2.765088,1.066229,-0.385036,0.844107,0.617793,0.159888,2.287152,-0.662169,1.570082,-1.141838,...,0.947735,-0.518055,-1.271931,-0.511802,0.936321,-0.09734,-0.151064,0.0,0.29798,-0.06459
3,0.567684,1.066229,-1.257441,0.844107,1.630286,0.159888,-0.437225,1.510188,-0.828734,0.875781,...,-1.276853,0.281480,1.093919,-0.511802,-1.068010,-0.09734,-0.151064,0.0,0.29798,-0.06459
4,1.678608,0.221589,-1.257441,0.844107,-0.394699,0.159888,-0.437225,1.510188,-0.828734,0.875781,...,-1.276853,0.281480,-0.089006,1.953879,-1.068010,-0.09734,-0.151064,0.0,0.29798,-0.06459
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6494,-0.543240,1.066229,-1.257441,0.844107,1.630286,0.159888,-0.437225,1.510188,-0.828734,0.875781,...,-1.276853,0.281480,1.093919,-0.511802,-1.068010,-0.09734,-0.151064,0.0,0.29798,-0.06459
6495,-0.543240,0.221589,-1.257441,0.844107,0.617793,0.159888,2.287152,1.510188,1.570082,-1.141838,...,-1.276853,0.281480,-0.877623,-0.511802,0.936321,-0.09734,-0.151064,0.0,0.29798,-0.06459
6496,0.567684,-1.467689,1.068973,0.844107,0.617793,0.159888,2.287152,-0.662169,0.670526,0.875781,...,-1.276853,-2.916661,-1.271931,-0.511802,0.936321,-0.09734,-0.151064,0.0,0.29798,-0.06459
6497,-0.543240,-1.467689,-0.385036,-1.184684,0.617793,0.159888,-0.437225,-0.662169,1.570082,0.875781,...,0.947735,1.081015,1.093919,-0.511802,0.936321,-0.09734,-0.151064,0.0,0.29798,-0.06459


In [8]:
#Now with cross validation search
val = cross_val_score(LogisticRegression(random_state=42),X_train,y_train,cv=5)
val.mean()



0.9989234314977065

In [24]:
log_param_grid = {
    "max_iter": [10, 50, 100],
    "penalty": ['l1', 'l2'],
    "class_weight": ['balanced'],
    "solver": ['lbfgs', 'saga']
}


log_grid = LogisticRegression()
gridsearch = GridSearchCV(log_grid, log_param_grid, cv=3, return_train_score=True, n_jobs=-1, verbose=-1)

gridsearch.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

In [17]:
print("Testing Accuracy: {:.4}%".format(gridsearch.best_score_ * 100))
print("")
print("Optimal Parameters: {}".format(gridsearch.best_params_))

# Actual model object fit with those best parameters
# Shows default parameters that we did not specify
print("Best Model: {}".format(gridsearch.best_estimator_))

Testing Accuracy: 99.92%

Optimal Parameters: {'class_weight': 'balanced', 'max_iter': 10, 'penalty': 'l1'}
Best Model: LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=10, multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)


In [18]:
gridsearch.score(X_test, y_test)

0.9993846153846154