In [2]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import linear_model, decomposition, datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
cols = ['the variance of Wavelet Transformed image', 'skewness of Wavelet Transformed image', 'curtosis of Wavelet Transformed image', 'the entropy of image', 'class']
df = pd.read_csv("data_banknote_authentication.txt",names=cols,header=None)

In [6]:
#df

In [7]:
df = sklearn.utils.shuffle(df)

In [8]:
df.head()

Unnamed: 0,the variance of Wavelet Transformed image,skewness of Wavelet Transformed image,curtosis of Wavelet Transformed image,the entropy of image,class
732,-2.7143,11.4535,2.1092,-3.9629,0
60,2.888,0.44696,4.5907,-0.24398,0
1179,-3.2778,1.8023,0.1805,-2.3931,1
828,-2.5912,-0.10554,1.2798,1.0414,1
1018,-0.40804,0.54214,-0.52725,0.6586,1


In [9]:
df['class'].value_counts()

0    762
1    610
Name: class, dtype: int64

In [10]:
df.skew()

the variance of Wavelet Transformed image   -0.149388
skewness of Wavelet Transformed image       -0.394103
curtosis of Wavelet Transformed image        1.088569
the entropy of image                        -1.022243
class                                        0.223191
dtype: float64

In [11]:
df.isnull().sum()

the variance of Wavelet Transformed image    0
skewness of Wavelet Transformed image        0
curtosis of Wavelet Transformed image        0
the entropy of image                         0
class                                        0
dtype: int64

In [12]:
df.corr()

Unnamed: 0,the variance of Wavelet Transformed image,skewness of Wavelet Transformed image,curtosis of Wavelet Transformed image,the entropy of image,class
the variance of Wavelet Transformed image,1.0,0.264026,-0.38085,0.276817,-0.724843
skewness of Wavelet Transformed image,0.264026,1.0,-0.786895,-0.526321,-0.444688
curtosis of Wavelet Transformed image,-0.38085,-0.786895,1.0,0.318841,0.155883
the entropy of image,0.276817,-0.526321,0.318841,1.0,-0.023424
class,-0.724843,-0.444688,0.155883,-0.023424,1.0


In [13]:
X = df.drop('class',axis=1)
y = df['class']

In [14]:
X_train, X_test,y_train,y_test = train_test_split(X,y,train_size = 0.7,random_state=33)

In [15]:
#Creating pipeline
# Create an scaler object
sc = StandardScaler()

# Create a logistic regression object with an L2 penalty
logistic = linear_model.LogisticRegression()

pipe = Pipeline(steps=[('sc', sc),('logistic', logistic)])

# Create a list of values of the regularization parameter
C = np.logspace(-4, 4, 50)
# Create a list of options for the regularization penalty
penalty = ['l1', 'l2']
# Create a dictionary of all the parameter options 
# Note has you can access the parameters of steps of a pipeline by using '__’
parameters = dict(logistic__C=C,
                  logistic__penalty=penalty)
# Conduct Parameter Optmization With Pipeline
# Create a grid search object
clf = GridSearchCV(pipe, parameters)

In [16]:
# Fit the grid search
clf.fit(X_train, y_train)
# View The Best Parameters
print('Best Penalty:', clf.best_estimator_.get_params()['logistic__penalty'])
print('Best C:', clf.best_estimator_.get_params()['logistic__C'])
#print('Best Number Of Components:', clf.best_estimator_.get_params()['pca__n_components'])
print(); print(clf.best_estimator_.get_params()['logistic'])

Best Penalty: l1
Best C: 7.9060432109076855

LogisticRegression(C=7.9060432109076855, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)


In [17]:
y_pred = clf.predict(X_test)

In [19]:
#y_pred

In [20]:
print ('\n Best score: \n', clf.best_score_)
print ('\n Best parameters set: \n')
best_parameters = clf.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print ('\t%s: %r' % (param_name, best_parameters[param_name]))
print ("\n Confusion Matrix on Test data\n",confusion_matrix(y_test,y_pred))
print ("\n Test Accuracy \n",accuracy_score(y_test,y_pred))
print ("\nPrecision Recall f1 table \n",classification_report(y_test,y_pred))


 Best score: 
 0.990625

 Best parameters set: 

	logistic__C: 7.9060432109076855
	logistic__penalty: 'l1'

 Confusion Matrix on Test data
 [[216   3]
 [  2 191]]

 Test Accuracy 
 0.9878640776699029

Precision Recall f1 table 
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       219
           1       0.98      0.99      0.99       193

    accuracy                           0.99       412
   macro avg       0.99      0.99      0.99       412
weighted avg       0.99      0.99      0.99       412

