In [32]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sb
from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
def load_data(directory, file_name, names=None, header=None, skiprows=0):
    return pd.read_excel(os.path.join(directory, file_name), names=names, header=header, skiprows=skiprows, skipinitialspace=True)

In [3]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [4]:
def scoring_using_cross_validation(m, x, y, s='accuracy'):
    print("************* Start of Cross Validation {} {} *************".format(s, m))
    scores_ = cross_val_score(m, x, y, scoring="accuracy", cv=10)
    rmse_scores_ = np.sqrt(scores_)
    display_scores(rmse_scores_)
    print("************* End of Cross Validation {} {}*************".format(s, m))


def print_scores(m, Y_test, Y_predictions):
    print("************* {} *************".format(m))
    mse_ = mean_squared_error(Y_test, Y_predictions)
    print("mse ", mse_)
    acc_ = accuracy_score(Y_test, Y_predictions)
    print("acc ", acc_)
    print("****************************************")


In [19]:
base_directory = '~/workspace/personal/datasets/DryBeanDataset'

columns = ['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength', 'AspectRation', 'Eccentricity', 
           'ConvexArea', 'EquivDiameter', 'Extent', 'Solidity', 'roundness', 'Compactness', 'ShapeFactor1',
           'ShapeFactor2', 'ShapeFactor3', 'ShapeFactor4']


In [17]:
dry_bean_df = pd.read_excel(os.path.join(base_directory, 'Dry_Bean_Dataset.xlsx'))
print(list(dry_bean_df))

['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength', 'AspectRation', 'Eccentricity', 'ConvexArea', 'EquivDiameter', 'Extent', 'Solidity', 'roundness', 'Compactness', 'ShapeFactor1', 'ShapeFactor2', 'ShapeFactor3', 'ShapeFactor4', 'Class']


In [20]:
split = ShuffleSplit(test_size=0.20, random_state=1)
train_index, test_index = list(split.split(dry_bean_df[columns]))[0]

In [27]:
num_pipeline = Pipeline([
        ('std_scaler', StandardScaler()),
    ])

ct = ColumnTransformer([
        ("norm2", num_pipeline, columns)
], remainder='passthrough')

lb = LabelEncoder()

X_train_set = dry_bean_df.loc[train_index].drop(['Class'], axis=1)
X_test_set = dry_bean_df.loc[test_index].drop(['Class'], axis=1)

Y_train_set = dry_bean_df['Class'].copy().loc[train_index]
Y_test_set = dry_bean_df['Class'].copy().loc[test_index]

In [28]:
X_train = ct.fit_transform(X_train_set)
Y_train = lb.fit_transform(Y_train_set)

X_test = ct.transform(X_train_set)
Y_test = lb.transform(Y_train_set)

In [29]:
sgd = SGDClassifier()
sgd.fit(X_train, Y_train)

SGDClassifier()

In [41]:
Y_predictions = sgd.predict(X_test)


precision_ = precision_score(Y_test, Y_predictions, average='weighted')
print("precision_ ", precision_)
recall_ = recall_score(Y_test, Y_predictions, average='weighted')
print("recall_ ", recall_)
f1_score_ = f1_score(Y_test, Y_predictions, average='weighted')
print("f1_score_ ", f1_score_)

precision_  0.9147481190647714
recall_  0.911462160176341
f1_score_  0.9122745316683003


In [46]:
lb.inverse_transform(Y_predictions[:20,])

array(['BARBUNYA', 'SIRA', 'DERMASON', 'BARBUNYA', 'CALI', 'SIRA', 'CALI',
       'HOROZ', 'DERMASON', 'BARBUNYA', 'HOROZ', 'HOROZ', 'SEKER',
       'SEKER', 'SIRA', 'SEKER', 'BOMBAY', 'BARBUNYA', 'DERMASON',
       'HOROZ'], dtype=object)

In [47]:
lb.inverse_transform(Y_test[:20,])

array(['BARBUNYA', 'SIRA', 'SIRA', 'BARBUNYA', 'CALI', 'SIRA', 'CALI',
       'HOROZ', 'DERMASON', 'BARBUNYA', 'HOROZ', 'HOROZ', 'SEKER',
       'SEKER', 'CALI', 'SEKER', 'BOMBAY', 'BARBUNYA', 'DERMASON',
       'HOROZ'], dtype=object)