In [1]:
import time
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

%matplotlib inline
plt.style.use('seaborn-dark-palette')

import warnings
warnings.filterwarnings('ignore')

In [2]:
file = glob.iglob('*.csv')
df = pd.read_csv(*file)

print(f'The dimension of the data is - {df.shape}')

The dimension of the data is - (683, 11)


In [3]:
df.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [4]:
df.tail()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
678,776715,3,1,1,1,3,2,1,1,1,2
679,841769,2,1,1,1,2,1,1,1,1,2
680,888820,5,10,10,3,7,3,8,10,2,4
681,897471,4,8,6,4,3,4,10,6,1,4
682,897471,4,8,8,5,4,5,10,4,1,4


In [5]:
X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values

In [6]:
X

array([[1000025,       5,       1, ...,       3,       1,       1],
       [1002945,       5,       4, ...,       3,       2,       1],
       [1015425,       3,       1, ...,       3,       1,       1],
       ...,
       [ 888820,       5,      10, ...,       8,      10,       2],
       [ 897471,       4,       8, ...,      10,       6,       1],
       [ 897471,       4,       8, ...,      10,       4,       1]],
      dtype=int64)

In [7]:
Y

array([2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 4, 2, 4, 4, 2, 2, 4, 2, 4, 4,
       2, 2, 4, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 4, 4, 4, 4, 4, 4, 2,
       4, 2, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, 2, 4, 2, 4,
       4, 2, 2, 4, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 4, 4, 4, 4, 4, 2, 4, 2, 4,
       4, 4, 2, 2, 2, 4, 2, 2, 2, 2, 4, 4, 4, 2, 4, 2, 4, 2, 2, 2, 4, 2,
       2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 4, 2, 2, 4, 2, 4, 4, 2, 2, 4, 2, 2,
       4, 4, 2, 2, 2, 2, 4, 4, 2, 2, 2, 2, 2, 4, 4, 4, 2, 4, 2, 4, 2, 2,
       2, 4, 4, 2, 4, 4, 4, 2, 4, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2,
       2, 4, 4, 2, 2, 2, 4, 4, 2, 4, 4, 4, 2, 2, 4, 2, 2, 4, 4, 4, 4, 2,
       4, 4, 2, 4, 4, 4, 2, 4, 2, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2, 4, 4, 2,
       2, 4, 2, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4, 4, 2, 4, 4, 4, 2, 4, 2,
       4, 4, 2, 2, 2, 2, 4, 2, 2, 4, 4, 4, 4, 4, 2, 4, 4, 2, 2, 4, 4, 2,
       2, 4, 4, 2, 4, 2, 4, 4, 2, 2, 4, 2, 2, 2, 4,

In [8]:
print("Size of X: {}".format(X.shape))
print("Size of Y: {}".format(Y.shape))

Size of X: (683, 10)
Size of Y: (683,)


In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.25,
                                                    random_state=0,
                                                    shuffle=True)

In [10]:
print("Size of X_train: {}".format(X_train.shape))
print("Size of X_test: {}".format(X_test.shape))
print("Size of Y_train: {}".format(Y_train.shape))
print("Size of Y_test: {}".format(Y_test.shape))

Size of X_train: (512, 10)
Size of X_test: (171, 10)
Size of Y_train: (512,)
Size of Y_test: (171,)


In [11]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [12]:
classifier = XGBClassifier()
classifier.fit(X_train, Y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [13]:
y_pred = classifier.predict(X_test)
y_pred

array([2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 4, 2, 4, 2, 2, 2, 4, 4, 4, 2, 2, 2,
       4, 2, 4, 4, 2, 2, 2, 4, 2, 4, 4, 2, 2, 2, 4, 4, 2, 4, 2, 2, 2, 2,
       2, 2, 2, 4, 2, 2, 4, 2, 4, 2, 2, 2, 4, 4, 2, 4, 2, 2, 2, 2, 2, 2,
       2, 2, 4, 4, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 4, 2, 4, 2, 2, 4, 2, 4,
       4, 2, 4, 2, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 2, 2, 4, 4, 2, 2,
       4, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 4, 4, 2, 4, 2, 4, 2, 2,
       4, 2, 2, 4, 2, 4, 2, 2, 2, 2, 2, 2, 2, 4, 4, 2, 2, 2, 4, 2, 2, 2,
       2, 2, 4, 4, 2, 4, 4, 4, 4, 4, 4, 2, 2, 2, 2, 2, 2], dtype=int64)

In [14]:
cm = confusion_matrix(Y_test, y_pred)
cm

array([[103,   4],
       [  5,  59]], dtype=int64)

In [15]:
acc = accuracy_score(Y_test, y_pred)
print(f"The accuracy in percentage - {acc*100}%")

The accuracy in percentage - 94.73684210526315%


In [16]:
report = classification_report(Y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           2       0.95      0.96      0.96       107
           4       0.94      0.92      0.93        64

    accuracy                           0.95       171
   macro avg       0.95      0.94      0.94       171
weighted avg       0.95      0.95      0.95       171



In [17]:
acc = cross_val_score(estimator = classifier,
                      X = X_train,
                      y = Y_train,
                      n_jobs = -1,
                      verbose = 0,
                      cv = 10)
print(f"Accuracy Score: {acc.mean()*100:.3f}%")
print(f"Standard Deviation: {acc.std()*100:.2f} %")

Accuracy Score: 96.286%
Standard Deviation: 2.84 %


In [18]:
# start = time.time()

# parameters = [{'C': [0.25, 0.5, 0.75, 1], 
#                'kernel': ['linear']},
#               {'C': [0.25, 0.5, 0.75, 1], 
#                'kernel': ['rbf'], 
#                'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}]

# grid_search = GridSearchCV(estimator = classifier,
#                            param_grid = parameters,
#                            scoring = 'accuracy',
#                            n_jobs = -1,
#                            cv = 10,
#                            verbose = 1
#                           )
# grid_search.fit(X_train, Y_train)
# best_accuracy = grid_search.best_score_
# best_parameters = grid_search.best_params_
# print(f"Accuracy Score: {best_accuracy*100:.3f}%")
# print(f"Best Parameters: {best_parameters}")
# end = time.time()
# print(f"Total Time Taken {end - start}")

In [19]:
# # Training Set
# figure = plt.figure(figsize = (10,10))

# x_set, y_set = X_train, Y_train

# X1, X2 = np.meshgrid(np.arange(start = x_set[:, 0].min() - 1,
#                                stop = x_set[:, 0].max() + 1,
#                               step = 0.01),
#                     np.arange(start = x_set[:, 1].min() - 1,
#                                stop = x_set[:, 1].max() + 1,
#                               step = 0.01))

# plt.contourf(X1,
#              X2,
#              classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
#              alpha = 0.4,
#              cmap = ListedColormap(('red', 'green')))

# for i, j in enumerate(np.unique(y_set)):
#     plt.scatter(x_set[y_set == j, 0],
#                 x_set[y_set == j, 1],
#                 color = ListedColormap(('red', 'green'))(i),
#                 s = 15,
#                 marker = '*',
#                 label = j
#                )
# plt.xlim(X1.min(), X1.max())
# plt.ylim(X2.min(), X2.max())
# plt.title('Kernel - SVM Classifier (Training Set)')
# plt.xlabel('Age')
# plt.ylabel('Estimated Salary')
# plt.legend()

In [20]:
# # Visuaizing the test case result
# figure = plt.figure(figsize = (10,10))

# x_set, y_set = X_test, Y_test

# X1, X2 = np.meshgrid(np.arange(start = x_set[:, 0].min() - 1, 
#                                stop = x_set[:, 0].max() + 1,
#                                step = 0.01),
#                      np.arange(start = x_set[:, 1].min() - 1,
#                                stop = x_set[:, 1].max() + 1,
#                                step = 0.01))

# plt.contourf(X1,
#              X2,
#              classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
#              cmap = ListedColormap(('red', 'green')),
#              alpha = 0.4
#             )

# for i, j in enumerate(np.unique(y_set)):
#     plt.scatter(x_set[y_set == j, 0],
#                 x_set[y_set == j, 1 ],
#                 color = ListedColormap(('red', 'green'))(i),
#                 s = 15,
#                 label = j,
#                 marker = '^'
#                )
# plt.xlim(X1.min(), X1.max())
# plt.ylim(X2.min(), X2.max())
# plt.title("Kernel SVM - Test Case")
# plt.xlabel('Age')
# plt.ylabel('Estimated Salary')
# plt.legend()