In [22]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import seaborn as sb
from sklearn.ensemble import AdaBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

%matplotlib inline

In [2]:
base_directory = '~/workspace/personal/datasets/cancer'

columns = ['Sample code number', 'Clump Thickness',
'Uniformity of Cell Size',
'Uniformity of Cell Shape',
'Marginal Adhesion',
'Single Epithelial Cell Size',
'Bare Nuclei',
'Bland Chromatin',
'Normal Nucleoli',
'Mitoses',
'Class']

In [3]:
def using_model(model, scoring, cv):
    print("****************** {} ******************".format(model))
    if scoring is not None:
        scores_ = cross_val_score(model, X_train, y_train, scoring=scoring, cv=cv)
        print("scoring={} cv={}".format(scoring, cv), scores_.mean(), scores_.std())
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    print("accuracy_score=", accuracy_score(y_test, y_predict))
    print("precision_score=", precision_score(y_test, y_predict, average='micro'))
    print("recall_score=", recall_score(y_test, y_predict, average='micro'))

In [4]:
def load_csv_data(directory, file_name, names=None, header=None, skiprows=0, skipinitialspace=True):
    return pd.read_csv(os.path.join(directory, file_name), names=names, header=header, skiprows=skiprows, skipinitialspace=skipinitialspace)

In [5]:
def confusion_matrix(model, X_train, Y_train):
    from sklearn.metrics import confusion_matrix

    y_scores_ = cross_val_predict(model, X_train, Y_train, cv=3, method="decision_function")
    model.fit(X_train, Y_train)
    Y_predict = model.predict(X_train)
    conf_mx = confusion_matrix(Y_train, Y_predict)
    plt.matshow(conf_mx, cmap=plt.cm.gray)
#     print("accuracy_score=", accuracy_score(Y_train, Y_predict))
#     print("precision_score=", precision_score(Y_train, Y_predict))
#     print("recall_score=", recall_score(Y_train, Y_predict))
    print(lb.classes_)
    print(lb.transform(lb.classes_))
    print(conf_mx)
    row_sums = conf_mx.sum(axis=1, keepdims=True)
    norm_conf_mx = conf_mx / row_sums

    np.fill_diagonal(norm_conf_mx, 0)
    plt.matshow(norm_conf_mx, cmap=plt.cm.gray)

In [6]:
df = load_csv_data(base_directory, 'breast-cancer-wisconsin.data', names=columns)
df = df.drop('Sample code number', axis=1)

In [7]:
df.describe()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bland Chromatin,Normal Nucleoli,Mitoses,Class
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,4.41774,3.134478,3.207439,2.806867,3.216023,3.437768,2.866953,1.589413,2.689557
std,2.815741,3.051459,2.971913,2.855379,2.2143,2.438364,3.053634,1.715078,0.951273
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [8]:
df.loc[df['Bare Nuclei'] == '?', 'Bare Nuclei'] = np.nan
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Clump Thickness              699 non-null    int64 
 1   Uniformity of Cell Size      699 non-null    int64 
 2   Uniformity of Cell Shape     699 non-null    int64 
 3   Marginal Adhesion            699 non-null    int64 
 4   Single Epithelial Cell Size  699 non-null    int64 
 5   Bare Nuclei                  683 non-null    object
 6   Bland Chromatin              699 non-null    int64 
 7   Normal Nucleoli              699 non-null    int64 
 8   Mitoses                      699 non-null    int64 
 9   Class                        699 non-null    int64 
dtypes: int64(9), object(1)
memory usage: 54.7+ KB


In [9]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42) 

for train_index, test_index in split.split(
    df.drop('Class', axis=1),
    df['Class']):
        train_set = df.loc[train_index]
        test_set = df.loc[test_index]

In [10]:
X_train = train_set.drop('Class', axis=1)
y_train = train_set['Class']

X_test = test_set.drop('Class', axis=1)
y_test = test_set['Class']


In [11]:
num_pipeline = Pipeline([
            ('imputer', SimpleImputer(strategy="median")),
            ('std_scaler', StandardScaler()),
            #('minmax_scaler', MinMaxScaler()),
        ])

X_train = num_pipeline.fit_transform(X_train)
X_test = num_pipeline.fit_transform(X_test)

lb = LabelEncoder()
y_train = lb.fit_transform(y_train)
y_test = lb.transform(y_test)

In [13]:
using_model(LogisticRegression(max_iter=500, multi_class="multinomial"), None, cv=10)

****************** LogisticRegression(max_iter=500, multi_class='multinomial') ******************
accuracy_score= 0.9571428571428572
precision_score= 0.9571428571428572
recall_score= 0.9571428571428572


In [14]:
using_model(SGDClassifier(), None, 10)

****************** SGDClassifier() ******************
accuracy_score= 0.9642857142857143
precision_score= 0.9642857142857143
recall_score= 0.9642857142857143


In [20]:
using_model(SVC(), None, 10)

****************** SVC() ******************
accuracy_score= 0.9571428571428572
precision_score= 0.9571428571428572
recall_score= 0.9571428571428572


In [15]:
using_model(RandomForestClassifier(), None, 10)

****************** RandomForestClassifier() ******************
accuracy_score= 0.9571428571428572
precision_score= 0.9571428571428572
recall_score= 0.9571428571428572


In [16]:
using_model(DecisionTreeClassifier(), None, 10)

****************** DecisionTreeClassifier() ******************
accuracy_score= 0.9285714285714286
precision_score= 0.9285714285714286
recall_score= 0.9285714285714286


In [17]:
a = AdaBoostClassifier(DecisionTreeClassifier(max_depth=15), n_estimators=225, 
                       algorithm="SAMME.R", learning_rate=0.3)
using_model(a, None, 10)

****************** AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=15),
                   learning_rate=0.3, n_estimators=225) ******************
accuracy_score= 0.9214285714285714
precision_score= 0.9214285714285714
recall_score= 0.9214285714285714


In [18]:
a1 = AdaBoostClassifier(LogisticRegression(max_iter=500, multi_class="multinomial"),
                        n_estimators=225, algorithm="SAMME.R", learning_rate=0.3)
using_model(a1, None, 10)

****************** AdaBoostClassifier(estimator=LogisticRegression(max_iter=500,
                                                multi_class='multinomial'),
                   learning_rate=0.3, n_estimators=225) ******************
accuracy_score= 0.9571428571428572
precision_score= 0.9571428571428572
recall_score= 0.9571428571428572


In [19]:
ovr = OneVsRestClassifier(SVC())

using_model(ovr, None, 10)

****************** OneVsRestClassifier(estimator=SVC()) ******************
accuracy_score= 0.9571428571428572
precision_score= 0.9571428571428572
recall_score= 0.9571428571428572


In [32]:
param_grid = [
        {'n_estimators': [50, 60, 70, 71, 75, 100, 120, 150], 'max_features': [2, 4, 6, 8]},
        {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
    ]


# m = RandomForestRegressor()
# m.fit(X_train, y_train)
    
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=10)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_estimator_)

{'max_features': 2, 'n_estimators': 120}
RandomForestClassifier(max_features=2, n_estimators=120)


In [33]:
using_model(RandomForestClassifier(max_features=2, n_estimators=120), None, 10)

****************** RandomForestClassifier(max_features=2, n_estimators=120) ******************
accuracy_score= 0.9642857142857143
precision_score= 0.9642857142857143
recall_score= 0.9642857142857143


In [55]:
param_grid = [
        {'max_iter': [500, 600, 1000, 1200, 1300, 1310, 1350, 1400],
         'penalty': ['l2', 'l1', 'elasticnet', None]}
    ]

grid_search = GridSearchCV(SGDClassifier(), param_grid, cv=10)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_estimator_)

using_model(SGDClassifier(max_iter=1300, penalty='elasticnet'), None, 10)

{'max_iter': 1200, 'penalty': 'l1'}
SGDClassifier(max_iter=1200, penalty='l1')
****************** SGDClassifier(max_iter=1300, penalty='elasticnet') ******************
accuracy_score= 0.9571428571428572
precision_score= 0.9571428571428572
recall_score= 0.9571428571428572
