#### Use the Pima Indians diabetes dataset. Construct a classification model using an SVM classifier. Evaluate the model's accuracy, precision and recall.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from category_encoders import TargetEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import warnings

In [2]:
# Load the diabetes CSV straight from GitHub, then preview the first rows.
data = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/rahul96rajan/sample_datasets/master/diabetes.csv'
)
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Inspect column dtypes and non-null counts of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [4]:
# 'Outcome' is the target; every remaining column is used as a feature.
y = data['Outcome']
X = data.drop(columns='Outcome')

In [5]:
# Class balance of the target: number of rows per Outcome value.
y.value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [6]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Summary statistics of the training features; check the `min` row for
# implausible zero values.
X_train.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,614.0,614.0,614.0,614.0,614.0,614.0,614.0,614.0
mean,3.742671,120.855049,69.415309,20.399023,81.438111,31.983388,0.469168,32.907166
std,3.313264,32.035057,18.512599,15.433974,116.234835,7.740625,0.336847,11.503437
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,1.0,100.0,64.0,0.0,0.0,27.1,0.2415,24.0
50%,3.0,117.0,72.0,23.0,42.5,32.0,0.3725,29.0
75%,6.0,139.0,80.0,32.0,129.75,36.375,0.61375,40.0
max,17.0,199.0,122.0,63.0,846.0,67.1,2.42,81.0


In [8]:
# Custom Estimator
class FixFalseZeroes(BaseEstimator, TransformerMixin):
    """
    A stateless custom transformer used to pre-process the Pima Indians
    diabetes dataset.

    Methods
    -------
    fit(X, y=None)
        Effectively nothing is performed in fitting; returns the transformer.
    transform(X)
        Returns a transformed copy of the given dataframe such that:
            - the false zeroes in the 'Glucose', 'BloodPressure',
              'SkinThickness', 'Insulin', 'BMI' features are changed to
              numpy.nan
            - an AgeGroup (categorical) feature is formed from Age
              (continuous), and Age itself is dropped
            - 'Pregnancies' is converted to a categorical feature
    """
    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; present so the class can be used in a Pipeline."""
        return self

    def transform(self, X):
        """
        This method transforms:
            - the false zeroes in the 'Glucose', 'BloodPressure',
              'SkinThickness', 'Insulin', 'BMI' features to numpy.nan
            - Age (continuous) into an AgeGroup (categorical) feature,
              dropping the original Age column
            - 'Pregnancies' into a categorical feature with labels 'A'-'E'

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the columns listed above plus 'Age' and
            'Pregnancies'. It is not modified; a copy is processed.

        Returns
        -------
        pandas.DataFrame
            a processed DataFrame.
        """
        Xdata = X.copy()
        false_zeroes_features = ['Glucose', 'BloodPressure', 'SkinThickness',
                                 'Insulin', 'BMI']
        # Assign whole columns instead of writing through .loc[:, cols]:
        # the replaced columns are float (they now hold NaN), and setting
        # them into the existing integer columns via .loc is an
        # incompatible-dtype assignment on recent pandas versions.
        Xdata[false_zeroes_features] = (
            Xdata[false_zeroes_features].replace(0, np.nan)
        )
        Xdata['AgeGroup'] = pd.cut(Xdata['Age'], [0, 5, 15, 30, 60, 120],
                                   labels=['child', 'teen', 'young_adult',
                                           'mature_adult', 'elderly'])
        # pd.cut bins are right-closed, so without include_lowest=True the
        # first bin is (0, 2] and every row with 0 pregnancies would be
        # turned into NaN instead of falling into category 'A'.
        Xdata['Pregnancies'] = pd.cut(Xdata['Pregnancies'], [0, 2, 5, 10, 15, 20],
                                      labels=['A', 'B', 'C', 'D', 'E'],
                                      include_lowest=True)
        return Xdata.drop(columns='Age')

In [9]:
# Continuous columns: imputed and scaled by the numeric branch.
num_features = [
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
]
# Binned columns: handled by the categorical branch.
cat_features = [
    'Pregnancies',
    'AgeGroup',
]

In [10]:
# Numeric branch: fill missing values with the column median, then standardise.
num_transformer = Pipeline([
    ('imp_med', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical branch: fill missing values with the most frequent category,
# then target-encode.
cat_transformer = Pipeline([
    ('imp_mod', SimpleImputer(strategy='most_frequent')),
    ('tarenc', TargetEncoder()),
])

In [11]:
# Route each group of columns to its own transformer branch.
preprocessor_CT = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features),
    ]
)

In [12]:
# Full preprocessing: custom clean-up first, then the per-column transformers.
preprop_pipe = Pipeline([
    ('trimmer', FixFalseZeroes()),
    ('col_trans', preprocessor_CT),
])

In [13]:
# Suppress FutureWarning only. A blanket 'ignore' would hide every other
# warning category for the rest of the session as well.
warnings.filterwarnings('ignore', category=FutureWarning)

# Fit the preprocessing on the training split, then apply the fitted
# transformation to the test split.
X_train_proc = preprop_pipe.fit_transform(X_train, y_train)
X_test_proc = preprop_pipe.transform(X_test)

In [14]:
svc = SVC(random_state=42)

# One grid per kernel, so only hyper-parameters the kernel actually uses are
# searched. gamma has no effect on the linear kernel; searching it there only
# repeats identical fits for every gamma value.
params = [
    dict(kernel=['linear'],
         C=[0.1, 0.5, 1, 1.5]),
    dict(kernel=['rbf'],
         gamma=[0.1, 0.5, 1],
         C=[0.1, 0.5, 1, 1.5]),
    dict(kernel=['poly'], degree=[2],
         C=[0.1, 0.5, 1, 1.5], gamma=[0.1, 1, 5]),
]

# 5-fold cross-validated search scored on F1, using all available cores.
gscv_clf = GridSearchCV(svc, params, n_jobs=-1, cv=5, scoring='f1')

In [15]:
# Run the grid search on the preprocessed training data.
gscv_clf.fit(X_train_proc, y_train)

# Report the winning configuration and its cross-validated score.
print(
    'Best Estimator :: {0}\n\nScore :: {1}'.format(
        gscv_clf.best_estimator_,
        gscv_clf.best_score_,
    )
)

Best Estimator :: SVC(C=1, degree=2, gamma=5, kernel='poly', random_state=42)

Score :: 0.605112848545564


In [16]:
# Predictions for the training split first, then the test split.
y_pred_train, y_pred_test = [gscv_clf.predict(features)
                             for features in (X_train_proc, X_test_proc)]

# One [true labels, predicted labels] pair per split, in Training/Testing order.
values = [
    [y_train, y_pred_train],
    [y_test, y_pred_test],
]

In [17]:
# Build concrete lists rather than one-shot `map` iterators: an iterator is
# exhausted after its first use, so any cell reading these scores could not be
# re-run on its own. Each list holds [training score, testing score],
# rounded to 2 decimals.
accu_scores = [round(accuracy_score(y_true, y_hat), 2)
               for y_true, y_hat in values]
preci_scores = [round(precision_score(y_true, y_hat), 2)
                for y_true, y_hat in values]
recall_scores = [round(recall_score(y_true, y_hat), 2)
                 for y_true, y_hat in values]

In [18]:
# One row per split, one column per metric.
metric_df = pd.DataFrame(
    data={
        "Accuracy": accu_scores,
        "Precision": preci_scores,
        "Recall": recall_scores,
    },
    index=['Training', 'Testing'],
)

display(metric_df)

Unnamed: 0,Accuracy,Precision,Recall
Training,0.79,0.77,0.58
Testing,0.77,0.7,0.64
