In [1]:
import numpy as np
import pandas as pd
import warnings
# Silence every warning for the rest of the session: no category or module
# filter is given, so library deprecation notices are hidden as well.
warnings.filterwarnings('ignore')

In [2]:
#Data Ref: https://www.kaggle.com/uciml/breast-cancer-wisconsin-data


In [3]:
# Load the dataset from a CSV file located in the working directory.
csv_path = 'cancer.csv'
data = pd.read_csv(csv_path)

In [4]:
# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
id                   569 non-null int64
diagnosis            569 non-null object
radius_mean          569 non-null float64
texture_mean         569 non-null float64
perimeter_mean       569 non-null float64
area_mean            569 non-null float64
smoothness_mean      569 non-null float64
compactness_mean     569 non-null float64
concavity_mean       569 non-null float64
points_mean          569 non-null float64
symmetry_mean        569 non-null float64
dimension_mean       569 non-null float64
radius_se            569 non-null float64
texture_se           569 non-null float64
perimeter_se         569 non-null float64
area_se              569 non-null float64
smoothness_se        569 non-null float64
compactness_se       569 non-null float64
concavity_se         569 non-null float64
points_se            569 non-null float64
symmetry_se          569 non-null float64
dimension_se    

In [5]:
# Distinct class labels present in the diagnosis column.
data['diagnosis'].unique()

array(['B', 'M'], dtype=object)

In [6]:
# Number of samples in each diagnosis class.
data['diagnosis'].value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

In [7]:
# Feature matrix: every column except the record id and the diagnosis target
# (these are the first two columns of the frame).
features = data.drop(columns=['id', 'diagnosis']).values
# Target vector: the diagnosis column.
label = data['diagnosis'].values

# XGBoost - XGBClassifier
### This classifier internally uses decision trees to build the model. Decision trees tend to overfit the training data. To reduce that, you can fine-tune the model using the learning_rate hyperparameter.

#### The ideal range for learning_rate is between 0 and 1.
#### Always start with 0.001; the step size is 0.001.

In [7]:
#pip install xgboost

In [8]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Sweep 100 different train/test partitions (random_state 1..100), fit a
# default XGBClassifier on each, and report only the seeds whose held-out
# accuracy is strictly higher than the training accuracy.
for i in range(1, 101):
    X_train, X_test, y_train, y_test = train_test_split(
        features, label, test_size=0.2, random_state=i
    )

    model = XGBClassifier()
    model.fit(X_train, y_train)

    # Mean accuracy on the data the model was fitted on vs. the held-out 20%.
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)

    # Skip splits where the model does no better on unseen data.
    if not test_score > train_score:
        continue
    print("Testing: {}, Train: {}, RS: {}".format(test_score, train_score, i))

In [9]:
#The cell above printed nothing: no split had a test score higher than its train score,
#which points to OVERFITTING. One way to work around the overfitting issue is to control
#the learning rate.

In [10]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Repeat the 100-seed split sweep, this time with learning_rate lowered to
# 0.01, and list the seeds where test accuracy exceeds train accuracy.
for i in range(1, 101):
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        label,
        random_state=i,
        test_size=0.2,
    )

    model = XGBClassifier(learning_rate=0.01)
    model.fit(X_train, y_train)

    test_score = model.score(X_test, y_test)
    train_score = model.score(X_train, y_train)

    # Only splits that score better on the held-out set are reported.
    if train_score < test_score:
        print("Testing: {}, Train: {}, RS: {}".format(test_score, train_score, i))

Testing: 0.9912280701754386, Train: 0.9846153846153847, RS: 42
Testing: 0.9912280701754386, Train: 0.9846153846153847, RS: 44
Testing: 0.9824561403508771, Train: 0.9802197802197802, RS: 54
Testing: 0.9912280701754386, Train: 0.9868131868131869, RS: 72


### If XGBClassifier fails, go for XGBRFClassifier

# XGBRFClassifier uses the Random Forest algorithm

In [11]:
#Steps to perform in your lab
# 1. sudo pip install --upgrade pip
# 2. sudo pip install xgboost

In [8]:
from sklearn.model_selection import train_test_split
from xgboost import XGBRFClassifier

# Same 100-seed split sweep, using the random-forest flavoured XGBRFClassifier.
# NOTE: in the recorded run the import of XGBRFClassifier in this cell raised
# ImportError, so this loop did not execute there.
for i in range(1, 101):
    X_train, X_test, y_train, y_test = train_test_split(
        features, label, test_size=0.2, random_state=i
    )

    model = XGBRFClassifier()
    model.fit(X_train, y_train)

    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)

    # Report the seed only when held-out accuracy beats training accuracy.
    if test_score <= train_score:
        continue
    print("Testing: {}, Train: {}, RS: {}".format(test_score, train_score, i))

ImportError: cannot import name 'XGBRFClassifier' from 'xgboost' (/opt/anaconda3/lib/python3.7/site-packages/xgboost/__init__.py)

# Adaptive Boosting

In [13]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
# Candidate base learners for AdaBoost: a logistic regression and a
# linear-kernel support vector classifier.
algorithm = LogisticRegression()
algorithmSVM = SVC(kernel="linear")

# AdaBoost ensemble of 101 estimators built on the logistic-regression learner.
model1 = AdaBoostClassifier(
    n_estimators=101,
    base_estimator=algorithm,
)

In [16]:
# Sweep random_state 1..100: fit an AdaBoost ensemble (SAMME, 101 estimators)
# on top of the linear SVC and print the seeds where the held-out accuracy is
# strictly higher than the training accuracy.
for i in range(1, 101):
    X_train, X_test, y_train, y_test = train_test_split(
        features, label, test_size=0.2, random_state=i
    )

    model = AdaBoostClassifier(
        base_estimator=algorithmSVM,
        algorithm='SAMME',
        n_estimators=101,
    )
    model.fit(X_train, y_train)

    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)

    if not test_score > train_score:
        continue
    print("Testing: {}, Train: {}, RS: {}".format(test_score, train_score, i))

Testing: 0.9649122807017544, Train: 0.9516483516483516, RS: 2
Testing: 0.9649122807017544, Train: 0.9472527472527472, RS: 9
Testing: 0.9473684210526315, Train: 0.9406593406593406, RS: 10
Testing: 0.9736842105263158, Train: 0.9472527472527472, RS: 14
Testing: 0.956140350877193, Train: 0.9516483516483516, RS: 15
Testing: 0.9736842105263158, Train: 0.945054945054945, RS: 16
Testing: 0.956140350877193, Train: 0.9560439560439561, RS: 19
Testing: 0.9649122807017544, Train: 0.9538461538461539, RS: 21
Testing: 0.956140350877193, Train: 0.9516483516483516, RS: 22
Testing: 0.956140350877193, Train: 0.9494505494505494, RS: 25
Testing: 0.956140350877193, Train: 0.9516483516483516, RS: 30
Testing: 0.9473684210526315, Train: 0.9472527472527472, RS: 31
Testing: 0.9649122807017544, Train: 0.9494505494505494, RS: 32
Testing: 0.956140350877193, Train: 0.9494505494505494, RS: 33
Testing: 0.956140350877193, Train: 0.9472527472527472, RS: 35
Testing: 0.9649122807017544, Train: 0.9494505494505494, RS: 36
Te