Kaggle: https://www.kaggle.com/uciml/breast-cancer-wisconsin-data

In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder

In [None]:
# Location of the Kaggle breast-cancer CSV. NOTE: despite the "_DIR" suffix,
# this is the path of the data file itself, not of a directory.
TRAIN_DIR = '../input/breast-cancer-wisconsin-data/data.csv'

# Exploratory Data Analysis - Take a Glance at the Data

In [None]:
# Load the comma-separated file (its first row holds the column names) and
# discard the 'id' and 'Unnamed: 32' columns in one chained expression.
train = pd.read_csv(TRAIN_DIR).drop(columns=['id', 'Unnamed: 32'])

In [None]:
# Display the first few rows as a quick sanity check of the loaded frame.
train.head()

In [None]:
# Display the frame dimensions as a (rows, columns) tuple.
train.shape

In [None]:
# Display per-column summary statistics of the frame.
train.describe()

# Prepare Data

In [None]:
# Separate the feature matrix from the target. The target is encoded as
# +1 for 'M' and -1 for every other diagnosis value.
X = train.drop(columns='diagnosis')
y = train['diagnosis'].map(lambda label: 1 if label == 'M' else -1)
print(f'X shape: {X.shape}')
print(f'y shape: {y.shape}')

# Exploratory Data Analysis - Take a look at the Label

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Bar chart of how many samples carry each label (-1 / +1).
# The series is passed by keyword: in recent seaborn releases the first
# positional parameter of countplot is `data`, not `x`, so a bare positional
# call no longer plots the label counts as intended.
sns.countplot(x=y)

# Exploratory Data Analysis - Take a look at the Features

In [None]:
# One panel per feature, overlaying the distribution of the two classes
# (y == -1 vs. y == 1) so their separation can be judged visually.
fig = plt.figure(figsize=(24, 18))
n_cols = 6
# Ceiling division: enough rows for every feature (5 rows for 30 features).
n_rows = -(-len(X.columns) // n_cols)
for i, column in enumerate(X.columns):
    plt.subplot(n_rows, n_cols, i + 1)
    plt.title(column)
    # Semi-transparent bars keep the first histogram visible underneath the
    # second one where the two classes overlap.
    plt.hist(X[column][y == -1], bins=25, color='lightblue', alpha=0.6, label='B-healthy')
    plt.hist(X[column][y == 1], bins=25, color='grey', alpha=0.6, label='M-bad')
    # The labels above are only shown if a legend is actually drawn.
    plt.legend()
# Prevent panel titles from colliding with the axes of the row above.
plt.tight_layout()

# Models

In [None]:
from sklearn.model_selection import cross_val_score

1.Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Mean accuracy over 8 cross-validation folds for logistic regression.
# The features are unscaled, so the default solver does not converge within
# the default 100 iterations; the resulting ConvergenceWarning is hidden by
# the global warnings filter. Give the solver enough iterations to converge.
logreg = LogisticRegression(max_iter=10000)
cross_val_score(logreg, X, y, cv=8).mean()

2.Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Depth-limited (max depth 6) decision tree, scored as the mean over
# 8 cross-validation folds.
dt = DecisionTreeClassifier(max_depth=6)
np.mean(cross_val_score(dt, X, y, cv=8))

3.Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 20 trees, scored as the mean over 8 cross-validation folds.
rf = RandomForestClassifier(n_estimators=20)
np.mean(cross_val_score(rf, X, y, cv=8))

4.AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# scikit-learn's AdaBoost with 200 boosting rounds, scored as the mean over
# 8 cross-validation folds.
ada = AdaBoostClassifier(n_estimators=200)
np.mean(cross_val_score(ada, X, y, cv=8))

5.AdaBoost (Manual Implementation)

In [None]:
from sklearn.base import BaseEstimator
class AdaBoost(BaseEstimator):
    """Hand-written discrete AdaBoost for binary labels encoded as -1 / +1.

    Each boosting round fits a depth-3 ``DecisionTreeClassifier`` on the
    training set using the current per-sample weights, then re-weights the
    samples so that misclassified ones count more in the next round.

    Parameters
    ----------
    n_estimators : int
        Maximum number of boosting rounds (weak learners) to fit.

    Attributes set by ``fit``
    -------------------------
    models : list
        The fitted weak learners, in the order they were trained.
    model_weights : list of float
        The vote weight of each weak learner, aligned with ``models``.
    """

    def __init__(self, n_estimators):
        self.n_estimators = n_estimators

    def fit(self, X, y):
        """Fit the boosted ensemble and return ``self``.

        ``y`` must contain only -1 and +1: the weight update relies on
        ``y * prediction`` being +1 for a hit and -1 for a miss.
        """
        # Work on a plain array so that a pandas Series with a non-default
        # index cannot turn the weight vector into an index-aligned Series.
        y = np.asarray(y)

        self.models = []
        self.model_weights = []

        num_samples = X.shape[0]
        alpha = np.full(num_samples, 1.0 / num_samples)  # uniform start
        eps = np.finfo(float).eps

        for _ in range(self.n_estimators):
            tree = DecisionTreeClassifier(max_depth=3)
            tree.fit(X, y, sample_weight=alpha)
            prediction = tree.predict(X)
            raw_error = alpha.dot(prediction != y)

            # Keep the error strictly inside (0, 1): a perfect (or perfectly
            # wrong) learner would otherwise produce log(0), an infinite
            # model weight and NaN sample weights on the next round.
            weighted_error = np.clip(raw_error, eps, 1.0 - eps)
            model_weight = 0.5 * (np.log(1 - weighted_error) - np.log(weighted_error))

            self.models.append(tree)
            self.model_weights.append(model_weight)

            if raw_error <= eps:
                # Every sample is already classified correctly; the weights
                # would not change, so further rounds add nothing.
                break

            # Increase the weight of misclassified samples, decrease the
            # weight of correct ones, then renormalise to sum to 1.
            alpha = alpha * np.exp(-model_weight * y * prediction)
            alpha = alpha / alpha.sum()

        # Returning self follows the scikit-learn estimator convention and
        # allows call chaining such as ``AdaBoost(10).fit(X, y).predict(X)``.
        return self

    def predict(self, X):
        """Return the weighted-majority vote of the ensemble as -1.0 / +1.0.

        An exact tie (weighted sum of 0) is resolved to +1.0 so that the
        result is always a valid label rather than 0.
        """
        results = np.zeros(len(X))
        for model, model_weight in zip(self.models, self.model_weights):
            results += model_weight * model.predict(X)
        return np.where(results >= 0, 1.0, -1.0)

    def score(self, X, y):
        """Return the fraction of samples in ``X`` predicted equal to ``y``."""
        prediction = self.predict(X)
        return np.mean(prediction == np.asarray(y))

In [None]:
# The hand-written AdaBoost with 200 boosting rounds, scored as the mean over
# 8 cross-validation folds.
ada2 = AdaBoost(n_estimators=200)
np.mean(cross_val_score(ada2, X, y, cv=8))