## Read in libraries and import functions

In [None]:
import numpy as np
import pandas as pd
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

In [None]:
# Read and clean dataset
df = pd.read_csv('../input/data.csv', header=0)
print(df.columns)

# Drop the "Unnamed: 32" column (presumably an empty artefact column of the CSV - TODO confirm).
# The former `df[df.notnull()]` mask only re-wrote existing nulls as NaN, so it has been removed,
# and the axis is now named explicitly because the positional form `drop(label, 1)` is no longer
# accepted by current pandas.
df = df.drop(columns=["Unnamed: 32"])

In [None]:
from sklearn.model_selection import train_test_split

#Split dataset into Train and Test
# A fixed random_state makes the partition - and every score reported further down -
# reproducible across kernel restarts.
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)

### Convert "Diagnosis" from string values 'B'/'M' to binary (0/1)

In [None]:
#Benign = 0 and Malignant = 1
# An explicit lookup table is used instead of `0 if x == "B" else 1`:
#  * already-encoded 0/1 values map to themselves, so re-running this cell is harmless
#    (the lambda version turned every label into 1 on a second run);
#  * an unexpected label becomes NaN instead of being silently counted as malignant.
# `assign` builds new frames rather than writing into the frames returned by the split.
diagnosis_codes = {"B": 0, "M": 1, 0: 0, 1: 1}
df_train = df_train.assign(diagnosis=df_train["diagnosis"].map(diagnosis_codes))
df_test = df_test.assign(diagnosis=df_test["diagnosis"].map(diagnosis_codes))

## Dataset Characteristics

In [None]:
# Preview the first five rows of the training partition
df_train.head(n=5)

In [None]:
# Summary statistics of the training partition
summary_stats = df_train.describe()
print(summary_stats)

# Frame dimensions followed by the number of missing values per column
missing_per_column = df_train.isnull().sum()
print(df_train.shape, missing_per_column)

## Separate columns into smaller dataframes to perform visualisations

In [None]:
#Break up columns into groups, according to their suffix designation (_mean, _se,
# and _worst) to perform visualisation plots off. 'id' and 'diagnosis' are kept in
# their own frames so they can be joined back on when needed.
df_id_diag = df_train.loc[:, ["id", "diagnosis"]]
df_diag = df_train.loc[:, ["diagnosis"]]

# Select each group by its column-name suffix.
# `.ix` no longer exists in pandas (removed in 1.0). The earlier positional slices
# (2:11 and 12:21) also appear to have stopped one column short: the log-transform
# cells list nine _mean/_se columns but ten _worst columns. Selecting by suffix
# follows the intent stated above and does not depend on column order.
df_cut1 = df_train.filter(regex=r"_mean$")
df_cut2 = df_train.filter(regex=r"_se$")
df_cut3 = df_train.filter(regex=r"_worst$")

#print(df_id_diag.columns)
print(df_cut1.columns)
print(df_cut2.columns)
print(df_cut3.columns)

## Visualise distribution of data via histograms

In [None]:
# Histogram grid for the CUT1 variables (10 bins per variable)
# To look at a single variable in more detail, call .hist on that column instead,
# e.g. with bins=100.
df_cut1.hist(figsize=(10, 10), bins=10)

In [None]:
# Histogram grid for the CUT2 variables (10 bins per variable)
df_cut2.hist(figsize=(10, 10), bins=10)

In [None]:
# Histogram grid for the CUT3 variables (10 bins per variable)
df_cut3.hist(figsize=(10, 10), bins=10)

### Box plot _MEAN variables

In [None]:
#Plot _mean predictor variables (i.e. df_cut1)
plt.rcParams['figure.figsize']=(15,11)
# Pass the frame explicitly as `data=`: the meaning of the first positional
# argument of sns.boxplot has differed between seaborn releases.
sns.boxplot(data=df_cut1)

The initial assumption was that, as with many biological phenomena, the tumour measurements may follow a log-normal distribution. The data will therefore be log-transformed and re-displayed in box plots.

In [None]:
# log10-transform the listed _mean predictors into a new frame
log_columns = [
    'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
    'smoothness_mean', 'compactness_mean', 'concavity_mean',
    'concave points_mean', 'symmetry_mean',
]
df_cut1_log_trans = df_cut1[log_columns].apply(np.log10)

In [None]:
#Reproduce boxplot for _mean variables (log10-transformed)
plt.rcParams['figure.figsize']=(15,11)
# Pass the frame explicitly as `data=`: the meaning of the first positional
# argument of sns.boxplot has differed between seaborn releases.
sns.boxplot(data=df_cut1_log_trans)

### Box plot _SE variables

In [None]:
#Plot _se predictor variables (i.e. df_cut2)
plt.rcParams['figure.figsize']=(10,10)
# Pass the frame explicitly as `data=`: the meaning of the first positional
# argument of sns.boxplot has differed between seaborn releases.
sns.boxplot(data=df_cut2)

In [None]:
# log10-transform the listed _se predictors into a new frame
log_columns = [
    'radius_se', 'texture_se', 'perimeter_se', 'area_se',
    'smoothness_se', 'compactness_se', 'concavity_se',
    'concave points_se', 'symmetry_se',
]
df_cut2_log_trans = df_cut2[log_columns].apply(np.log10)

In [None]:
#Reproduce boxplot for _SE variables (log10-transformed)
plt.rcParams['figure.figsize']=(15,11)
# Plot the _se frame: this cell previously re-plotted df_cut1_log_trans (the _mean data).
# The frame is passed as `data=` because the meaning of the first positional argument
# of sns.boxplot has differed between seaborn releases.
sns.boxplot(data=df_cut2_log_trans)

### Box plot _WORST variables

In [None]:
#Plot _WORST predictor variables (i.e. df_cut3)
plt.rcParams['figure.figsize']=(10,10)
# Pass the frame explicitly as `data=`: the meaning of the first positional
# argument of sns.boxplot has differed between seaborn releases.
sns.boxplot(data=df_cut3)

In [None]:
# log10-transform the listed _worst predictors into a new frame
log_columns = [
    'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
    'smoothness_worst', 'compactness_worst', 'concavity_worst',
    'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst',
]
df_cut3_log_trans = df_cut3[log_columns].apply(np.log10)

In [None]:
#Reproduce boxplot for _WORST variables (log10-transformed)
plt.rcParams['figure.figsize']=(15,11)
# Pass the frame explicitly as `data=`: the meaning of the first positional
# argument of sns.boxplot has differed between seaborn releases.
sns.boxplot(data=df_cut3_log_trans)

After the log transformations, the data appear to be reasonably close to normally distributed. The visual analysis now moves on to identifying whether the classes can be separated, and the degree of collinearity between predictors, using pairplots. It should be noted that the distributions of smoothness, compactness, concavity, concave points, symmetry, and fractal dimension overlap heavily for all meta-parameters (mean, se, and worst).

## Visualisation: Pairplots

Due to the number of variables, each dataframe cut will be assessed individually. This is not ideal, as it would be preferable to visualise each predictor plotted against one another. However, given that each dataframe cut (mean, se, worst) has a respective version of each of the parameters (radius, texture, perimeter, area, smoothness, compactness, concavity, concave points, symmetry, fractal_dimension), it is assumed that collinear relationships will carry across cuts

In [None]:
#Merge back on Diagnosis (to allow discrimination) for the dataframe containing _MEAN predictor variables before plotting pairplot
# df_cut1 is the _mean cut built earlier in the notebook; reusing it replaces the
# `.ix` slice, which no longer exists in pandas.
df_diag_cut1 = df_diag.join(df_cut1)

# Quick plot of the data using seaborn
sns.pairplot(df_diag_cut1, hue="diagnosis")
# seaborn no longer exposes pyplot as `sns.plt`; use the `plt` imported at the top
plt.show()

In [None]:
#Repeat for dataframe with _SE before plotting pairplot
# df_cut2 is the _se cut built earlier in the notebook; reusing it replaces the
# `.ix` slice, which no longer exists in pandas.
df_diag_cut2 = df_diag.join(df_cut2)

# Quick plot of the data using seaborn
sns.pairplot(df_diag_cut2, hue="diagnosis")
# seaborn no longer exposes pyplot as `sns.plt`; use the `plt` imported at the top
plt.show()

In [None]:
#Merge back on Diagnosis for the dataframe containing _WORST predictor variables before plotting pairplot
# df_cut3 is the _worst cut built earlier in the notebook; reusing it replaces the
# `.ix` slice, which no longer exists in pandas.
df_diag_cut3 = df_diag.join(df_cut3)

# Quick plot of the data using seaborn
sns.pairplot(df_diag_cut3, hue="diagnosis")
# seaborn no longer exposes pyplot as `sns.plt`; use the `plt` imported at the top
plt.show()

# Begin model building

### Take slices of the data to create the training (X) and target (Y) arrays

In [None]:
#Specify Target (i.e. class labels in classification):
# `.iloc` is the positional replacement for the removed `.ix` indexer.
# Column 1 is taken as the label and columns 2 onwards as the predictors.
Y_train = df_train.iloc[:, 1]

#Do the same for the Training sample space
X_train = df_train.iloc[:, 2:]

#Y_train.head()
#X_train.head()

In [None]:
#Specify target from the TEST dataset
# `.iloc` is the positional replacement for the removed `.ix` indexer.
Y_test = df_test.iloc[:, 1]

#Cut to match training df
X_test = df_test.iloc[:, 2:]


#Create a dataset of the test data's 'true' results to enable a comparison
#with model output if required for Confusion Matrices
df_true_diagnosis = df_test[["diagnosis"]]

### Plot the decision surfaces of various classification models to get an idea of how they fit to data

#### Each variable pair will use Concave Points Mean (index:7) plotted against progressively ranked important features Area_Worst (23), Concave Points_Worst (27), Perimeter_Worst (22) and Radius_Worst (20)

In [None]:
#http://scikit-learn.org/stable/auto_examples/svm/plot_iris.html#sphx-glr-auto-examples-svm-plot-iris-py
from sklearn.svm import SVC,LinearSVC
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.base import clone

# Number of trees / boosting rounds shared by every ensemble model below
n_estimators = 150

# Models to plot
models = [LogisticRegression(C=1),
          ExtraTreesClassifier(n_estimators=n_estimators),
          SVC(kernel='linear', C=1),
          SVC(kernel='poly', degree=2, C=1000, probability=True),
          GradientBoostingClassifier(n_estimators=n_estimators),
          AdaBoostClassifier(n_estimators=n_estimators)]

#Optional: RandomForestClassifier(n_estimators=n_estimators),

# title for the plots (same order as `models`)
titles = ['Logit Reg','Extra Trees',
          'SVC (linear kernel)','SVC (poly kernel)',
         'GradBoost','Adaboost']

h = .01 #h = step size in the mesh, want the decision surface to be very fine

# Each pair holds two positional column indices into X_train.
for pair in ([7, 23], [7, 27], [7,22]): #[7,20]
    # Standardise the two selected features (zero mean, unit standard deviation).
    # This and the mesh depend only on the pair, so they are built once per pair
    # rather than once per model. `.iloc` replaces the removed `.ix` indexer.
    X = X_train.iloc[:, pair]
    X = ((X - X.mean())/X.std())
    y = Y_train

    # create a mesh to plot in
    x_min, x_max = X.iloc[:, 0].min() - 0.05, X.iloc[:, 0].max() + 0.05
    y_min, y_max = X.iloc[:, 1].min() - 0.05, X.iloc[:, 1].max() + 0.05
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    mesh_points = np.c_[xx.ravel(), yy.ravel()]

    for i, clf in enumerate(models):
        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        plt.subplot(2, 3, i+1)
        plt.subplots_adjust(wspace=0.4, hspace=0.4)

        # Train on plain arrays so that fitting and the mesh prediction see the
        # same (name-less) input type.
        clf.fit(X.values, y)
        Z = clf.predict(mesh_points)

        #Get scores (accuracy on the data the model was just fitted to)
        score = clf.score(X.values, y)

        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)

        # Also plot the training points
        plt.scatter(X.iloc[:, 0], X.iloc[:, 1], c=y, cmap=plt.cm.coolwarm)
        plt.xlabel(X.columns.values[0])
        plt.ylabel(X.columns.values[1])
        plt.xlim(xx.min(), xx.max())
        plt.ylim(yy.min(), yy.max())
        plt.xticks(())
        plt.yticks(())
        plt.title(titles[i] + " Score: {:0.5f}".format(score))

    print("{} vs {}".format(X.columns.values[0],X.columns.values[1]))
    plt.show()
    print("_____________________________________________________________________")

## Parameter estimation using grid search with cross-validation

In [None]:
#http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_digits.html
from __future__ import print_function
from sklearn.model_selection import GridSearchCV,cross_val_score,StratifiedKFold
from sklearn.metrics import classification_report,roc_curve, auc
from sklearn.svm import SVC,LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,GradientBoostingClassifier,AdaBoostClassifier
print(__doc__)

# Metric(s) handed to GridSearchCV's `scoring` argument, one grid search per entry.
# Other candidates: 'precision_macro', 'recall_macro', 'f1_macro', 'roc_auc'
scores = ["accuracy"]

def run_model(model_type,model_short_name,tuning_parameters,
              X_fit=None,y_fit=None,X_eval=None,y_eval=None):
    """Grid-search an estimator with 10-fold cross-validation and print a report.

    For every metric in the notebook-level ``scores`` list this fits a
    ``GridSearchCV`` on the training data, prints the best parameter set and
    the mean (+/- two standard deviations) CV score of every candidate, then
    prints a classification report for the evaluation data.

    Parameters
    ----------
    model_type : estimator instance handed to ``GridSearchCV``.
    model_short_name : str
        Label used in the printed header.
    tuning_parameters : dict or list of dict
        Parameter grid handed to ``GridSearchCV``.
    X_fit, y_fit : optional
        Training features / labels. Default to the notebook-level
        ``X_train`` / ``Y_train``.
    X_eval, y_eval : optional
        Evaluation features / labels. Default to the notebook-level
        ``X_test`` / ``Y_test``.

    Returns
    -------
    None
        Results are only printed.
    """
    # Fall back to the notebook globals so existing three-argument calls keep
    # working, while other feature sets can be passed in without redefining
    # this function.
    X_fit = X_train if X_fit is None else X_fit
    y_fit = Y_train if y_fit is None else y_fit
    X_eval = X_test if X_eval is None else X_eval
    y_eval = Y_test if y_eval is None else y_eval

    for score in scores:
        print("# Tuning hyper-parameters for %s on scoring method: %s" % (model_short_name,score))
        print()

        clf = GridSearchCV(model_type, tuning_parameters, cv=10,
                           scoring=score)
        clf.fit(X_fit, y_fit)

        print("Best parameters set found on development set:")
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.5f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        y_true, y_pred = y_eval, clf.predict(X_eval)
        print(classification_report(y_true, y_pred))
        print()

    return

# Run every model through the grid search; the metric being optimised is whatever
# the `scores` list contains.

# Grid shared by the tree-ensemble and boosting models
n_estimators_grid = [{'n_estimators':[10,50,100,150,200,250]}]
# Regularisation strengths tried for the linear models
c_values = [1, 10, 100, 1000]

model_runs = [
    #1: Logistic Regression
    (LogisticRegression(), 'Logit', [{'C': c_values}]),
    #2: Random Forest
    (RandomForestClassifier(n_jobs=-1), 'RF', n_estimators_grid),
    #3: Extra Trees Classifier
    (ExtraTreesClassifier(n_jobs=-1), 'ET', n_estimators_grid),
    #4: SVM Classifier
    (SVC(C=1), 'SVM', [{'kernel': ['linear'], 'C': c_values}]),
    #5: Gradient Boosting Classifier
    # Optional parameters: learning_rate=1.0,max_depth=1,random_state=0
    (GradientBoostingClassifier(), 'GradientBoost', n_estimators_grid),
    #6: Adaboosting Classifier
    # Optional parameters: learning_rate=1.0,random_state=0
    (AdaBoostClassifier(), 'AdaBoost', n_estimators_grid),
]

for estimator, short_name, param_grid in model_runs:
    run_model(estimator, short_name, param_grid)

## Rerun GridsearchCV using log transformed data

As observed above in the exploratory data analysis, the log-transformed data provided far greater clarity into the data and appeared to offer more explanatory power. The data are therefore transformed here to see whether this improves accuracy at all.

In [None]:
#Define the log-transformed Training / Test sample spaces
#Keep the ID and DIAGNOSIS columns aside in their own frames
df_id_diag_train = df_train.loc[:, ["id", "diagnosis"]]
df_id_diag_test = df_test.loc[:, ["id", "diagnosis"]]

#log10 of 0 is -inf, which affects the 'concavity' / 'concave points' columns, so those
#are left out before transforming
zero_valued_columns = ['concavity_mean','concave points_mean','concavity_se',
                       'concave points_se','concavity_worst','concave points_worst']

#Log transform the predictor columns only (everything after id and diagnosis).
#`.iloc` replaces the removed `.ix` indexer, and `drop(columns=...)` replaces the
#positional axis argument that current pandas no longer accepts.
X_train_log = df_train.iloc[:, 2:].drop(columns=zero_valued_columns).apply(np.log10)
X_test_log = df_test.iloc[:, 2:].drop(columns=zero_valued_columns).apply(np.log10)

#ID and DIAGNOSIS are deliberately NOT joined back on: X_train_log / X_test_log are
#passed to the models as the feature matrix, and including the label among the
#features lets a model read the answer directly (target leakage).

#Compare before and after log transforms
#df_train.describe()
X_train_log.describe()

In [None]:
from __future__ import print_function
from sklearn.model_selection import GridSearchCV,cross_val_score,StratifiedKFold
from sklearn.metrics import classification_report,roc_curve, auc
from sklearn.svm import SVC,LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,GradientBoostingClassifier,AdaBoostClassifier
print(__doc__)

# Metric(s) used when GRIDSEARCH is re-run on the log transformed data
scores = ["accuracy"]

def run_model(model_type,model_short_name,tuning_parameters,
              X_fit=None,y_fit=None,X_eval=None,y_eval=None,
              non_feature_columns=("id","diagnosis")):
    """Grid-search an estimator on the log-transformed data and print a report.

    For every metric in the notebook-level ``scores`` list this fits a
    ``GridSearchCV`` (10-fold CV) on the training data, prints the best
    parameter set and the mean (+/- two standard deviations) CV score of every
    candidate, then prints a classification report for the evaluation data.

    Parameters
    ----------
    model_type : estimator instance handed to ``GridSearchCV``.
    model_short_name : str
        Label used in the printed header.
    tuning_parameters : dict or list of dict
        Parameter grid handed to ``GridSearchCV``.
    X_fit, y_fit : optional
        Training features / labels. Default to the notebook-level
        ``X_train_log`` / ``Y_train``.
    X_eval, y_eval : optional
        Evaluation features / labels. Default to the notebook-level
        ``X_test_log`` / ``Y_test``.
    non_feature_columns : iterable of str
        Column names removed from DataFrame inputs before fitting and
        predicting, so that an identifier or the label itself can never be
        used as a predictor.

    Returns
    -------
    None
        Results are only printed.
    """
    def _features_only(frame):
        # Strip identifier/label columns when present; non-DataFrame inputs pass through.
        if isinstance(frame, pd.DataFrame):
            return frame.drop(columns=list(non_feature_columns), errors="ignore")
        return frame

    # Fall back to the notebook globals so existing three-argument calls keep working.
    X_fit = _features_only(X_train_log if X_fit is None else X_fit)
    X_eval = _features_only(X_test_log if X_eval is None else X_eval)
    y_fit = Y_train if y_fit is None else y_fit
    y_eval = Y_test if y_eval is None else y_eval

    for score in scores:
        print("# Tuning hyper-parameters for %s on scoring method: %s" % (model_short_name,score))
        print()

        clf = GridSearchCV(model_type, tuning_parameters, cv=10,
                           scoring=score)
        clf.fit(X_fit, y_fit)

        print("Best parameters set found on development set:")
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.5f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        y_true, y_pred = y_eval, clf.predict(X_eval)
        print(classification_report(y_true, y_pred))
        print()

    return

# Run every model through the grid search on the log transformed data; the metric
# being optimised is whatever the `scores` list contains.

# Grid shared by the tree-ensemble and boosting models
n_estimators_grid = [{'n_estimators':[10,50,100,150,200,250]}]

model_runs = [
    #1: Logistic Regression
    (LogisticRegression(), 'Logit', [{'C': [1, 10, 100, 1000]}]),
    #2: Random Forest
    (RandomForestClassifier(n_jobs=-1), 'RF', n_estimators_grid),
    #3: Extra Trees Classifier
    (ExtraTreesClassifier(n_jobs=-1), 'ET', n_estimators_grid),
    #4: SVM Classifier is skipped here (taking too long); its configuration was
    #   SVC(C=1), 'SVM', [{'kernel': ['linear'],'C': [1, 10, 100, 1000]}]
    #5: Gradient Boosting Classifier
    # Optional parameters: learning_rate=1.0,max_depth=1,random_state=0
    (GradientBoostingClassifier(), 'GradientBoost', n_estimators_grid),
    #6: Adaboosting Classifier
    # Optional parameters: learning_rate=1.0,random_state=0
    (AdaBoostClassifier(), 'AdaBoost', n_estimators_grid),
]

for estimator, short_name, param_grid in model_runs:
    run_model(estimator, short_name, param_grid)

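The Extra Trees, Gradient Boosting and AdaBoost classifiers all reach 100% accuracy. A perfect score warrants a sanity check before it is trusted: confirm that the feature matrix passed to the models does not contain the `diagnosis` label (or the `id` column), because a target leaked into the features produces exactly this result.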