In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


#models
from sklearn.ensemble import RandomForestClassifier


#misc
from sklearn.model_selection import cross_val_score,train_test_split
from scipy.stats import f_oneway

In [None]:
data=pd.read_csv('../input/breast-cancer-wisconsin-data/data.csv')

In [None]:
data.head()

In [None]:
data.describe(include='all')

In [None]:
# Columns that are not features: the id, 'Unnamed: 32' and the target itself.
non_feature_cols = ['id', 'Unnamed: 32', 'diagnosis']

# Target labels (y) and feature matrix (X) used in the cells below.
y = data['diagnosis'].copy()
X = data.drop(columns=non_feature_cols).copy()

# A second, independent copy of the same target and features (X1 / y1),
# which is later handed to RFE_correlation.
y1 = data['diagnosis'].copy()
X1 = data.drop(columns=non_feature_cols).copy()





In [None]:
# Check the distribution of the target classes.
# The series is passed as `x=` explicitly: newer seaborn releases interpret the
# first positional argument as `data`, which does not give a per-class count plot.
sns.countplot(x=y)

In [None]:
cols=X.columns

In [None]:
X.describe()

In [None]:
#standardize the features

In [None]:
std_X=(X-X.mean())/X.std()

In [None]:
std_X.describe()

In [None]:
#Lets fit a model on all the features and see the performance

In [None]:
rf=RandomForestClassifier()
trainx,testx,trainy,testy=train_test_split(std_X,y,test_size=0.25)
rf.fit(trainx,trainy)

scores=cross_val_score(rf,trainx,trainy,cv=5)

In [None]:
rf.feature_importances_

In [None]:
sorted({a:b for a,b in list(zip(trainx.columns,rf.feature_importances_))}.items(),key=lambda item:item[1],reverse=True)

In [None]:
scores.mean()

In [None]:
#lets do feature selection and fit the model again

In [None]:
std_X_Y=pd.concat([std_X,y],axis=1)

In [None]:
std_X_Y

In [None]:
melted_std_X_Y=pd.melt(std_X_Y,id_vars='diagnosis',var_name='features',value_name='value')
melted_std_X_Y

In [None]:
# Violin plots of the standardized features, ten features per panel, split by diagnosis.
plot1, ax = plt.subplots(nrows=3, ncols=1, figsize=(20, 15))
for panel, start in zip(ax, (0, 10, 20)):
    feature_group = cols[start:start + 10]
    panel_rows = melted_std_X_Y[melted_std_X_Y['features'].isin(feature_group)]
    sns.violinplot(data=panel_rows, x='features', y='value', hue="diagnosis", split=True, inner='quart', ax=panel)
    # Rotate the tick labels on this panel itself. plt.xticks() only acts on the
    # current axes (the last panel), and the rotation must be a number, not "90".
    panel.tick_params(axis='x', labelrotation=90)
# Keep the rotated labels of one panel from running into the panel below it.
plot1.tight_layout()
# From the violin plots rendered below we can see that:
# 1) radius_mean, area_mean, perimeter_mean, concavity_mean, compactness_mean and concave points_mean
#    have distinct distributions for the B and M diagnoses: the medians of the two diagnoses do not overlap.
# 2) symmetry_mean, smoothness_mean and dimension_mean are not clearly separated between B and M:
#    the medians of the two distributions overlap.

# Features with distinct distributions have a good chance of being good predictors of the target variable.
# Features with overlapping distributions may not be good predictors of the target variable.

In [None]:
# Let's look at variables that are similar to one another.
# concavity_worst and concave points_worst look similar,
# so let's draw a regression joint plot of the two.

# The regression variant of jointplot is spelled kind='reg'; 'regg' is not a
# valid kind. x and y are passed by keyword because newer seaborn releases do
# not accept them positionally.
sns.jointplot(x=X.loc[:, 'concavity_worst'], y=X.loc[:, 'concave points_worst'], kind='reg')

In [None]:
#They look very closely positively correlated, hence we can remove one of them.

In [None]:
#if more than 2 variables are correlated,then we can use pair grid plot

In [None]:
#Let's take the example of radius_worst, perimeter_worst and area_worst

In [None]:
# Pairwise scatter plots of the three size-related "worst" features.
g = sns.pairplot(X.loc[:, ['radius_worst', 'area_worst', 'perimeter_worst']])
# Overlay KDE contours on the lower triangle. The colormap keyword is `cmap`;
# the previous `cmp` was a typo and never set the colormap.
g.map_lower(sns.kdeplot, cmap="Blues_d")

In [None]:
#They all look correlated with each other; we can remove the two that have the lower correlation with the target.

In [None]:
#Lets try swarm plot

In [None]:
# Swarm plots of the standardized features, ten features per panel, coloured by diagnosis.
plot1, ax = plt.subplots(nrows=3, ncols=1, figsize=(20, 15))
for panel, start in zip(ax, (0, 10, 20)):
    feature_group = cols[start:start + 10]
    panel_rows = melted_std_X_Y[melted_std_X_Y['features'].isin(feature_group)]
    sns.swarmplot(data=panel_rows, x='features', y='value', hue="diagnosis", ax=panel)
    # Rotate the tick labels on this panel itself. plt.xticks() only acts on the
    # current axes (the last panel), and the rotation must be a number, not "90".
    panel.tick_params(axis='x', labelrotation=90)
# Keep the rotated labels of one panel from running into the panel below it.
plot1.tight_layout()
# From the swarm plots rendered below we can see that:
# 1) radius_mean, area_mean, perimeter_mean, concavity_mean, compactness_mean and concave points_mean
#    have distinct distributions for the B and M diagnoses: the medians of the two diagnoses do not overlap.
# 2) symmetry_mean, smoothness_mean and dimension_mean are not clearly separated between B and M:
#    the medians of the two distributions overlap.

# Features with distinct distributions have a good chance of being good predictors of the target variable.
# Features with overlapping distributions may not be good predictors of the target variable.

In [None]:
plt.hist([data[data['diagnosis']=='M']['area_mean'],data[data['diagnosis']=='B']['area_mean']],bins=100,range=[143,2500],alpha=0.5)

In [None]:
data['area_mean']

In [None]:
#some of the features look really distinctive of target class,few overlap

In [None]:
#lets try a correlation heatmap of all features

In [None]:
# Correlation heatmap of all features, annotated with one-decimal coefficients.
_, heat_ax = plt.subplots(figsize=(18, 18))
feature_corr_matrix = X.corr()
sns.heatmap(feature_corr_matrix, annot=True, fmt='.1f', ax=heat_ax)

In [None]:
X_corr=np.triu(np.abs(np.around(X.corr(),decimals=1)),k=1)
correlated_x,correlated_y=np.where(X_corr>0.8)
correlated_features=set([ (list(X.corr().index)[a[0]],list(X.corr().columns)[a[1]]) for a in list(zip(correlated_x,correlated_y))])
#we have a list of columns that are highly correlated with each other

In [None]:
len(correlated_features)

In [None]:
correlated_features

In [None]:
#https://www.quora.com/How-do-you-explain-that-A-correlates-with-B-and-B-correlates-with-C-but-A-does-not-correlate-has-zero-correlation-with-C

In [None]:
#lets recursively retain the variable which has high correlation with target variable out of each pair and find correlation pairs again

In [None]:
#find correlated pairs
#out of each pair retain the variable that has high correlation  with target
#find correlation of remaining variables
#repeat the same process again till there are no high correlated pairs

In [None]:

def fn_calc_anova_pairs(pairs, frame=None, target='diagnosis', alpha=0.05):
    """Decide which member of a correlated feature pair should be dropped.

    A one-way ANOVA (``scipy.stats.f_oneway``) is run separately for each of
    the two features, comparing the feature's values across the classes of
    ``target``. The feature with the smaller p-value is kept and the name of
    the other one is returned so the caller can drop it.

    Parameters
    ----------
    pairs : sequence of two column names
        The two features to compare; ``pairs[0]`` wins ties.
    frame : pandas.DataFrame, optional
        Frame holding both features and the ``target`` column. Defaults to the
        notebook-level ``data`` frame.
    target : str, default 'diagnosis'
        Column whose distinct values define the ANOVA groups.
    alpha : float, default 0.05
        Significance level a p-value must not exceed.

    Returns
    -------
    str or bool
        The name of the feature to drop, or ``False`` when neither feature is
        significant at ``alpha`` (the caller then drops both).
    """
    if frame is None:
        frame = data
    first, second = pairs[0], pairs[1]

    # Split the rows once per call; every class of the target forms one group.
    class_groups = [group for _, group in frame.groupby(target)]
    _, p_first = f_oneway(*(group[first].values for group in class_groups))
    _, p_second = f_oneway(*(group[second].values for group in class_groups))

    # A NaN p-value carries no evidence of separation, so treat it as
    # "not significant" instead of letting NaN comparisons pick a side.
    p_first = 1.0 if np.isnan(p_first) else p_first
    p_second = 1.0 if np.isnan(p_second) else p_second

    # The p-values are compared unrounded: rounding to two decimals would turn
    # most small p-values into 0.0 and make the choice depend on pair order.
    if min(p_first, p_second) > alpha:
        # Neither feature separates the classes. Always return False here so
        # no branch falls through and returns None.
        return False
    return second if p_first <= p_second else first

In [None]:
def RFE_correlation(X, Y, threshold=0.8, pick_drop=None):
    """Repeatedly remove features until no highly correlated pair remains.

    On each pass the absolute feature-feature correlations are rounded to one
    decimal, and the first pair (in row-major order of the upper triangle)
    whose rounded value exceeds ``threshold`` is handed to ``pick_drop``. The
    feature it names is dropped, or both features when it returns ``False``
    or ``None``. The correlations are then recomputed on what is left.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. It is copied; the caller's frame is not modified.
    Y : pandas.Series
        Target. Accepted for interface compatibility; not used here.
    threshold : float, default 0.8
        Rounded absolute correlation above which a pair counts as correlated.
    pick_drop : callable, optional
        Called with a ``(feature_a, feature_b)`` tuple; returns the name of
        the feature to drop, or ``False`` to drop both. Defaults to
        ``fn_calc_anova_pairs``, which reads the notebook-level ``data`` frame
        rather than ``X``.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the retained features.
    """
    if pick_drop is None:
        pick_drop = fn_calc_anova_pairs

    remaining = X.copy()
    while True:
        corr = remaining.corr()  # computed once per pass
        strength = np.triu(np.abs(np.around(corr.to_numpy(), decimals=1)), k=1)
        row_idx, col_idx = np.where(strength > threshold)
        labels = list(corr.columns)
        # A list in np.where order keeps the elimination order deterministic,
        # unlike iterating a set of string tuples.
        correlated_features = [(labels[r], labels[c]) for r, c in zip(row_idx, col_idx)]
        print('correlated features are', correlated_features)
        print('len of correlated features', len(correlated_features))
        print('--------------------------------------------------------------')

        if not correlated_features:
            print('no correlated features', correlated_features)
            return remaining

        # Only one pair is resolved per pass: dropping a column can remove
        # several other pairs, so the pair list has to be rebuilt afterwards.
        pair = correlated_features[0]
        print('current pair is ', pair)
        result = pick_drop(pair)
        if result is False or result is None:
            print('dropping ', list(pair))
            remaining = remaining.drop(columns=list(pair))
        else:
            print('dropping ', result)
            remaining = remaining.drop(columns=[result])
        
                
            
            
        
        
        
    
    

In [None]:
finaldata=RFE_correlation(X1,y1)
#finaldata has a list of uncorrelated features

In [None]:
#lets fit the final data to a model and check the performance
rf=RandomForestClassifier()
trainx,testx,trainy,testy=train_test_split(finaldata,y,test_size=0.25)

scores=cross_val_score(rf,trainx,trainy,cv=5)

In [None]:
scores.mean()

In [None]:
#we were able to reduce the feature size without compromising a lot of accuracy

In [None]:
finaldata.columns

# selectKbest and chi2

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [None]:
trainx,testx,trainy,testy=train_test_split(X,y,test_size=0.25)
kb=SelectKBest(chi2,k=5).fit(trainx,trainy)

In [None]:
kb.scores_

In [None]:
trainx.columns

In [None]:
sorted({a:b for a,b in list(zip(trainx.columns,kb.scores_))}.items(),key=lambda item:item[1],reverse=True)

# Recursive Feature elimination

In [None]:
from sklearn.feature_selection import RFE

In [None]:
#lets store the diagnosis as y
y=data['diagnosis'].copy()
X=data.drop(['id','Unnamed: 32','diagnosis'],axis=1).copy()

In [None]:
rf3=RandomForestClassifier(n_estimators=10)
rfe=RFE(rf3,10,1)
rfe.fit(X,y)

In [None]:
rfe.support_

In [None]:
sorted({b:a for a,b in list(zip(X.columns,rfe.ranking_))}.items(),key=lambda a:a[0])

In [None]:
rfe.ranking_

# RFECV

In [None]:
from sklearn.feature_selection import RFECV

In [None]:
rf4=RandomForestClassifier()
rfecv=RFECV(estimator=rf4,step=1,cv=5,scoring='accuracy')
rfecv.fit(X,y)

In [None]:
rfecv.n_features_

In [None]:
X.columns[rfecv.support_]

In [None]:
rfecv.grid_scores_

In [None]:
plt.figure(figsize=(15,8))
plt.plot(range(1,len(rfecv.grid_scores_)+1),rfecv.grid_scores_)
#plt.annotate(list(X.columns[rfecv.support_]),(range(1,len(rfecv.grid_scores_)+1),rfecv.grid_scores_))
plt.grid()

# Tree Based Feature selection

In [None]:
rf5=RandomForestClassifier()
#lets store the diagnosis as y
y=data['diagnosis'].copy()
X=data.drop(['id','Unnamed: 32','diagnosis'],axis=1).copy()
rf5.fit(X,y)



In [None]:
importances=rf5.feature_importances_

In [None]:
std=np.std([tree.feature_importances_ for tree in rf5.estimators_],axis=0)

In [None]:
importances

In [None]:
X.columns[np.argsort(importances)[::-1]]

In [None]:
list(zip(X.columns[np.argsort(importances)[::-1]],importances[np.argsort(importances)[::-1]]))

In [None]:
plt.figure(figsize=(15,8))
plt.bar(X.columns[np.argsort(importances)[::-1]],importances[np.argsort(importances)[::-1]],yerr=std[np.argsort(importances)[::-1]])
plt.xticks(rotation=90)