In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns  
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the breast-cancer CSV into a DataFrame and preview its first rows.
DATA_PATH = '../input/breast-cancer-wisconsin-data/data.csv'
data = pd.read_csv(DATA_PATH)
data.head()

In [None]:
# Show every column label, then display how many columns the frame has.
col = data.columns
print(col)
len(col)

In [None]:
# Target vector: the diagnosis label of every sample (M or B).
y = data.diagnosis
# Columns excluded from the feature matrix. The list is named `drop_cols`
# rather than `list` so that the built-in `list` type is not shadowed for
# every cell that runs afterwards.
drop_cols = ['Unnamed: 32', 'id', 'diagnosis']
# Feature matrix: everything in `data` except the columns listed above.
x = data.drop(columns=drop_cols)
x.head()

In [None]:
# Preview the first five diagnosis labels.
y.head(n=5)

In [None]:
# Bar chart of the class balance. The series is passed by keyword because
# recent seaborn releases treat the first positional argument as `data`.
ax = sns.countplot(x=y, label="Count")       # M = 212, B = 357
# Look each count up by its label instead of unpacking value_counts():
# the unpacking order depends on which class happens to be more frequent,
# so the two numbers would silently swap if the balance ever changed.
class_counts = y.value_counts()
B = class_counts.get('B', 0)
M = class_counts.get('M', 0)
print('Number of Benign: ',B)
print('Number of Malignant : ',M)

In [None]:
# Violin plots of the first ten features, split by diagnosis.
# The intermediate frames get their own names so the raw `data` frame is not
# overwritten: once `data` is rebound to the melted frame, re-running a cell
# that drops columns from the raw frame fails with a KeyError.
data_dia = y
# Standardise every feature (subtract its mean, divide by its standard
# deviation) so that all violins share one y-axis scale.
data_n_2 = (x - x.mean()) / x.std()
violin_wide = pd.concat([y, data_n_2.iloc[:, 0:10]], axis=1)
# Long format: one row per (sample, feature) pair, keyed by diagnosis.
violin_long = pd.melt(violin_wide,
                      id_vars="diagnosis",
                      var_name="features",
                      value_name='value')
plt.figure(figsize=(10,10))
sns.violinplot(x="features", y="value", hue="diagnosis", data=violin_long, split=True, inner="quart")
plt.xticks(rotation=90)

In [None]:
# Features removed from the modelling set; `x_1` keeps every other column of `x`.
drop_list1 = [
    'perimeter_mean',
    'radius_mean',
    'compactness_mean',
    'concave points_mean',
    'radius_se',
    'perimeter_se',
    'radius_worst',
    'perimeter_worst',
    'compactness_worst',
    'concave points_worst',
    'compactness_se',
    'concave points_se',
    'texture_worst',
    'area_worst',
]
x_1 = x.drop(columns=drop_list1)
x_1.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score,confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_1,
    y,
    test_size=0.3,
    random_state=42,
)

## Logistic Regression

In [None]:
# Baseline: logistic regression (C=10) trained on every retained feature.
logreg = LogisticRegression(C=10).fit(x_train, y_train)

# Predict once and reuse the labels for both the accuracy and the confusion matrix.
logreg_pred = logreg.predict(x_test)
ac = accuracy_score(y_test, logreg_pred)
print('Accuracy is: ',ac)
cm = confusion_matrix(y_test, logreg_pred)
sns.heatmap(cm,annot=True,fmt="d")

## RFE on Logistic Regression

In [None]:
from sklearn.feature_selection import RFE
# Recursive feature elimination: remove one feature per round (step=1)
# until five remain, ranking features with a logistic regression.
logreg_2 = LogisticRegression()
rfe = RFE(estimator=logreg_2, n_features_to_select=5, step=1).fit(x_train, y_train)
print('Chosen best 5 feature by rfe:',x_train.columns[rfe.support_])

# Refit a fresh model on the five selected columns and score it on the hold-out set.
x_train_3 = rfe.transform(x_train)
x_test_3 = rfe.transform(x_test)
logreg_2 = LogisticRegression().fit(x_train_3, y_train)
rfe_pred = logreg_2.predict(x_test_3)
ac_3 = accuracy_score(y_test, rfe_pred)
print('Accuracy is: ',ac_3)

## RFECV on Logistic Regression

In [None]:
from sklearn.feature_selection import RFECV

# Let 5-fold cross-validated accuracy decide how many features to keep,
# eliminating one feature per round.
logreg_3 = LogisticRegression()
rfecv = RFECV(estimator=logreg_3, step=1, cv=5, scoring='accuracy').fit(x_train, y_train)

print('Optimal number of features :', rfecv.n_features_)
print('Best features :', x_train.columns[rfecv.support_])

In [None]:
import matplotlib.pyplot as plt
# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2; the
# mean score per feature count now lives in `cv_results_`. Fall back to the
# old attribute so the cell still runs on older releases.
if hasattr(rfecv, "cv_results_"):
    cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    cv_scores = rfecv.grid_scores_
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score of number of selected features")
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.show()
# Only the last expression of a cell is displayed, so print the best score explicitly.
print('Best cross validation score:', max(cv_scores))
cv_scores

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Score every feature with the chi-squared test and keep the five best.
select_feature = SelectKBest(score_func=chi2, k=5)
select_feature.fit(x_train, y_train)
print('Score list:', select_feature.scores_)
print('Feature list:', x_train.columns)

In [None]:
# Reduce both splits to the five columns chosen by the chi-squared selector.
x_train_2 = select_feature.transform(x_train)
x_test_2 = select_feature.transform(x_test)
# Logistic regression with default settings on the reduced feature set.
logreg_4 = LogisticRegression().fit(x_train_2, y_train)
kbest_pred = logreg_4.predict(x_test_2)
ac_2 = accuracy_score(y_test, kbest_pred)
print('Accuracy is: ',ac_2)

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline decision tree on every retained feature; the seed keeps the tree repeatable.
dt = DecisionTreeClassifier(random_state=43).fit(x_train, y_train)

# Predict once and reuse the labels for both the accuracy and the confusion matrix.
dt_pred = dt.predict(x_test)
ac = accuracy_score(y_test, dt_pred)
print('Accuracy is: ',ac)
cm = confusion_matrix(y_test, dt_pred)
sns.heatmap(cm,annot=True,fmt="d")

## RFE on Decision Tree

In [None]:
from sklearn.feature_selection import RFE
# Recursive feature elimination with a decision tree: one feature is removed
# per round (step=1) until five remain.
# Both trees are seeded (same seed as the baseline tree, random_state=43):
# an unseeded DecisionTreeClassifier permutes the features at every split,
# so the selected features and the accuracy could change from run to run.
dt_2 = DecisionTreeClassifier(random_state=43)
rfe = RFE(estimator=dt_2, n_features_to_select=5, step=1)
rfe = rfe.fit(x_train, y_train)
print('Chosen best 5 feature by rfe:',x_train.columns[rfe.support_])

# Refit a fresh tree on the five selected columns and score it on the hold-out set.
x_train_3 = rfe.transform(x_train)
x_test_3 = rfe.transform(x_test)
dt_2 = DecisionTreeClassifier(random_state=43)
dt_2 = dt_2.fit(x_train_3,y_train)
ac_3 = accuracy_score(y_test,dt_2.predict(x_test_3))
print('Accuracy is: ',ac_3)

## RFECV on Decision Tree

In [None]:
from sklearn.feature_selection import RFECV

# Let 5-fold cross-validated accuracy decide how many features to keep,
# eliminating one feature per round.
# The tree is seeded (random_state=43) because an unseeded
# DecisionTreeClassifier permutes the features at every split, which would
# make the reported optimum differ between runs.
dt_3 = DecisionTreeClassifier(random_state=43)
rfecv = RFECV(estimator=dt_3, step=1, cv=5,scoring='accuracy')   #5-fold cross-validation
rfecv = rfecv.fit(x_train, y_train)

print('Optimal number of features :', rfecv.n_features_)
print('Best features :', x_train.columns[rfecv.support_])

In [None]:
import matplotlib.pyplot as plt
# `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2; the
# mean score per feature count now lives in `cv_results_`. Fall back to the
# old attribute so the cell still runs on older releases.
if hasattr(rfecv, "cv_results_"):
    cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    cv_scores = rfecv.grid_scores_
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score of number of selected features")
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.show()
# Only the last expression of a cell is displayed, so print the best score explicitly.
print('Best cross validation score:', max(cv_scores))
cv_scores

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Rank the features by chi-squared score and retain the top five.
kbest_selector = SelectKBest(score_func=chi2, k=5)
select_feature = kbest_selector.fit(x_train, y_train)
print('Score list:', select_feature.scores_)
print('Feature list:', x_train.columns)

In [None]:
# Reduce both splits to the five columns chosen by the chi-squared selector.
x_train_2 = select_feature.transform(x_train)
x_test_2 = select_feature.transform(x_test)
# Decision tree on the reduced feature set. It is seeded (same seed as the
# baseline tree) so the reported accuracy is repeatable; an unseeded tree
# permutes the features at every split.
dt_4 = DecisionTreeClassifier(random_state=43)
dt_4 = dt_4.fit(x_train_2,y_train)
ac_2 = accuracy_score(y_test,dt_4.predict(x_test_2))
print('Accuracy is: ',ac_2)

## SVM

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

# RBF-kernel SVC wrapped in a bagging ensemble, itself wrapped in one-vs-rest.
# BaggingClassifier draws random bootstrap samples, so it is seeded too:
# with only the inner SVC seeded, the ensemble (and therefore the accuracy
# and confusion matrix) could differ on every run.
svm = OneVsRestClassifier(
    BaggingClassifier(
        SVC(C=10, kernel='rbf', random_state=9, probability=True),
        n_jobs=-1,
        random_state=9,
    )
)
svm = svm.fit(x_train,y_train)

# Predict once and reuse the labels for both the accuracy and the confusion matrix.
svm_pred = svm.predict(x_test)
ac = accuracy_score(y_test, svm_pred)
print('Accuracy is: ',ac)
cm = confusion_matrix(y_test, svm_pred)
sns.heatmap(cm,annot=True,fmt="d")

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier


# k-nearest-neighbours classifier with k=5, trained on every retained feature.
knncla = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
knncla.fit(x_train, y_train)

# Predict once and reuse the labels for both the accuracy and the confusion matrix.
knn_pred = knncla.predict(x_test)
ac = accuracy_score(y_test, knn_pred)
print('Accuracy is: ',ac)
cm = confusion_matrix(y_test, knn_pred)
sns.heatmap(cm,annot=True,fmt="d")

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Chi-squared univariate selection: fit the selector and keep the five top-scoring features.
select_feature = SelectKBest(score_func=chi2, k=5)
select_feature = select_feature.fit(x_train, y_train)
print('Score list:', select_feature.scores_)
print('Feature list:', x_train.columns)

In [None]:
# Reduce both splits to the five columns chosen by the chi-squared selector.
x_train_2 = select_feature.transform(x_train)
x_test_2 = select_feature.transform(x_test)
# k-nearest-neighbours classifier with default settings on the reduced feature set.
knncla_2 = KNeighborsClassifier().fit(x_train_2, y_train)
knn_kbest_pred = knncla_2.predict(x_test_2)
ac_2 = accuracy_score(y_test, knn_kbest_pred)
print('Accuracy is: ',ac_2)