In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the dataset CSV from the Kaggle input directory into a DataFrame.
data_path = '../input/breast-cancer-wisconsin-data/data.csv'
df = pd.read_csv(data_path)

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# List every column name in the raw frame.
df.columns

In [None]:
# Encode the target as integers: 'M' -> 1, 'B' -> 0.
# The mapping also accepts already-encoded 0/1 values, so re-running this cell
# is a no-op. The previous `np.where(df.diagnosis == 'M', 1, 0)` silently turned
# every label into 0 on a second run.
# Any unexpected label maps to NaN, which makes `astype(int)` raise instead of
# being silently counted as class 0.
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0, 1: 1, 0: 0}).astype(int)

In [None]:
#df.head()

In [None]:
# Class balance of the encoded target.
df['diagnosis'].value_counts()

In [None]:
# Correlation of every column with the target, strongest positive first.
target_corr = df.corr()['diagnosis']
target_corr.sort_values(ascending=False)

In [None]:
# Drop the 'id' and 'Unnamed: 32' columns before modelling.
# `columns=` already selects the axis, so the redundant `axis=1` is removed.
# `errors='ignore'` keeps the cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone.
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

In [None]:
# Summary statistics for the columns that remain in df.
df.describe()

In [None]:
# Pairwise correlation matrix of the columns in df (target included).
df.corr()

# **DATA VISUALIZATION**

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly.express as px

In [None]:
# Annotated correlation heatmap over every column of df (target included).
plt.figure(figsize=(20, 20))
corr_all = df.corr()
sns.heatmap(corr_all, annot=True, cmap='YlGnBu', center=0.4)

In [None]:
# Pair plot of the "*_mean" measurements, coloured by the encoded diagnosis.
mean_features = [
    'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
    'smoothness_mean', 'compactness_mean', 'concavity_mean',
    'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
]
columns = ['diagnosis', *mean_features]

sns.pairplot(data=df[columns], hue="diagnosis", palette='mako')

In [None]:
# Bar chart of how many rows fall into each diagnosis class.
plt.figure(figsize=(10, 8))
sns.countplot(data=df, x='diagnosis')

In [None]:
# Interactive scatter matrix of four "*_mean" measurements, coloured by diagnosis.
size_dimensions = ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean']
fig = px.scatter_matrix(
    data_frame=df,
    dimensions=size_dimensions,
    color="diagnosis",
)
fig.show()

In [None]:
# Smoothness vs. compactness (log-scaled x axis), coloured by diagnosis.
fig = px.scatter(
    data_frame=df,
    x='smoothness_mean',
    y='compactness_mean',
    color="diagnosis",
    log_x=True,
    size_max=60,
)
fig.show()

In [None]:
# Area chart of texture_worst against radius_worst, one trace per diagnosis.
fig = px.area(
    data_frame=df,
    x="radius_worst",
    y="texture_worst",
    color="diagnosis",
)
fig.show()

In [None]:
# Histogram over perimeter_se with area_se as the y value, split by diagnosis,
# with a rug marginal and all columns available on hover.
fig = px.histogram(
    data_frame=df,
    x="perimeter_se",
    y="area_se",
    color="diagnosis",
    marginal="rug",
    hover_data=df.columns,
)
fig.show()

In [None]:
# Density contours of perimeter_worst vs. concave points_worst per diagnosis,
# with a rug along x and a histogram along y.
fig = px.density_contour(
    data_frame=df,
    x="concave points_worst",
    y="perimeter_worst",
    color="diagnosis",
    marginal_x="rug",
    marginal_y="histogram",
)
fig.show()

# **FEATURE SELECTION**

In [None]:
# Columns available at the start of feature selection.
df.columns

In [None]:
# Separate the target vector from the feature matrix.
y = df['diagnosis']
X = df.drop(columns=['diagnosis'])

In [None]:
## Pearson correlation between the features only (target excluded)
cor = X.corr(method='pearson')
plt.figure(figsize=(20, 20))
sns.heatmap(cor, annot=True, cmap=plt.cm.CMRmap_r)

In [None]:
## with the following function we can select highly correlated features

def correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    For every pair of columns the *absolute* correlation is compared with
    ``threshold``. When a pair exceeds it, the later column (in
    ``dataset.columns`` order) is flagged, so the first column of each
    correlated group is always kept.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared pairwise via ``dataset.corr()``.
    threshold : float
        Absolute correlation above which the later column is flagged.

    Returns
    -------
    set
        Names of the flagged columns (empty when nothing exceeds the threshold).
    """
    # Use the absolute value: a coefficient of -0.95 marks a feature as just as
    # redundant as +0.95, but a signed comparison would never flag it.
    corr_matrix = dataset.corr().abs()
    col_corr = set()  # names of the columns flagged as redundant
    for i, col_name in enumerate(corr_matrix.columns):
        # Only look at columns that come before this one (lower triangle).
        # NaN entries compare as False and are therefore ignored.
        if (corr_matrix.iloc[i, :i] > threshold).any():
            col_corr.add(col_name)
    return col_corr
                

In [None]:
# Flag features whose correlation with an earlier feature exceeds 0.9,
# then show how many were flagged.
corr_features = correlation(X, threshold=0.9)
len(corr_features)

In [None]:
# Names of the features flagged by correlation(X, 0.9).
corr_features

In [None]:
# Remove the flagged, highly correlated features from the feature matrix.
X = X.drop(columns=list(corr_features))

In [None]:
# Preview the feature matrix after dropping the correlated columns.
X.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing.
# random_state makes the split (and every metric computed from it) reproducible
# across "Restart & Run All"; stratify=y keeps the class ratio of the target
# the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
from sklearn.feature_selection import mutual_info_classif

from sklearn.feature_selection import SelectKBest

In [None]:
# Mutual information between each training feature and the target.
# The estimator is randomised, so it is seeded to give the same scores on
# every run.
mutual_info = mutual_info_classif(X_train, y_train, random_state=42)
mutual_info

In [None]:
# Label each mutual-information score with its feature name and rank them.
mutual_info = pd.Series(mutual_info, index=X_train.columns)
mutual_info.sort_values(ascending=False)

In [None]:
# Bar chart of the mutual-information scores, highest first.
ranked_mutual_info = mutual_info.sort_values(ascending=False)
ranked_mutual_info.plot(kind='bar', figsize=(20, 13))

In [None]:
# Keep the 15 features with the highest mutual information with the target.
# mutual_info_classif is randomised; wrapping it so that it is always called
# with a fixed random_state makes the selected column set reproducible.
select_15_best = SelectKBest(
    lambda features, target: mutual_info_classif(features, target, random_state=42),
    k=15,
)
select_15_best.fit(X_train, y_train)
# Boolean support mask -> names of the selected columns.
cols = X_train.columns[select_15_best.get_support()]


In [None]:
# Restrict both splits to the selected feature columns.
X_train, X_test = X_train[cols], X_test[cols]

# **FEATURE SCALING**

In [None]:
from sklearn.preprocessing import StandardScaler 
# Standardise the features.
# The scaler is fitted on the training split only and the same fitted
# parameters are applied to the test split. Calling fit_transform on X_test
# would re-fit on the test data, so train and test would be scaled with
# different parameters and the evaluation would be distorted.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

 # **TRAINING AND PREDICTIONS** 

 #  1. USING SGD CLASSIFIER WITH STRATIFIED K-FOLD CROSS-VALIDATION

In [None]:
from sklearn.linear_model import SGDClassifier
# SGD shuffles the training data, so fix the seed to make the
# cross-validation scores reproducible between runs.
sgd_cal = SGDClassifier(random_state=42)

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the SGD classifier on the training split.
score = cross_val_score(estimator=sgd_cal, X=X_train, y=y_train, cv=10)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Per-fold scores returned by cross_val_score (cv=10).
score

In [None]:
# Mean of the per-fold cross-validation scores.
score.mean()

In [None]:
# Shapes of the training features and training target.
X_train.shape,y_train.shape

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# Manual 10-fold stratified cross-validation of the SGD classifier on the
# training split; collects one accuracy value per fold in `accuracy`.
accuracy = []
skfl = StratifiedKFold(n_splits=10)
for train_index, test_index in skfl.split(X_train, y_train):
    print('train:', train_index, 'validation:', test_index)
    # X_train is indexed positionally as an array here (it is the output of the
    # scaler), while y_train is still a pandas Series and needs .iloc.
    X1_train, X1_test = X_train[train_index], X_train[test_index]
    y1_train, y1_test = y_train.iloc[train_index], y_train.iloc[test_index]

    sgd_cal.fit(X1_train, y1_train)
    prediction = sgd_cal.predict(X1_test)
    # accuracy_score takes (y_true, y_pred). A dedicated name is used so the
    # `score` array produced by cross_val_score is not overwritten by a float.
    fold_accuracy = accuracy_score(y1_test, prediction)
    accuracy.append(fold_accuracy)

In [None]:
# Average accuracy over the folds of the manual cross-validation loop.
np.mean(accuracy)

# 2. USING LOGISTIC REGRESSION

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic-regression classifier with the library's default settings.
lr = LogisticRegression()

In [None]:
# Fit the logistic regression on the scaled training split.
lr.fit(X=X_train, y=y_train)

In [None]:
# Predict the diagnosis for the held-out test split.
prediction = lr.predict(X=X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [None]:
# Per-class precision / recall / F1 for the logistic regression.
lr_report = classification_report(y_true=y_test, y_pred=prediction)
print(lr_report)

In [None]:
# Confusion matrix of the logistic regression on the test split.
# fmt='d' prints the cell counts as plain integers; the default annotation
# format ('.2g') renders counts of 100 or more in scientific notation
# (e.g. 1.1e+02), which hides the real numbers.
sns.heatmap(confusion_matrix(y_test, prediction), annot=True, fmt='d')

In [None]:
# Overall test accuracy of the logistic regression.
lr_accuracy = accuracy_score(y_true=y_test, y_pred=prediction)
print(lr_accuracy)

# 3. USING SGD CLASSIFIER

In [None]:
from sklearn.linear_model import SGDClassifier

In [None]:
# Fresh SGD classifier for the hold-out evaluation. The seed is fixed because
# SGD shuffles the training data and would otherwise give different test
# metrics on every run.
sgd_cal = SGDClassifier(random_state=42)

In [None]:
# Fit the SGD classifier on the scaled training split.
sgd_cal.fit(X=X_train, y=y_train)

In [None]:
# Predict the diagnosis for the held-out test split with the SGD classifier.
predictions = sgd_cal.predict(X=X_test)

In [None]:
# Per-class precision / recall / F1 for the SGD classifier.
sgd_report = classification_report(y_true=y_test, y_pred=predictions)
print(sgd_report)

In [None]:
# Confusion matrix of the SGD classifier on the test split.
# fmt='d' prints the cell counts as plain integers; the default annotation
# format ('.2g') renders counts of 100 or more in scientific notation.
sns.heatmap(confusion_matrix(y_test, predictions), annot=True, cmap='YlGnBu', fmt='d')

In [None]:
# Overall test accuracy of the SGD classifier.
sgd_accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(sgd_accuracy)