# getting the data, initial processing :

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the breast-cancer CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# List the column names to decide which ones to keep.
df.columns

In [None]:
# Remove the 'id' column from the frame.
df = df.drop(columns='id')

In [None]:
# Remove the 'Unnamed: 32' column from the frame.
df = df.drop(columns='Unnamed: 32')

In [None]:
# Preview the frame again after the two column drops.
df.head()

*problem statement : "given the features of the tumor, predict the diagnosis, i.e. malignant (M) or benign (B)"*

# exploratory data analysis :

In [None]:
import matplotlib.pyplot as plt 

import seaborn as sns

In [None]:
# Kernel density estimate of 'texture_mean' on a 16x8 inch figure.
fig, ax = plt.subplots(figsize=(16, 8))
sns.kdeplot(df['texture_mean'], ax=ax)

In [None]:
# Kernel density estimate of 'area_mean' on a 16x8 inch figure.
fig, ax = plt.subplots(figsize=(16, 8))
sns.kdeplot(df['area_mean'], ax=ax)

In [None]:
# Scatter of mean radius against mean area, coloured by diagnosis.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_xlabel('mean_radius')
ax.set_ylabel('mean_area')
sns.scatterplot(
    x=df['radius_mean'],
    y=df['area_mean'],
    hue=df['diagnosis'],
    ax=ax,
)

*the data shows that the smaller the radius, the more likely the tumor is to be non-cancerous*

In [None]:
# Scatter of mean concavity against mean compactness, coloured by diagnosis.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_xlabel('mean concavity')
ax.set_ylabel('mean compactness')
sns.scatterplot(
    x=df['concavity_mean'],
    y=df['compactness_mean'],
    hue=df['diagnosis'],
    ax=ax,
)

In [None]:
# Scatter of mean texture against mean fractal dimension, coloured by diagnosis.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_xlabel('mean_texture')
ax.set_ylabel('mean_dimension')
sns.scatterplot(
    x=df['texture_mean'],
    y=df['fractal_dimension_mean'],
    hue=df['diagnosis'],
    ax=ax,
)

In [None]:
# Heatmap of the pairwise correlations between the numeric feature columns.
# 'diagnosis' is still a string column when this cell runs, and pandas 2.0+
# raises on non-numeric columns in DataFrame.corr(), so only numeric columns
# are passed in. Older pandas dropped such columns silently, so the plotted
# matrix is unchanged there.
plt.figure(figsize=(12, 12))
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, cmap='magma')

*most of the dimensions and measurements are of medical relevance; as I have no background in medical science, I can't study the data well enough to plot the curves that are of importance*

# converting categorical variables into binary classes :

In [None]:
def func(x):
    """Encode a diagnosis label as an integer.

    Returns 1 when ``x`` equals ``'M'`` and 0 for every other value.
    """
    return 1 if x == 'M' else 0
# Replace the 'diagnosis' column with its 0/1 encoding.
# NOTE: func returns 0 for anything other than 'M', so running this cell a
# second time (when the column already holds 0/1) turns every label into 0.
df['diagnosis'] = df['diagnosis'].apply(func)

In [None]:
# Preview the frame after encoding the 'diagnosis' column.
df.head()

In [None]:
# Separate the model inputs from the label column.
# Selecting by name instead of by position keeps the split correct even if
# the column order of the frame changes.
X = df.drop(columns='diagnosis')
y = df['diagnosis']

*we just isolated the labels and the input data for our models from each other !*

# scaling and splitting the data into training and test examples :

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

In [None]:
# Hold out 33% of the rows as the test split; the seed is fixed at 101.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=101,
)

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# scaler to both the training and the test split.
scaler = StandardScaler()
scaler.fit(x_train)

scaled_train = scaler.transform(x_train)
scaled_test = scaler.transform(x_test)

In [None]:
# Show the shape of the scaled training data.
scaled_train.shape

In [None]:
# Show the shape of the scaled test data.
scaled_test.shape

# using logistic regression :

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a logistic regression with default settings on the scaled training
# data, then predict labels for the scaled test data.
lr_model = LogisticRegression().fit(scaled_train, y_train)
preds = lr_model.predict(scaled_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [None]:
# Confusion matrix of the current predictions against the test labels.
print(confusion_matrix(y_test,preds))

In [None]:
# Classification report of the current predictions against the test labels.
print(classification_report(y_test,preds))

# using PCA for dimensionality reduction :

In [None]:
from sklearn.decomposition import PCA

# PCA configured to keep three components.
pca = PCA(n_components = 3)

In [None]:
# Fit the PCA on the scaled training data and project it in one step.
reduced_train = pca.fit_transform(scaled_train)

# Project the scaled test data with the PCA fitted above (no re-fitting).
reduced_test = pca.transform(scaled_test)

In [None]:
# Show the shape of the PCA-projected training data.
reduced_train.shape

In [None]:
# PCA columns 0 and 1 of the training projection, coloured by training label.
pc_x = reduced_train[:, 0]
pc_y = reduced_train[:, 1]
sns.scatterplot(x=pc_x, y=pc_y, hue=y_train)

In [None]:
# PCA columns 0 and 2 of the training projection, coloured by training label.
pc_x = reduced_train[:, 0]
pc_y = reduced_train[:, 2]
sns.scatterplot(x=pc_x, y=pc_y, hue=y_train)

In [None]:
# PCA columns 1 and 2 of the training projection, coloured by training label.
pc_x = reduced_train[:, 1]
pc_y = reduced_train[:, 2]
sns.scatterplot(x=pc_x, y=pc_y, hue=y_train)

# using support vector classifier :

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier with default settings.
svc = SVC() 

In [None]:
# Train the SVC on the PCA-projected training data.
svc.fit(reduced_train,y_train)

In [None]:
# Predict labels for the PCA-projected test data.
preds = svc.predict(reduced_test)

In [None]:
# Classification report of the current predictions against the test labels.
print(classification_report(y_test,preds))

In [None]:
# Confusion matrix of the current predictions against the test labels.
print(confusion_matrix(y_test,preds))

# using random forest classifier :

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Sweep the number of trees from 1 to 99 on the scaled features and record
# the test-set accuracy for each forest size.
# A fixed random_state makes the curve reproducible between runs; without it
# every run grows different forests and the curve changes each time.
# NOTE(review): accuracy is measured on the test split, so choosing
# n_estimators from this curve tunes the model on the test data.
i_vals = list(range(1, 100))
accuracy = []
for i in i_vals:
    forest = RandomForestClassifier(n_estimators=i, random_state=101)
    forest.fit(scaled_train, y_train)
    preds = forest.predict(scaled_test)
    accuracy.append(accuracy_score(y_test, preds))

In [None]:
# Accuracy recorded in the sweep, plotted against the number of trees.
fig, ax = plt.subplots(figsize=(14, 3))
ax.plot(i_vals, accuracy)

In [None]:
# Train a 40-tree forest on the scaled features and report test metrics.
# The seed is fixed so the printed report is the same on every run; an
# unseeded forest gives different numbers each time the cell is executed.
forest = RandomForestClassifier(n_estimators=40, random_state=101)
forest.fit(scaled_train, y_train)

preds = forest.predict(scaled_test)

print(classification_report(y_test, preds))

In [None]:
# Sweep the number of trees from 1 to 99 on the PCA-projected features and
# record the test-set accuracy for each forest size.
# A fixed random_state makes the curve reproducible between runs; without it
# every run grows different forests and the curve changes each time.
# NOTE(review): accuracy is measured on the test split, so choosing
# n_estimators from this curve tunes the model on the test data.
i_vals = list(range(1, 100))
accuracy = []
for i in i_vals:
    forest = RandomForestClassifier(n_estimators=i, random_state=101)
    forest.fit(reduced_train, y_train)
    preds = forest.predict(reduced_test)
    accuracy.append(accuracy_score(y_test, preds))

In [None]:
# Accuracy recorded in the sweep, plotted against the number of trees.
fig, ax = plt.subplots(figsize=(14, 3))
ax.plot(i_vals, accuracy)

In [None]:
# Train a 35-tree forest on the PCA-projected features and report test metrics.
# The seed is fixed so the printed report is the same on every run; an
# unseeded forest gives different numbers each time the cell is executed.
forest = RandomForestClassifier(n_estimators=35, random_state=101)
forest.fit(reduced_train, y_train)

preds = forest.predict(reduced_test)

print(classification_report(y_test, preds))

# constructing neural network for classification purpose :

In [None]:
from tensorflow.keras.models import Sequential

from tensorflow.keras.layers import Dense, Dropout

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Show the shape of the scaled test data before building the network.
scaled_test.shape

In [None]:
# Fully connected binary classifier: three ReLU hidden layers (30, 15, 7
# units) followed by a single sigmoid output unit. No input shape is declared
# in this cell.
model = Sequential([
    Dense(30, activation='relu'),
    Dense(15, activation='relu'),
    Dense(7, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss, Adam optimizer, accuracy tracked under the name 'acc'.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [None]:
# Early stopping watches 'val_acc' with a patience of 100 epochs; training is
# capped at 600 epochs.
# NOTE(review): the test split is passed as validation data here, and the
# same split is used for the metrics reported afterwards.
stop = [EarlyStopping(monitor='val_acc', patience=100)]

model.fit(
    scaled_train,
    y_train,
    validation_data=(scaled_test, y_test),
    epochs=600,
    callbacks=stop,
)

In [None]:
# Plot every metric recorded during training, one line per metric.
# DataFrame.plot opens its own figure, so the size is passed to it directly;
# a separate plt.figure(...) call beforehand only leaves an empty figure behind.
pd.DataFrame(model.history.history).plot(figsize=(15, 8))

In [None]:
# Turn the sigmoid outputs into 0/1 class labels with a 0.5 threshold.
# Sequential.predict_classes was removed in TensorFlow 2.6; thresholding the
# output of predict() is the replacement for a single sigmoid unit.
# ravel() flattens the (n, 1) column into a 1-D label vector.
preds = (model.predict(scaled_test) > 0.5).astype('int32').ravel()

In [None]:
# Classification report of the network's predictions against the test labels.
print(classification_report(y_test,preds))

In [None]:
# Fully connected binary classifier: three ReLU hidden layers (3, 8, 2 units)
# followed by a single sigmoid output unit. No input shape is declared in
# this cell.
model = Sequential([
    Dense(3, activation='relu'),
    Dense(8, activation='relu'),
    Dense(2, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss, Adam optimizer, accuracy tracked under the name 'acc'.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [None]:
# Early stopping watches 'val_acc' with a patience of 100 epochs; training on
# the PCA-projected data is capped at 600 epochs.
# NOTE(review): the test split is passed as validation data here, and the
# same split is used for the metrics reported afterwards.
stop = [EarlyStopping(monitor='val_acc', patience=100)]

model.fit(
    reduced_train,
    y_train,
    validation_data=(reduced_test, y_test),
    epochs=600,
    callbacks=stop,
)

In [None]:
# Turn the sigmoid outputs into 0/1 class labels with a 0.5 threshold.
# Sequential.predict_classes was removed in TensorFlow 2.6; thresholding the
# output of predict() is the replacement for a single sigmoid unit.
# ravel() flattens the (n, 1) column into a 1-D label vector.
preds = (model.predict(reduced_test) > 0.5).astype('int32').ravel()

In [None]:
# Classification report of the network's predictions against the test labels.
print(classification_report(y_test,preds))