In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sbn



In [None]:
from matplotlib.pylab import rcParams
# Default figure size (width, height) applied to every plot drawn below.
rcParams["figure.figsize"] = 10,6


 # Loading of Dataset

In [None]:
# Read the raw dataset; "?" entries in the CSV are parsed as missing values.
bCancerDf = pd.read_csv("breast-cancer_csv.csv", na_values="?")
# Preview the first ten rows.
bCancerDf.head(n=10)



In [None]:
##Age to Ordinal Data
##menopause to Categorical
##Tumor size ordinal
##inv-nodes to ordinal
##Node-caps to ordinal
##deg-malig to ordinal
##breast to categorical
##irradiat to categorical



In [None]:
# (rows, columns) of the loaded frame.
bCancerDf.shape


 # Checking For Null Values

In [None]:
# Number of missing values in each column.
bCancerDf.isna().sum()


 # Descriptive Statistics of Data For Categorical And Numeric Columns

In [None]:
# Summary of the string-typed (object) columns.
# The builtin `object` is used instead of the `np.object` alias, which NumPy
# deprecated in 1.20 and removed in 1.24 (it was only ever an alias of `object`).
bCancerDf.describe(include=object)



In [None]:
# Summary statistics restricted to the numeric columns.
bCancerDf.describe(include=np.number)


 # Fix Some Columns With Values Not In Right Format

 Age, Tumor Nodes (inv-nodes) & Tumor Size are recorded as ranges. To simplify things we replace each range with the average of its two bounds, rounded up. Even though this changes the data, we don't think it will have much impact.

In [None]:
def toInt(data,col_name):
    """Convert "low-high" range strings into their rounded-up midpoints.

    Parameters
    ----------
    data : iterable
        Values of the form ``"<low>-<high>"`` with integer bounds. Missing
        entries (``NaN`` / ``None``) are allowed and stay missing in the
        result instead of raising on ``.split``.
    col_name : str
        Name given to the single column of the returned frame.

    Returns
    -------
    pandas.DataFrame
        One float column named ``col_name`` holding
        ``ceil((low + high) / 2)`` for each input value, with a default
        integer index.

    Raises
    ------
    ValueError
        If a non-missing value does not split into exactly two integers.
    """
    midpoints = []
    for val in data:
        if pd.isna(val):
            # The loader maps "?" to NaN, so keep the gap instead of crashing.
            midpoints.append(np.nan)
            continue
        low, high = val.split("-")
        midpoints.append(np.ceil((int(low) + int(high)) / 2))
    return pd.DataFrame(midpoints, columns=[col_name], dtype=float)



In [None]:
# Columns whose values are range strings and need a numeric midpoint.
tobe_fixed = ["age", "tumor-size", "inv-nodes"]
# One single-column frame per range column, in the order listed above.
new_dframes = [toInt(bCancerDf[name].values, name) for name in tobe_fixed]
# Put the converted columns side by side to start the modelling frame.
final_bCancerDf = pd.concat(new_dframes, axis=1)



In [None]:
# Preview the converted range columns.
final_bCancerDf.head()


 # Dropping Of Breast-Quad Column

 On Investigation, we found that breast-quadrants were incorrectly assigned to "RIGHT" even though the left breast was impacted.

In [None]:
# Remove the unreliable quadrant column; every other column is kept as is.
bCancerDf = bCancerDf.drop(columns=["breast-quad"])


 # Fixing Missing Values

In [None]:
from sklearn.impute import SimpleImputer



In [None]:
# Imputer that fills missing entries with the most frequent value of the column.
myImputer = SimpleImputer(strategy="most_frequent")



In [None]:
# Fit the imputer on the node-caps column and write the filled values back.
node_caps_filled = myImputer.fit_transform(bCancerDf.loc[:, ["node-caps"]])
bCancerDf.loc[:, ["node-caps"]] = node_caps_filled



 # Converting Categorical Columns To Numeric Columns

In [None]:
def _one_hot(column):
    """Return indicator columns for ``bCancerDf[column]``, prefixed with the column name.

    The first category is dropped (``drop_first=True``) so it acts as the baseline.
    """
    return pd.get_dummies(data=bCancerDf[column], prefix=column, drop_first=True)


breast = _one_hot("breast")
menopause = _one_hot("menopause")
irradiat = _one_hot("irradiat")
node_caps = _one_hot("node-caps")



In [None]:
# Assemble the modelling frame: the numeric range midpoints, the one-hot
# encoded categoricals, the malignancy degree, and the target ("Class") as the
# LAST column.
# `axis` is passed by keyword: pandas 2.0 made every `concat` argument after
# `objs` keyword-only, so the former positional `1` raises a TypeError there.
final_bCancerDf = pd.concat(
    [
        final_bCancerDf,
        menopause,
        node_caps,
        breast,
        irradiat,
        bCancerDf["deg-malig"],
        bCancerDf["Class"],
    ],
    axis=1,
)



In [None]:
from sklearn.preprocessing import LabelBinarizer



In [None]:
# Encoder used to turn the class labels into numeric indicators.
labeler = LabelBinarizer()



In [None]:
# Encode the target labels ("Class", the last column of the frame) as 0/1.
# The encoded values are assigned as a whole new column rather than written
# through `.iloc`: an in-place write can leave the column with `object` dtype on
# recent pandas, and scikit-learn rejects such a target as an unknown label type.
# `.ravel()` flattens the (n_samples, 1) indicator matrix returned for two classes.
final_bCancerDf["Class"] = labeler.fit_transform(final_bCancerDf["Class"]).ravel()



In [None]:
# Preview the frame after encoding the target.
final_bCancerDf.head()



 # Scaling of Features Excluding The Target to Deal With Euclidean Distance

In [None]:
from sklearn.preprocessing import RobustScaler



In [None]:
# Scaler with default settings; applied to the feature columns below.
scaler = RobustScaler()



In [None]:
# Scale every feature column (everything except the trailing target column).
# Assigning by column label replaces the columns outright, so the scaled floats
# are not forced into the integer/boolean dtypes of the dummy columns, which an
# in-place `.iloc` write attempts on recent pandas.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into training. Consider fitting it on the
# training rows only.
feature_cols = final_bCancerDf.columns[:-1].tolist()
final_bCancerDf[feature_cols] = scaler.fit_transform(final_bCancerDf[feature_cols])



In [None]:
# Preview the frame after scaling the features.
final_bCancerDf.head()


 # Checking For Correlation

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap.
correlations = final_bCancerDf.corr()
sbn.heatmap(correlations, annot=True, linewidths=0.5, robust=True)


# Preparing Of Data For Training And Testing

In [None]:
from sklearn.model_selection import train_test_split



In [None]:
# Features are every column but the last; the target is the last column.
features = final_bCancerDf.iloc[:, :-1].values
target = final_bCancerDf.iloc[:, -1].values.ravel()
# Fixed seed so the same rows land in each split on every run.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(features, target, random_state=42)


# Supervised ML

In [None]:
from sklearn.metrics import confusion_matrix

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Using GridSearchCV To Find The Best Parameters For Our RandomForestClassifier

In [None]:
# Base estimator; its hyper-parameters are set by the grid search below.
forestClassifier = RandomForestClassifier()

In [None]:
# Search space for the random forest: two grids that differ only in the
# class-weighting scheme; every other hyper-parameter list is shared.
parameters = [
    {
        "n_estimators": [10, 15, 20],
        "criterion": ["entropy"],
        "random_state": [32, 42],
        "class_weight": [weighting],
    }
    for weighting in ("balanced", "balanced_subsample")
]

In [None]:
# Exhaustive search over `parameters`, scored by accuracy with 17-fold
# cross-validation, using all available CPU cores.
gridSearch = GridSearchCV(
    estimator=forestClassifier,
    param_grid=parameters,
    scoring="accuracy",
    cv=17,
    n_jobs=-1,
)

In [None]:
# Run the cross-validated search on the training split.
gridSearch.fit(Xtrain, Ytrain)

In [None]:
# Best mean cross-validated score and the parameter combination that achieved it.
best_score = gridSearch.best_score_
bes_params = gridSearch.best_params_

In [None]:
# Winning hyper-parameters.
bes_params

In [None]:
# Cross-validated accuracy of the winning hyper-parameters.
best_score

In [None]:
# Predict the held-out test split with the fitted grid search.
grid_pred = gridSearch.predict(Xtest)

In [None]:
# Confusion matrix of the random-forest predictions (rows: true class, columns: predicted class).
grid_cnn = confusion_matrix(y_true=Ytest, y_pred=grid_pred)
grid_cnn

# Using Naive Bayes

In [None]:
# Gaussian Naive Bayes with default settings, used as a simple baseline.
naiveClassifier = GaussianNB()

In [None]:
# Fit the baseline on the training split.
naiveClassifier.fit(Xtrain,Ytrain)

In [None]:
# Mean accuracy of the baseline on the held-out test split.
naiveClassifier.score(Xtest,Ytest)


In [None]:
# Confusion matrix of the Naive Bayes predictions on the test split.
naive_pred = naiveClassifier.predict(Xtest)
naive_cnn = confusion_matrix(Ytest, naive_pred)
naive_cnn

 # Setup For Deep Learning Model

In [None]:
from keras.models import Sequential
from keras.layers import Dense,Dropout,BatchNormalization
from keras.activations import relu,sigmoid
from keras.optimizers import Adam



In [None]:
# Empty sequential network; layers are appended in the next cell.
cancerRecallModel = Sequential()



In [None]:
# Fully connected binary classifier: five hidden ReLU layers followed by a
# single sigmoid output unit. Each of the first four hidden layers is followed
# by 50% dropout and batch normalisation; the fifth by batch normalisation only.
# The input width is read from the training matrix instead of being hard-coded
# (it was fixed at 9), so the network keeps matching the feature frame if the
# set of encoded columns changes.
cancerRecallModel.add(Dense(150,activation=relu,input_dim=Xtrain.shape[1]))
cancerRecallModel.add(Dropout(0.5))
cancerRecallModel.add(BatchNormalization())
cancerRecallModel.add(Dense(100,activation=relu))
cancerRecallModel.add(Dropout(0.5))
cancerRecallModel.add(BatchNormalization())
cancerRecallModel.add(Dense(80,activation=relu))
cancerRecallModel.add(Dropout(0.5))
cancerRecallModel.add(BatchNormalization())
cancerRecallModel.add(Dense(80,activation=relu))
cancerRecallModel.add(Dropout(0.5))
cancerRecallModel.add(BatchNormalization())
cancerRecallModel.add(Dense(50,activation=relu))
cancerRecallModel.add(BatchNormalization())
cancerRecallModel.add(Dense(1,activation=sigmoid))
# Print the layer-by-layer parameter counts.
cancerRecallModel.summary()



In [None]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# alongside the loss.
# The learning rate is passed positionally: its keyword was renamed from `lr`
# to `learning_rate` across Keras releases, while it is the first positional
# parameter of Adam under both spellings.
cancerRecallModel.compile(Adam(0.001),loss="binary_crossentropy",metrics=["accuracy"])


 # Running Of Model

In [None]:
# Train for 100 epochs, holding out 20% of the training data for validation.
# NOTE(review): no Keras seed is set in the visible cells, so results may vary
# between runs — confirm whether reproducibility is required.
cancerRecallModel.fit(x=Xtrain,y=Ytrain,epochs=100,validation_split=0.2)


 # Checks To See How Well Model Did

In [None]:
# Names of the per-epoch metrics recorded during training.
cancerRecallModel.history.history.keys()



In [None]:
# Keras records the accuracy metric as "acc" in older releases and as
# "accuracy" in newer ones; use whichever key this run actually produced
# instead of failing with a KeyError.
fit_history = cancerRecallModel.history.history
acc_key = "acc" if "acc" in fit_history else "accuracy"
# Each figure is the mean over ALL training epochs, not the final-epoch value.
validation_loss = np.mean(fit_history["val_loss"])
validation_acc = np.mean(fit_history["val_" + acc_key])
train_loss = np.mean(fit_history["loss"])
train_acc = np.mean(fit_history[acc_key])



In [None]:
# Validation accuracy averaged over the training epochs.
validation_acc



In [None]:
# Validation loss averaged over the training epochs.
validation_loss



In [None]:
# Training loss averaged over the training epochs.
train_loss



In [None]:
# Training accuracy averaged over the training epochs.
train_acc



In [None]:
# `Sequential.predict_classes` was removed from recent Keras/TensorFlow releases.
# For a single sigmoid output it thresholded the predicted value at 0.5, which
# is reproduced here with `predict`, so the cell runs on old and new versions.
pred_classes = (cancerRecallModel.predict(Xtest) > 0.5).astype("int32")



In [None]:
# Confusion matrix of the neural-network predictions on the test split.
cnn =confusion_matrix(Ytest,pred_classes)



In [None]:
# Display the neural-network confusion matrix.
cnn

