# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

The first two columns in the dataset store the unique ID numbers of the samples and the corresponding diagnosis (M=malignant, B=benign), respectively.

The columns 3-32 contain 30 real-value features that have been computed from digitized images of the cell nuclei, which can be used to build a model to predict whether a tumor is benign or malignant.

     1= Malignant (Cancerous) - Present (M)
     0= Benign (Not Cancerous) -Absent (B)
Ten real-valued features are computed for each cell nucleus:
1)radius (mean of distances from center to points on the perimeter)
2)texture (standard deviation of gray-scale values)
3)perimeter
4)area
5)smoothness (local variation in radius lengths)
6)compactness (perimeter^2 / area - 1.0)
7)concavity (severity of concave portions of the contour)
8)concave points (number of concave portions of the contour)
9)symmetry
10)fractal dimension ("coastline approximation" - 1)
The mean, standard error and "worst" or largest (mean of the three largest values) of these features were computed for each image, resulting in 30 features. For instance, field 3 is Mean Radius, field 13 is Radius SE, field 23 is Worst Radius.

# Importing the Data

In [None]:
# Read the Wisconsin breast-cancer CSV from the working directory into `df`.
# NOTE(review): the file name starts with a space - confirm it matches the file on disk.
DATA_FILE = " Breast Cancer Wisconsin (Diagnostic) Data Set.csv"
df = pd.read_csv(DATA_FILE)
df

In [None]:
# Preview the first five rows to get a feel for the columns.
df.head(n=5)

#The last column, "Unnamed: 32", seems to be an erroneous column in our dataset. We will probably just drop it.
#Most of the columns seem to hold numeric entries, which saves us the time of mapping the variables.
#The ID column will not help us predict the diagnosis, so we might as well drop it.

In [None]:
##renaming the column names

In [None]:
# Swap the space in the three "concave points_*" column names for an underscore.
concave_renames = {
    f"concave points_{suffix}": f"concave_points_{suffix}"
    for suffix in ("mean", "se", "worst")
}
df.rename(columns=concave_renames, inplace=True)

In [None]:
##checking for null values in columns

In [None]:
# Count the missing values in every column.
df.isna().sum()

In [None]:
# (rows, columns) of the frame as loaded, before any columns are dropped.
df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
##removing the null column and unwanted column

In [None]:
# Remove the "Unnamed: 32" column (noted earlier as erroneous/null) and the "id" column,
# which carries no predictive signal.
# One non-inplace drop with errors="ignore" keeps this cell idempotent: the original pair
# of in-place drops raised KeyError when the cell was run a second time.
df = df.drop(columns=["Unnamed: 32", "id"], errors="ignore")

#After dropping the two columns, we are now left with 31 columns

In [None]:
##using label encoder to encode the "diagnosis" column

In [None]:
# Encoder used to turn the string labels in the "diagnosis" column into integers.
label_encoder = LabelEncoder()

In [None]:
# Fit the encoder on the diagnosis labels and overwrite the column with the integer codes.
# LabelEncoder numbers the classes in sorted order, so "B" becomes 0 and "M" becomes 1.
diagnosis_codes = label_encoder.fit_transform(df["diagnosis"])
df["diagnosis"] = diagnosis_codes

In [None]:
# Display the frame to check that "diagnosis" now holds the encoded integer labels.
df

#To make our analysis easier, we have encoded the target column as:
#Malignant - 1
#Benign - 0

In [None]:
# Number of samples in each encoded diagnosis class.
df["diagnosis"].value_counts()

In [None]:
# Bar chart of the number of samples per diagnosis class.
# The column is passed by keyword: seaborn >= 0.12 treats the first positional argument
# as `data`, so a bare Series there is read as wide-form data instead of the x variable
# and the per-class counts are not drawn.
sns.countplot(data=df, x="diagnosis")

In [None]:
# One kernel-density panel per column on a 5x7 grid; each panel keeps its own x-axis,
# and tick labels are shrunk to fontsize 1 so they do not crowd the small panels.
df.plot.density(subplots=True, layout=(5, 7), sharex=False, legend=False, fontsize=1)
plt.show()

#We can see that the features appear to follow a Gaussian distribution.

In [None]:
# Annotated heatmap of the pairwise correlations between all columns, shown as whole percentages.
plt.figure(figsize=(18, 18))
pairwise_corr = df.corr()
sns.heatmap(pairwise_corr, annot=True, fmt=".0%")

In [None]:
# List the columns whose absolute correlation with "diagnosis" exceeds 0.59.
corr = df.corr()
strongly_related = corr["diagnosis"].abs() > 0.59
corr[strongly_related].index

#The columns above are the ones that show the greatest correlation with our diagnosis column.

In [None]:
# Bar chart of every feature's correlation with the encoded diagnosis target.
target_corr = df.drop(columns="diagnosis").corrwith(df["diagnosis"])
target_corr.plot(kind="bar", grid=True, figsize=(12, 10), title="Correlation with target", color="green");

#There are only a handful of columns that show negative correlation with the 'diagnosis column'
#Around half of our columns are more than 50% positively correlated to diagnosis column.
#We have to select which of the attributes we want to use in building our model!

In [None]:
# Cluster map of the features whose absolute correlation with "diagnosis" exceeds the threshold.
corr_matrix = df.corr()
threshold = 0.60
filtre = np.abs(corr_matrix["diagnosis"]) > threshold
corr_features = corr_matrix.columns[filtre].tolist()
cluster_grid = sns.clustermap(df[corr_features].corr(), annot=True, cmap="YlGnBu")
# clustermap builds its own multi-axes figure, so plt.title() would label whichever axes
# is current (the colorbar) rather than the plot. Title the figure itself instead; y > 1
# lifts the text clear of the column dendrogram.
cluster_grid.fig.suptitle(
    "Correlation Between Features w Corr Theshold 0.60",
    fontweight="bold",
    fontsize=16,
    y=1.02,
)
plt.show()

# Splitting The Data

In [None]:
# Feature matrix: every column except the target.
X = df.drop(labels=["diagnosis"], axis="columns")
X

In [None]:
# Target vector: the encoded diagnosis labels.
# Selecting by name yields a 1-D Series. The previous positional slice (`iloc[:, :1]`)
# depended on "diagnosis" being the first column and returned a one-column DataFrame,
# which makes scikit-learn estimators emit DataConversionWarning when fitted.
y = df["diagnosis"]
y

# Feature Selection

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

In [None]:
# Fit an extra-trees ensemble on all features and report each feature's importance
# as a percentage; these numbers drive the manual feature selection that follows.
select = ExtraTreesClassifier(random_state=42)
select.fit(X, y)

imp_feature = select.feature_importances_

# Print the column name next to its position so the ranking can be read without
# cross-referencing the index against a separate column listing.
for index, (column, val) in enumerate(zip(X.columns, imp_feature)):
    print(index, column, round(val * 100, 2))

# DataFrame.info() writes its report to stdout and returns None, so wrapping it in
# print() only appended a stray "None" line.
X.info()

#I used the ExtraTreesClassifier feature importances to rank the features and selected the 8 best ones, listed below.

In [None]:
# The eight hand-picked features used to train every model below.
prediction_features = [
    "radius_worst",
    "perimeter_worst",
    "concave_points_worst",
    "area_worst",
    "concave_points_mean",
    "radius_mean",
    "concavity_mean",
    "area_mean",
]
x = df.loc[:, prediction_features]
x

# Scaling the data

In [None]:
# Standardise the selected features to zero mean and unit variance.
# NOTE(review): the scaler is fitted on every row before the train/test split, so
# test-set statistics leak into the scaling - consider fitting on the training rows only.
sc = StandardScaler()
x_scaled = sc.fit(x).transform(x)

In [None]:
# Inspect the scaler output; it is wrapped back into a labelled DataFrame in the next cell.
x_scaled

#Converting the scaled data into dataframe

In [None]:
# Wrap the scaled values in a DataFrame that reuses the column labels of `x`.
scaled_columns = x.columns
x_t = pd.DataFrame(x_scaled, columns=scaled_columns)
x_t

In [None]:
# Sanity check on the scaling: every column should show a mean near 0 and a std near 1.
x_t.describe()

# Train_Test_Split

In [None]:
# Hold out 30% of the samples for evaluation; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x_t,
    y,
    test_size=0.3,
    random_state=2,
)

In [None]:
# Display the held-out feature rows produced by the split above.
x_test

# Model Fitting

# Logistic Regression

In [None]:
# Logistic regression classifier with no hyperparameters overridden.
LR = LogisticRegression()

In [None]:
# Train on the training split. `p` holds what fit() returns and is the object
# pickled at the end of the notebook.
p = LR.fit(X=x_train, y=y_train)
p

In [None]:
# Predict class labels for the held-out rows.
# `y_pred` is reassigned by each later model section, so the metric cells must run in order.
y_pred = LR.predict(x_test)
y_pred

In [None]:
# Confusion matrix for logistic regression (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

In [None]:
# Fraction of held-out samples that logistic regression classified correctly.
LR_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(LR_acc)

# KNeighborsClassifier

In [None]:
# k-nearest-neighbours classifier voting over the 2 closest samples with equal weights.
# NOTE(review): an even k can produce tied votes in a two-class problem - consider an odd k.
knn = KNeighborsClassifier(n_neighbors = 2, weights ='uniform')

In [None]:
# Fit the k-NN classifier on the training split.
knn.fit(x_train, y_train)

In [None]:
# Overwrite `y_pred` with the k-NN predictions for the held-out rows.
y_pred = knn.predict(x_test)

In [None]:
# Fraction of held-out samples that the k-NN classifier labelled correctly.
knn_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(knn_acc)

In [None]:
# Confusion matrix for the k-NN predictions (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Decision Tree classifier

In [None]:
# Create the decision tree classifier object; the fixed random_state makes
# the fitted tree reproducible between runs.
clf = DecisionTreeClassifier(random_state=10)

In [None]:
# Train the decision tree on the training split and keep the name bound to what fit() returns.
clf = clf.fit(X=x_train, y=y_train)

In [None]:
# Overwrite `y_pred` with the decision tree's predictions for the held-out rows.
y_pred = clf.predict(x_test)

In [None]:
# Fraction of held-out samples that the decision tree labelled correctly.
clf_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(clf_acc)

In [None]:
# Confusion matrix for the decision tree predictions (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Random Forest

In [None]:
# Random forest of 60 trees with a fixed seed: fit on the training split, then
# predict the held-out rows in one chained call (fit() returns the estimator itself).
rf = RandomForestClassifier(n_estimators=60, random_state=0)
y_pred = rf.fit(x_train, y_train).predict(x_test)

In [None]:
# Fraction of held-out samples that the random forest labelled correctly.
rf_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(rf_acc)

In [None]:
# Confusion matrix for the random forest predictions (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Accuracies of Models

In [None]:
# Print the four test accuracies, one per line, in the order the models were trained.
for model_accuracy in (LR_acc, knn_acc, clf_acc, rf_acc):
    print(model_accuracy)

In [None]:
# Horizontal bar chart comparing the test accuracy of the four models.
model_name = ['LogisticRegression', 'KNeighbors Classifier','Decision Tree classifer', 'Random Forest']
model_acc = [LR_acc, knn_acc, clf_acc, rf_acc]

plt.figure(figsize=(10, 5))
sns.barplot(x=model_acc, y=model_name, palette='magma')

#We can clearly see that all our models perform with more than 90% accuracy,
#where DecisionTreeClassifier has the lowest at 91.22% and LogisticRegression has the highest at 95.32%.

# Pickling files

In [None]:
import pickle

In [None]:
# Persist the fitted logistic regression model (`p`) to model.pkl.
# The context manager closes and flushes the file handle; the bare open() call
# previously left it open until garbage collection.
# NOTE(review): only the classifier is saved - the StandardScaler used on the features
# is not, so a consumer of model.pkl must reproduce the scaling itself.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(p, model_file)

In [None]:
# Reload the pickled model to confirm it round-trips; the context manager closes the
# file handle that the bare open() call previously leaked.
# Despite its name, `my_dict` holds the unpickled estimator, not a dict; the name is kept
# in case later cells refer to it.
# Security: pickle.load can execute arbitrary code - only load files you created yourself.
with open('./model.pkl', 'rb') as model_file:
    my_dict = pickle.load(model_file)
my_dict