# 1. Importing libraries, packages and themes

In [None]:
# Data manipulation

import pandas as pd
import numpy as np

				# Data visualisation

# 1) matplotlib & seaborn 

import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns 

				# Default plotting theme: seaborn 'darkgrid' style with the 'colorblind' palette,
				# then matplotlib defaults of 8x8 figures and a base font size of 15.
sns.set(context='notebook', style='darkgrid', palette='colorblind', font='sans-serif', font_scale=1, rc=None)
matplotlib.rcParams['figure.figsize'] =[8,8]
matplotlib.rcParams.update({'font.size': 15})

import warnings
# Suppress every warning for the rest of the notebook. NOTE(review): this also
# hides deprecation and convergence warnings raised by the modelling cells below.
warnings.filterwarnings("ignore")


# 2. Read data & data analysis

#### PS1: first we read the data, then drop the `id` column because it carries no predictive information

In [None]:
# Load the dataset and discard the 'id' column in one step, then preview the first rows.
df = pd.read_csv('../input/breast-cancer-wisconsin-data/data.csv').drop(columns='id')
df.head()

In [None]:
# Show every column name as a plain Python list.
print(list(df.columns))

Attribute Information:

1. Diagnosis (M = malignant, B = benign)

2-31)

Ten real-valued features are computed for each cell nucleus:

    a) radius (mean of distances from center to points on the perimeter)
    b) texture (standard deviation of gray-scale values)
    c) perimeter
    d) area
    e) smoothness (local variation in radius lengths)
    f) compactness (perimeter^2 / area - 1.0)
    g) concavity (severity of concave portions of the contour)
    h) concave points (number of concave portions of the contour)
    i) symmetry
    j) fractal dimension ("coastline approximation" - 1)

PS:

The mean, standard error and "worst" or largest (mean of the three
largest values) of these features were computed for each image,
resulting in 30 features. For instance, field 3 is Mean Radius, field
13 is Radius SE, field 23 is Worst Radius.

All feature values are recoded with four significant digits.

Missing attribute values: none

Class distribution: 357 benign, 212 malignant

In [None]:
# Print a per-column summary of df (non-null counts and dtypes) to spot empty columns.
df.info()

#### PS2: as we can see, the last column is totally empty, so I am going to drop it

In [None]:
# Remove the trailing 'Unnamed: 32' column.
df = df.drop(columns=['Unnamed: 32'])

In [None]:
# Report the table dimensions after the column drops.
n_rows, n_cols = df.shape
print('We have ', n_rows, 'Rows & ', n_cols, ' Columns')

In [None]:
# Summary statistics for every column; include='all' also covers the non-numeric 'diagnosis' column.
df.describe(include='all')

### What can we see at first sight?
1. target values : 
    * B = benign is the most frequent value with 357
    * M = malignant 569-357 = 212
       

# 3. data cleaning 

## A. missing values

In [None]:
# Null count per column, and that count as a percentage of all rows.
miss = df.isna().sum()
mis_perncent = miss.div(df.shape[0]).mul(100)

# Assemble the counts, the percentages and the column dtypes side by side.
data = {'missing data': miss, 'missing data %': mis_perncent, 'data type': df.dtypes}
miss_tab = pd.DataFrame(data)
miss_tab

### As we can see, there are no missing values

## B. Duplicate values

In [None]:
# get duplicate Rows
dup = df.duplicated()
df[dup]

### There are no duplicate rows

# 5. Data visualization

## A. histogram of data

In [None]:
# One 30-bin histogram per numeric column, drawn on a single 15x20 figure.
df.hist(bins=30, edgecolor='black', figsize=(15, 20))
plt.show()

In [None]:
# Bar chart of how many samples fall into each diagnosis class.
sns.countplot(data=df, x='diagnosis')
plt.show()

## B. boxplot

### a. standardization

In [None]:
# Separate the target from the predictors. `x` and `y` are reused by the
# feature-importance cell further down.
y = df.diagnosis                          # M or B
x = df.drop('diagnosis', axis=1)
data_dia = y
data = x

# z-score every feature (subtract the column mean, divide by the column
# standard deviation) so that all features can share one plot axis.
data_n_2 = (data - data.mean()) / (data.std())

# Keep the target plus the first ten standardized columns, then reshape to
# long form (one row per sample/feature pair) as expected by seaborn.
data = pd.concat([y, data_n_2.iloc[:, 0:10]], axis=1)
data = pd.melt(data,
               id_vars="diagnosis",
               var_name="features",
               value_name='value')

### b. violinplot

In [None]:
# Distribution of each standardized feature, both classes pooled together.
plt.figure(figsize=(10, 10))
sns.violinplot(data=data, x="features", y="value")
plt.xticks(rotation=90);

In [None]:
# Same violins, split by diagnosis so the two classes can be compared per feature.
plt.figure(figsize=(10, 10))
sns.violinplot(data=data, x="features", y="value", hue="diagnosis")
plt.xticks(rotation=90);

### c. boxplot

In [None]:
# Box plot of each standardized feature, both classes pooled together.
plt.figure(figsize=(10, 10))
sns.boxplot(data=data, x="features", y="value")
plt.xticks(rotation=90);

In [None]:
# Same box plots, split by diagnosis.
plt.figure(figsize=(10, 10))
sns.boxplot(data=data, x="features", y="value", hue="diagnosis")
plt.xticks(rotation=90);

# 6. Feature selection

In [None]:
# important features
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier()
model.fit(x,y)
feat_imp = pd.Series(model.feature_importances_,index=x.columns)
feat_imp.nlargest(15).plot(kind='barh')
plt.title('the most  important deature are')
plt.show()

## data transformation

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the string diagnosis labels with integer codes, in place on df.
e = LabelEncoder()
df['diagnosis'] = e.fit_transform(df['diagnosis'])
df['diagnosis']

In [None]:
# split data 
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score , confusion_matrix

In [None]:
# Display the column index of df before building the feature matrix.
df.columns

In [None]:
# Feature matrix = every column except the (now encoded) target.
y = df['diagnosis']
x = df.drop(columns=['diagnosis'])
print(x.shape, y.shape)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0
)

####  Feature scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training split only.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)

# Apply the *training* statistics to the test split. Refitting the scaler on
# the test data would scale the two splits differently and leak test-set
# information into preprocessing.
x_test = sc.transform(x_test)

In [None]:
## logistic regression

from sklearn.linear_model import LogisticRegression

# Train with the library's default settings, then predict the held-out split.
model = LogisticRegression().fit(x_train, y_train)
y_pred = model.predict(x_test)

# Share of test samples classified correctly.
acc_log = accuracy_score(y_true=y_test, y_pred=y_pred)
print(acc_log)


In [None]:
## KNN

from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 9 closest training points.
model = KNeighborsClassifier(n_neighbors=9).fit(x_train, y_train)
y_pred = model.predict(x_test)

# Share of test samples classified correctly.
acc_knn = accuracy_score(y_true=y_test, y_pred=y_pred)
print(acc_knn)

In [None]:
# Test accuracy of k-NN for every k from 2 through 29, plotted against k.
# NOTE(review): the scores come from the test split, so choosing k from this
# curve tunes the model on test data.
list1 = []
for neighbors in range(2, 30):
    classifier = KNeighborsClassifier(metric='minkowski', n_neighbors=neighbors)
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    list1 += [accuracy_score(y_test, y_pred)]

plt.plot(list(range(2, 30)), list1)
plt.show()


In [None]:
# svm

from sklearn.svm import SVC

# Support-vector classifier with regularization strength C=0.8.
model = SVC(C=0.8)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

# Store the score under its own name: writing it to `acc_log` would overwrite
# the logistic-regression accuracy computed earlier.
acc_svm = accuracy_score(y_test, y_pred)
print(acc_svm)

In [None]:
# Test accuracy of an RBF-kernel SVM for each candidate value of C, plotted against C.
# NOTE(review): the scores come from the test split, so choosing C from this
# curve tunes the model on test data.
c_values = [0.5,0.6,0.7,0.8,0.9,1.0,1.1,1.2,1.3,1.4,1.5]
list1 = []
for c in c_values:
    classifier = SVC(kernel='rbf', C=c, random_state=0)
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    list1 += [accuracy_score(y_test, y_pred)]

plt.plot(c_values, list1)
plt.show()


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-criterion decision tree capped at six leaf nodes.
classifier = DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=6, random_state=0)
classifier.fit(x_train, y_train)

# Predict the held-out split and measure the share classified correctly.
y_pred = classifier.predict(x_test)
acc_decisiontree = accuracy_score(y_true=y_test, y_pred=y_pred)

print(acc_decisiontree)



In [None]:
# Test accuracy of the decision tree for every leaf budget from 2 through 29.
# NOTE(review): the scores come from the test split, so choosing the leaf
# count from this curve tunes the model on test data.
list1 = []
for leaves in range(2, 30):
    classifier = DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=leaves, random_state=0)
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    list1 += [accuracy_score(y_test, y_pred)]

plt.plot(list(range(2, 30)), list1)
plt.show()


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Test accuracy of a random forest for every ensemble size from 20 through 29 trees.
# NOTE(review): the scores come from the test split, so choosing the size
# from this curve tunes the model on test data.
list1 = []
for estimators in range(20, 30):
    classifier = RandomForestClassifier(criterion='entropy', n_estimators=estimators, random_state=0)
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    list1 += [accuracy_score(y_test, y_pred)]

plt.plot(list(range(20, 30)), list1)
plt.show()


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Random forest of 28 entropy-criterion trees with a fixed seed.
classifier = RandomForestClassifier(random_state=0, criterion='entropy', n_estimators=28)
classifier.fit(x_train, y_train)

# Predict the held-out split and measure the share classified correctly.
y_pred = classifier.predict(x_test)
acc_randomforest = accuracy_score(y_true=y_test, y_pred=y_pred)

print(acc_randomforest)


In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Test accuracy of XGBoost for every number of boosting rounds from 10 through 29.
# NOTE(review): the scores come from the test split, so choosing the round
# count from this curve tunes the model on test data.
list1 = []
for estimators in range(10, 30, 1):
    classifier = XGBClassifier(subsample=0.7, max_depth=12, n_estimators=estimators)
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    list1 += [accuracy_score(y_test, y_pred)]

plt.plot(list(range(10, 30, 1)), list1)
plt.show()

# Final model with 22 boosting rounds.
classifier = XGBClassifier(subsample=0.7, max_depth=12, n_estimators=22)
classifier.fit(x_train, y_train)

y_pred = classifier.predict(x_test)
print(y_pred)

# Confusion matrix and accuracy on the held-out split.
cm = confusion_matrix(y_test, y_pred)
ac_xgboost = accuracy_score(y_test, y_pred)
# NOTE(review): this adds the final score to the sweep list that was plotted above.
list1.append(ac_xgboost)
print(cm)
print(ac_xgboost)


In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# CatBoost with the library's default settings.
classifier = CatBoostClassifier()
classifier.fit(x_train, y_train)

y_pred = classifier.predict(x_test)

# Confusion matrix and accuracy on the held-out split.
cm = confusion_matrix(y_test, y_pred)
ac_catboost = accuracy_score(y_test, y_pred)
# NOTE(review): this adds the score to `list1`, which at this point holds whatever an earlier cell left in it.
list1.append(ac_catboost)

print(ac_catboost)
