In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of each file in it.
for folder, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
# Apply seaborn's default plot styling to the figures drawn below.
sns.set()

In [None]:
# Remove the column cap so wide frames are displayed in full.
pd.set_option('display.max_columns', None)

In [None]:
# Load the Breast Cancer Wisconsin CSV from the Kaggle input directory.
df = pd.read_csv('../input/breast-cancer-wisconsin-data/data.csv')
# Preview the last rows to check that the file parsed as expected.
df.tail()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
def show_info(data):
    """Print a structural overview of ``data`` and return its null counts.

    ``data.info()`` and ``print`` both write to stdout and return ``None``,
    so the first and last items of the returned tuple are always ``None``;
    only the middle item carries information.

    Args:
        data: A pandas DataFrame.

    Returns:
        A 3-tuple ``(None, null_counts, None)`` where ``null_counts`` is the
        per-column count of missing values.
    """
    info_result = data.info()  # prints the dtype / non-null table
    null_counts = data.isna().sum()
    shape_result = print('Data shape:', data.shape)
    return info_result, null_counts, shape_result

In [None]:
# Print the dtype / non-null overview and the shape of the raw frame;
# the cell output shows the tuple returned by the helper.
show_info(df)

In [None]:
# Keep a handle on the diagnosis column and turn it into a plain Python list
# so it can be passed to the hand-written counter() function.
cat_data = df['diagnosis']
array = np.array(cat_data)
full_list = list(array)

In [None]:
# A counter function. Similar to .value_counts()
def counter(data):
    """Tally how many times each element occurs in ``data``.

    Args:
        data: Any iterable of hashable elements.

    Returns:
        A dict mapping each distinct element to its number of occurrences,
        with keys in first-seen order.
    """
    tallies = {}
    for item in data:
        tallies[item] = tallies.get(item, 0) + 1
    return tallies

In [None]:
# Tally the diagnosis labels and print the 'M' and 'B' counts.
data_counter = counter(full_list)
print(f"{data_counter['M']} ; {data_counter['B']}")

In [None]:
# Class balance of the raw diagnosis labels. The Series is passed as `x=`
# because seaborn 0.12+ treats the first positional argument as `data`,
# and older releases only accept a positional vector with a warning.
sns.countplot(x=cat_data)

In [None]:
# Map Malignant to 0 and Benign to 1 (Targets).
# Guarded so that re-running this cell is a no-op: pushing the already
# encoded 0/1 column through the same mapping would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['diagnosis']):
    df['diagnosis'] = df['diagnosis'].map({'M': 0, 'B': 1})
diagnosis = df['diagnosis']

In [None]:
# One histogram per numeric column (50 bins each) to eyeball the distributions.
df.hist(bins = 50, figsize=(30,20))
plt.show()

In [None]:
# Hold out 20% of the rows; random_state fixes the shuffle so the split is reproducible.
# NOTE(review): train_set / test_set are not referenced by the modelling cells
# further down, which fit and score on the full frame; confirm whether that is intended.
train_set, test_set = train_test_split(df, test_size = 0.2, random_state = 42)

In [None]:
def train_test_size_ratio(train_set, test_set):
    """Return the share of rows in each split as whole percentages of the total.

    Args:
        train_set: Object exposing ``.shape``; ``shape[0]`` is its row count.
        test_set: Same, for the held-out split.

    Returns:
        ``(test_percentage, train_percentage)``: two ints that sum to 100.

    Raises:
        ZeroDivisionError: If both splits are empty.
    """
    train_rows = train_set.shape[0]
    test_rows = test_set.shape[0]
    # Divide by the combined row count. Dividing by train_rows alone yields
    # the test-to-train ratio (25 for an 80/20 split), not the test share.
    total_rows = train_rows + test_rows
    test_percentage = round(test_rows / total_rows * 100)
    train_percentage = 100 - test_percentage
    return test_percentage, train_percentage

In [None]:
# Show the split sizes; the helper returns (test %, train %) in that order.
print('Test-Train ratio:', train_test_size_ratio(train_set,test_set))

In [None]:
# Pairwise correlations between the columns of df, rounded to one decimal place.
# NOTE(review): `id` and `Unnamed: 32` are only dropped in a later cell, so
# presumably they are still part of this matrix; confirm that is acceptable.
correlation_matrix = df.corr().round(1)

In [None]:
# Correlation of every column with the encoded diagnosis, largest value first.
correlation_matrix['diagnosis'].sort_values(ascending = False) 

In [None]:
# Hide the upper triangle (diagonal included): the matrix is symmetric, so
# those cells would only repeat what the lower triangle already shows.
mask = np.triu(np.ones_like(correlation_matrix, dtype= bool))

sns.set_style(style = 'white')
f, ax = plt.subplots(figsize=(70, 20))
plt.title('Attributes Correlation',fontsize = 20)
# Diverging palette so negative and positive correlations get distinct hues.
cmap = sns.diverging_palette(10, 250, as_cmap=True)

# Correlations range over [-1, 1]. The colour scale must cover that whole
# interval, otherwise every negative value is clipped to the same colour.
sns.heatmap(correlation_matrix, mask=mask, cmap=cmap, annot = True,
            square=True, vmin = -1, vmax = 1,linewidths=.5, ax=ax)

In [None]:
# Columns singled out for a closer look (plus `diagnosis` itself) because of
# their strong linear correlation with the diagnosis.
# NOTE(review): the name `attributes` is rebound to a DataFrame in a later cell,
# so re-running the pairplot cell after that point will no longer see this list.
attributes = ['radius_worst','diagnosis','concave points_mean',
              'perimeter_worst','concave points_worst']

In [None]:
# Pairwise relationships between the columns selected as promising.
plot = sns.pairplot(data = df[attributes])

In [None]:
# Inspect the available column labels.
df.columns

In [None]:
# Target vector: the diagnosis column of df.
targets = df['diagnosis']

In [None]:
# Remove the identifier, the `Unnamed: 32` column and the target from the frame.
drop_columns = ['id', 'Unnamed: 32', 'diagnosis']
attributes = df.drop(columns=drop_columns)
attributes.head()

In [None]:
# Learn the per-column statistics used for standardisation.
# NOTE(review): the scaler is fitted on every row of `attributes`, not only on a training subset.
scaler = StandardScaler()
scaler.fit(attributes) # Fit only; the scaling itself is applied by transform()

In [None]:
# Apply the fitted scaler; the result is later wrapped back into a DataFrame with column labels.
scaled_numerical = scaler.transform(attributes)

In [None]:
# Wrap the scaled array back into a DataFrame. The column labels are taken
# from the frame that was scaled, so they cannot drift out of sync with it
# the way a hand-typed list of names could.
df_scaled_numerical = pd.DataFrame(data = scaled_numerical,
                                   columns = attributes.columns)
df_scaled_numerical.head()

# Logistic Regression

In [None]:
# Logistic regression classifier with scikit-learn's default hyper-parameters.
log_reg = LogisticRegression() 

In [None]:
# NOTE(review): the model is trained on every row of the scaled frame; the
# train/test split created earlier is not used here.
log_reg.fit(df_scaled_numerical, targets) #We train the model

In [None]:
# Predict on the same DataFrame the model was fitted on. Passing the bare
# NumPy array instead makes scikit-learn warn that X lacks the feature
# names it saw during fit. These are in-sample predictions.
predictions = log_reg.predict(df_scaled_numerical)
predictions

In [None]:
# Mean accuracy as a percentage, rounded to three decimals.
# NOTE(review): measured on the same rows the model was trained on, so this
# is an in-sample figure rather than a held-out estimate.
score = log_reg.score(df_scaled_numerical, targets)*100
score.round(3)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion Matrix

In [None]:
# Rows are true labels and columns are predicted labels, ordered by label
# value (0 = malignant, 1 = benign, per the encoding applied to `diagnosis`).
conf = confusion_matrix(targets,predictions)
conf