In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every
# file found, so the dataset location can be copied into the loading cell.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Benign and malignant are the two diagnoses recorded in the data. Let's see what they mean.

**Benign** Diagnosis: A benign tumor means the tumor can grow but will not spread. Some types of cancer do not form a tumor.

**Malignant** Diagnosis: A cancerous tumor is malignant, meaning it can grow and spread to other parts of the body.



In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly import tools
from plotly.subplots import make_subplots

In [None]:
# Load the raw CSV into a DataFrame and preview the first rows.
data_path = '/kaggle/input/breast-cancer-wisconsin-data/data.csv'
df = pd.read_csv(data_path)
df.head()

In [None]:
# Dimensions of the loaded table as (rows, columns).
df.shape

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Remove the columns that are not used as features: `id` and `Unnamed: 32`.
# NOTE(review): `Unnamed: 32` is presumably an empty artifact column of the CSV
# -- confirm against the df.isna().sum() output above.
#
# `columns=` is used because pandas >= 2.0 no longer accepts the axis as a
# positional argument. Rebinding `df` (instead of inplace=True) together with
# errors='ignore' keeps the cell safe to re-run after the columns are gone.
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

In [None]:
# Bar chart of the target classes: bar height is the number of rows per
# diagnosis, the label above each bar is that class's share in percent.
diagnosis_counts = df['diagnosis'].value_counts()
diagnosis_share = diagnosis_counts / len(df['diagnosis']) * 100

fig = px.bar(x=diagnosis_counts.index, y=diagnosis_counts, text=diagnosis_share)
fig.update_traces(
    marker_color=['crimson', 'maroon'],
    textposition='outside',
    texttemplate='%{text:.4s}%',
)
fig.update_layout(height=500, width=600, title='Diagnosis Distribution')
fig.show()

The data is slightly imbalanced: **benign** diagnoses make up 62.7% of the rows and **malignant** diagnoses 37.3%.

In [None]:
# Univariate distribution of every feature column (everything except the
# `diagnosis` target): a density-normalised histogram with a KDE overlay.
#
# sns.histplot(kde=True, stat='density') is the replacement for the deprecated
# sns.distplot. The y-axis is a density, not a count, so it is labelled as
# such. Each figure is created explicitly, shown after it has been drawn, and
# then closed so the loop does not accumulate open figures.
feature_cols = df.columns.drop('diagnosis')
for col in feature_cols:
    fig, ax = plt.subplots()
    sns.histplot(df[col], kde=True, stat='density', ax=ax)
    ax.set_xlabel(col, fontsize=12)
    ax.set_ylabel('Density', fontsize=13)
    fig.tight_layout()
    plt.show()
    plt.close(fig)

The feature distributions are skewed.

# Bivariate Analysis

In [None]:
# One interactive histogram per column after the first, with the bars coloured
# by diagnosis so the two classes can be compared feature by feature.
for feature in df.columns[1:]:
    fig = px.histogram(
        df,
        x=feature,
        color='diagnosis',
        nbins=100,
        height=500,
        width=700,
        title=f'{feature} Distribution',
    )
    fig.show()

There is a lot of overlap between the two diagnoses.

In [None]:
# Encoding Target variable
# sort=True assigns the integer codes in sorted label order instead of order of
# first appearance, so the encoding no longer depends on which diagnosis
# happens to be in the first row, and re-running the cell leaves an already
# encoded 0/1 column unchanged.
# NOTE(review): with alphabetically sorted labels the benign label should map
# to 0 and the malignant label to 1 -- confirm against the value counts above.
df['diagnosis'] = pd.factorize(df['diagnosis'], sort=True)[0]


**Looking for Correlation**

In [None]:
# Pairwise correlation matrix, with rows and columns re-ordered by the absolute
# correlation of each column with `diagnosis` (strongest first).
corr = df.corr()
target_strength = corr['diagnosis'].abs().sort_values(ascending=False)
correlated = target_strength.index
corr_sorted = corr.loc[correlated, correlated]
corr_sorted.head()

In [None]:
# Keep only the columns whose correlation with the first column of the sorted
# matrix has an absolute value of at least 0.7, then restrict the matrix to
# those rows and columns.
target_corr = corr_sorted.iloc[:, 0]
most_corr = target_corr[target_corr.abs() >= 0.7].index

corr_sorted = corr_sorted.loc[most_corr, most_corr]

corr_sorted

In [None]:
# Annotated heatmap of the filtered correlation matrix on a diverging palette.
cmap = sns.diverging_palette(20, 220, n=200)
fig, ax = plt.subplots(figsize=(15, 7))

sns.heatmap(
    corr_sorted,
    annot=True,
    cmap=cmap,
    ax=ax,
)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Separate the feature matrix from the target column.
# `columns=` replaces the positional axis argument, which pandas >= 2.0 rejects.
x = df.drop(columns='diagnosis')
y = df['diagnosis']

# Sanity check: both must have the same number of rows.
print(x.shape)
print(y.shape)

**Scaling Data**

In [None]:
sc = StandardScaler()
x = sc.fit_transform(x)

**Applying PCA**

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
x = pca.fit_transform(x)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x,y, random_state=10, test_size=0.2)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Fit a logistic-regression classifier with scikit-learn's default settings
# on the training split.
logr = LogisticRegression()

logr.fit(X=x_train, y=y_train)

In [None]:
# Predicted class labels for the held-out test rows.
y_pred = logr.predict(x_test)

In [None]:
# Fraction of test rows whose predicted label matches the true label.
test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy Score Of Logistic Regression Model is: ', test_accuracy)

In [None]:
# Confusion matrix on the test set. scikit-learn's signature is
# confusion_matrix(y_true, y_pred): rows are the true classes and columns the
# predicted classes. Passing the arguments the other way round transposes the
# matrix and swaps false positives with false negatives.
con = confusion_matrix(y_test, y_pred)
con