In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline
# Seaborn theme first, then the matplotlib defaults shared by every figure below.
sns.set_style('darkgrid')
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    'figure.facecolor': '#00000000',  # RGBA hex with zero alpha
})

In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

In [None]:
data = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')
data

In [None]:
data.info()

In [None]:
px.scatter(data, x = 'radius_mean', y ='texture_mean', color = 'diagnosis')

In [None]:
px.scatter(data, x = 'radius_mean', y ='compactness_mean', color = 'diagnosis')

In [None]:
px.scatter(data, x = 'radius_mean', y ='perimeter_mean', color = 'diagnosis')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Two-stage split with a fixed seed for reproducibility:
#   1) hold out 20% of all rows as the test set;
#   2) take 25% of the remaining 80% (i.e. 20% of all rows) as the validation set.
# The result is roughly a 60/20/20 train/validation/test partition.
train_val_df, test_df = train_test_split(data, test_size = 0.2, random_state = 42)
train_df, val_df = train_test_split(train_val_df, test_size = 0.25, random_state = 42)

In [None]:
# Report (rows, columns) of each split.
for label, frame in (
    ('train_df.shape:', train_df),
    ('val_df.shape:', val_df),
    ('test_df.shape:', test_df),
):
    print(label, frame.shape)

In [None]:
input_cols = list(train_df.columns)[2:-1]
print(input_cols)

In [None]:
target_col = 'diagnosis'
print(target_col)

In [None]:
train_inputs = train_df[input_cols].copy()
train_targets = train_df[target_col].copy()

In [None]:
val_inputs = val_df[input_cols].copy()
val_targets = val_df[target_col].copy()

In [None]:
test_inputs = test_df[input_cols].copy()
test_targets = test_df[target_col].copy()

In [None]:
# Partition the feature columns by dtype: numeric versus object (string-like).
numeric_cols = list(train_inputs.select_dtypes(include=np.number).columns)
categoric_cols = list(train_inputs.select_dtypes(include='object').columns)

In [None]:
# Names of the numeric feature columns.
print(numeric_cols)

In [None]:
# Names of the object-dtype feature columns.
print(categoric_cols)

In [None]:
# Summary statistics of the numeric training features before scaling.
train_inputs[numeric_cols].describe()

In [None]:
# Missing-value count per numeric column, computed over the full dataset.
data[numeric_cols].isna().sum()

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler()

In [None]:
scaler.fit(data[numeric_cols])

In [None]:
# Per-feature minimum recorded by the scaler during fit (same order as numeric_cols).
print('minimum:')
list(scaler.data_min_)

In [None]:
# Per-feature maximum recorded by the scaler during fit (same order as numeric_cols).
print('maximum:')
list(scaler.data_max_)

In [None]:
train_inputs[numeric_cols] = scaler.transform(train_inputs[numeric_cols])
val_inputs[numeric_cols] = scaler.transform(val_inputs[numeric_cols])
test_inputs[numeric_cols] = scaler.transform(test_inputs[numeric_cols])

In [None]:
train_inputs[numeric_cols].describe()

In [None]:
!pip install pyarrow

In [None]:
train_inputs.to_parquet('train_inputs.parquet')
val_inputs.to_parquet('val_inputs.parquet')
test_inputs.to_parquet('test_inputs.parquet')

In [None]:
%%time
pd.DataFrame(train_targets).to_parquet('train_targets.parquet')
pd.DataFrame(val_targets).to_parquet('val_targets.parquet')
pd.DataFrame(test_targets).to_parquet('test_targets.parquet')

In [None]:
# Reload the feature frames written above, in train / val / test order.
train_inputs, val_inputs, test_inputs = (
    pd.read_parquet(path)
    for path in ('train_inputs.parquet', 'val_inputs.parquet', 'test_inputs.parquet')
)

# Reload the targets and pull the label column back out as a Series.
train_targets, val_targets, test_targets = (
    pd.read_parquet(path)[target_col]
    for path in ('train_targets.parquet', 'val_targets.parquet', 'test_targets.parquet')
)

In [None]:
# Check that the reloaded inputs and targets line up split by split.
for label, obj in (
    ('train_inputs:', train_inputs),
    ('train_targets:', train_targets),
    ('val_inputs:', val_inputs),
    ('val_targets:', val_targets),
    ('test_inputs:', test_inputs),
    ('test_targets:', test_targets),
):
    print(label, obj.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression classifier using the liblinear solver.
model = LogisticRegression(solver = 'liblinear')

In [None]:
%%time
# Train on the training split only, using the scaled numeric feature columns.
# (%%time must stay on the first line of this cell.)
model.fit(train_inputs[numeric_cols], train_targets)

In [None]:
# Learned weights as nested lists; the model was fitted on numeric_cols, so the
# weights follow that column order.
print(model.coef_.tolist())

In [None]:
# Feature names, printed to read alongside the weights above.
print(numeric_cols)

In [None]:
# Learned bias term.
print(model.intercept_)

In [None]:
# Model-ready feature matrices: the numeric columns of each split.
X_train, X_val, X_test = (
    split_inputs[numeric_cols]
    for split_inputs in (train_inputs, val_inputs, test_inputs)
)

In [None]:
# Predicted labels for the training features.
train_preds = model.predict(X_train)

In [None]:
# Display the predictions...
train_preds

In [None]:
# ...next to the true training labels for a visual comparison.
train_targets

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of training rows whose predicted label equals the true label.
accuracy_score(train_targets, train_preds)

In [None]:
from sklearn.metrics import confusion_matrix


In [None]:
# Rows are true classes, columns are predicted classes; normalize='true'
# scales each row to sum to 1.
confusion_matrix(train_targets, train_preds, normalize='true')

In [None]:
def predict_and_plot(inputs, targets, name =''):
    """Evaluate the global ``model`` on one data split and plot its confusion matrix.

    Prints the accuracy as a percentage with two decimals and draws a heatmap
    of the row-normalized confusion matrix on a new figure.

    Args:
        inputs: Feature matrix passed to ``model.predict``.
        targets: True labels aligned with ``inputs``.
        name: Optional split label (e.g. 'Training') shown in the figure title.

    Returns:
        The predictions produced by ``model.predict(inputs)``.
    """
    preds = model.predict(inputs)
    accuracy = accuracy_score(targets, preds)
    # '.2f' requests two decimal places; the former '2f' was only a minimum
    # field width and printed six decimals.
    print('Accuracy:{:.2f}%'.format(accuracy * 100))
    # normalize='true' scales each row (true class) to sum to 1.
    cf = confusion_matrix(targets, preds, normalize='true')
    plt.figure()
    sns.heatmap(cf, annot=True)
    plt.xlabel('Prediction')
    plt.ylabel('Target')
    # strip() avoids a leading space when no split name is given.
    plt.title('{} Confusion Matrix'.format(name).strip())
    # Return the predictions so callers assigning the result get them, not None.
    return preds

In [None]:
train_preds = predict_and_plot(X_train, train_targets, name = 'Training')

In [None]:
val_preds = predict_and_plot(X_val, val_targets, name = 'Validation')

In [None]:
test_preds = predict_and_plot(X_test, test_targets, name ='Test')