# Predicting Breast Cancer Using Logistic Regression

In [None]:
#Importing 
import numpy as np  
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,accuracy_score, confusion_matrix


## Reading the dataset

In [None]:
# Load the dataset from data.csv and preview its first five rows
# (head() returns five rows by default).
df = pd.read_csv('data.csv')
df.head()

In [None]:
# Show the dataset dimensions as a (rows, columns) tuple.
print(df.shape)

### Columns/Variables in the dataset

In [None]:
# List the column labels of the loaded DataFrame.
df.columns

The above dataset has 33 columns.

In [None]:
# Summarize each column: dtype and non-null count (used to spot empty columns).
df.info()

#### Column 32 (`Unnamed: 32`) contains only NULL values and can be removed

In [None]:
# Remove the all-NULL "Unnamed: 32" column in place, then preview the result.
# The column is dropped by its real name (no intermediate rename), so a
# missing column raises a KeyError that names the actual label.
df.drop(columns=["Unnamed: 32"], inplace=True)
df.head(3)

### The target variable `diagnosis` is a categorical variable (M: Malignant, B: Benign). It is changed to M:1, B:0

In [None]:
# Encode the target as the strings '1' (for 'M') and '0' (for any other value),
# then make the `id` column the row index and preview the result.
df['diagnosis'] = ['1' if label == 'M' else '0' for label in df['diagnosis']]
df = df.set_index('id')
df.head(3)

After removing the `Unnamed: 32` column and making the `id` column the index label, the dataset has 31 columns/variables (including the target variable).

In [None]:
# Number of columns remaining (second entry of the shape tuple).
print(df.shape[1])

## Number of benign and malignant observations

In [None]:
# Number of Benign and Malignant observations.
# Each class is looked up by its encoded label ('0' = benign, '1' = malignant)
# instead of unpacking value_counts() positionally: value_counts() orders its
# result by frequency, so positional unpacking would silently swap the two
# numbers whenever malignant cases outnumber benign ones.
class_counts = df['diagnosis'].value_counts()
benign = class_counts['0']
malignant = class_counts['1']
print("Number of Benign patients", benign)
print("Number of Malignant patients", malignant)

# Bar chart of the class distribution.
plt.figure(figsize=(8, 4))
# The column is passed through the explicit x= keyword: seaborn's first
# positional parameter is `data`, so a positional Series is not treated as
# the x variable in current releases.
sns.countplot(x=df['diagnosis'])

In [None]:
# Feature matrix: every column except the target, as a NumPy array.
X = df.drop(columns=['diagnosis']).values
# Target vector: the encoded diagnosis labels, as a NumPy array.
y = df['diagnosis'].values

## Splitting data into train and test sets

In [None]:
# Hold out 20% of the observations for testing; the fixed random_state makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=20,
)
# Report the shape of each partition, one per line.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')


## Data Normalization/ Feature Scaling

In [None]:
# Standardization: rescale each feature to zero mean and unit variance.
sc = StandardScaler()
# Learn the per-feature mean/std from the training data only, then scale it.
X_train_sc = sc.fit_transform(X_train)
# Apply the statistics learned from the training set to the test set.
# Re-fitting the scaler here (fit_transform) would scale the test data by its
# own mean/std, leaking test-set information and putting the two sets on
# inconsistent scales relative to what the model was trained on.
X_test_sc = sc.transform(X_test)
print(X_train_sc)

## Building Logistic Regression

In [None]:
# Train a logistic regression classifier on the scaled training features.
# C=0.3 is the inverse regularization strength (smaller means stronger
# regularization than the default of 1.0).
model = LogisticRegression(C=0.3)
model.fit(X=X_train_sc, y=y_train)


In [None]:
# Inspect the first scaled test observation (one row of feature values).
X_test_sc[0]

## Predicting

In [None]:
# Predict labels for the scaled test set and measure the fraction that match
# the true labels.
y_pred_lr = model.predict(X_test_sc)
accuracy_lr = accuracy_score(y_true=y_test, y_pred=y_pred_lr)
print("Accuracy on Test Data:", accuracy_lr)

## Confusion matrix

In [None]:
# Confusion matrix as a labelled table (rows: actual class, columns: predicted
# class). The matrix is computed once; the label order is pinned explicitly
# ('0' = Benign, '1' = Malignant) so the result is always 2x2 and always lines
# up with the row/column names below, even if a class is absent from the
# test predictions.
lr_cm = confusion_matrix(y_test, y_pred_lr, labels=['0', '1'])
lr_cm = pd.DataFrame(lr_cm, columns=['Benign', 'Malignant'], index=['Benign', 'Malignant'])
lr_cm

## Classification report

In [None]:
# Print per-class precision, recall, F1-score and support for the test predictions.
print(classification_report(y_test, y_pred_lr))

In [None]:
import pickle

# Persist the fitted classifier to ./model.pkl in binary pickle format.
# NOTE(review): only the model is saved. It was fitted on standardized
# features, so whatever loads model.pkl presumably has to apply the same
# scaling to its inputs; confirm how the consumer handles this.
with open('./model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)