In [None]:
import numpy as np
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

from scipy.stats import zscore
%matplotlib inline

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import os

# Problem Statement

Breast cancer is one of the most prevalent forms of cancer affecting women worldwide. Early and accurate detection plays a critical role in improving patient outcomes and survival rates. However, conventional diagnostic methods have limitations, and there is a need for more effective and efficient approaches. This notebook aims to develop a machine learning model that can accurately classify breast cancer cases as malignant or benign based on the measured features of each sample.

# Data Preparation and Processing

In [None]:
# Load the raw dataset from a path relative to the notebook's working directory.
df = pd.read_csv("Datasets/breast_cancer.csv")

In [None]:
# Preview the first ten rows.
df.head(10)

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Column dtypes as inferred by pandas on load.
df.dtypes

**From an initial look at the data, all the columns appear to have the correct data types. Next, we will check for duplicate rows, missing values and unexpected values.**

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

The column "**Unnamed: 32**" will be dropped in later steps.

In [None]:
# Summary statistics, transposed so that each original column becomes a row.
df.describe().T

In [None]:
# Remove the identifier column and the "Unnamed: 32" column, neither of which is used as a feature.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps this cell
# idempotent: running it a second time no longer raises KeyError for the already-dropped columns.
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

**We now have a clean dataset. Next, we will do univariate and bivariate analysis.**

# Data Analysis

**Univariate Analysis**

In [None]:
diag_count = df['diagnosis'].value_counts()

print(diag_count)

In [None]:
sns.countplot(data = df, x = 'diagnosis')

**There are more benign cases than malignant ones, so the classes are somewhat imbalanced. We will balance them at a later stage.**

In [None]:
# List the column labels of the frame; the plots below pick feature names from this list.
df.columns

In [None]:
# Mean-vs-worst scatter for each size-related feature, coloured by diagnosis.
# A single loop replaces four copy-pasted cells; each figure gets its own axes and a
# title so it can be read on its own.
for feature in ('radius', 'texture', 'perimeter', 'area'):
    fig, ax = plt.subplots()
    sns.scatterplot(data=df, x=f'{feature}_mean', y=f'{feature}_worst', hue='diagnosis', ax=ax)
    ax.set_title(f'{feature}: mean vs worst')
    plt.show()

The remaining mean features, each plotted below against its "worst" counterpart: 'smoothness_mean', 'compactness_mean', 'concavity_mean',
'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean'.

In [None]:
# Mean-vs-worst scatter for each shape-related feature, coloured by diagnosis.
# A single loop replaces six copy-pasted cells; each figure gets its own axes and a
# title so it can be read on its own.
for feature in (
    'smoothness',
    'compactness',
    'concavity',
    'concave points',
    'symmetry',
    'fractal_dimension',
):
    fig, ax = plt.subplots()
    sns.scatterplot(data=df, x=f'{feature}_mean', y=f'{feature}_worst', hue='diagnosis', ax=ax)
    ax.set_title(f'{feature}: mean vs worst')
    plt.show()

**Inferences**

* For Smaller radius - Benign.
* For smaller texture - Benign but some are benign with larger texture as well.
* For small Perimeter - Benign
* For small Area - Benign
* In smoothness, Malignant and Benign are almost even spread out.
* Benign is less compact.
* Less concavity and fewer concave points - Benign
* Based on symmetry and fractal dimension - both diagnoses are evenly spread out.


# Basic Modelling

In [None]:
# Feature matrix: every column except the label.
X = df.drop(columns=['diagnosis'])
# Target as a 1-D Series rather than a one-column DataFrame, so scikit-learn estimators
# receive the shape they expect and do not emit a column-vector DataConversionWarning.
y = df['diagnosis']

In [None]:
# Class frequencies of the target before any resampling.
diagnosis_count = y.value_counts()

print(diagnosis_count)

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer

smo = SMOTE()
X_balanced, Y_balanced = smo.fit_resample(X, y)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(X_balanced, Y_balanced, test_size = 0.3, random_state = 1)

In [None]:
# Class frequencies of the resampled target returned by SMOTE.
diagnosis_count_bal = Y_balanced.value_counts()

print(diagnosis_count_bal)

In [None]:
# Estimator classes (not instances); each one is instantiated with its defaults when evaluated.
algo = [LogisticRegression, KNeighborsClassifier]

In [None]:
def algo_(x_train, x_test, y_train, y_test, algo):
    """Fit each estimator class with default settings and tabulate its scores.

    Parameters
    ----------
    x_train, x_test : array-like
        Training and held-out feature matrices.
    y_train, y_test : array-like
        Matching targets. A single-column 2-D target (e.g. a one-column
        DataFrame) is flattened to 1-D before being passed to the estimator.
    algo : iterable of classes
        Estimator classes exposing ``fit(X, y)`` and ``score(X, y)``; each is
        instantiated with no arguments.

    Returns
    -------
    pandas.DataFrame
        One row per estimator with columns ``'Algorithm'`` (the class name),
        ``'In Sample Score'`` and ``'Out of Sample Score'``.
    """

    def _as_1d(target):
        # Only collapse column vectors; genuine multi-output targets pass through untouched.
        values = np.asarray(target)
        if values.ndim == 2 and values.shape[1] == 1:
            return values.ravel()
        return target

    y_train_1d = _as_1d(y_train)
    y_test_1d = _as_1d(y_test)

    scores = []
    for alg in algo:
        model = alg()
        model.fit(x_train, y_train_1d)
        scores.append(
            (
                alg.__name__,
                model.score(x_train, y_train_1d),
                model.score(x_test, y_test_1d),
            )
        )

    return pd.DataFrame(scores, columns=['Algorithm', 'In Sample Score', 'Out of Sample Score'])
        
    

In [None]:
# Evaluate every estimator in `algo` on the current train/test split and show the score table.
result_df = algo_(x_train, x_test, y_train, y_test, algo)
print(result_df)

The out-sample score for Logistic Regression is around 95%. Now, we will try to do PCA

**Since, there are many columns, we will do PCA to reduce columns and extract features**

# Principal Component Analysis

In [None]:
df_pca = df.copy()

In [None]:
X = df_pca.drop(['diagnosis'], axis = 1)
y = df_pca[['diagnosis']]

In [None]:
# Standardise each feature column with scipy's zscore (applied column by column).
XScaled = X.apply(zscore)
XScaled.head()

In [None]:
# Covariance matrix of the standardised features; rowvar=False treats columns as variables.
covMatrix = np.cov(XScaled, rowvar = False)
print(covMatrix)

In [None]:
# Fit a PCA that keeps every available component so the full variance spectrum can be inspected.
# The count is derived from the data instead of being hard-coded to 30, so the cell keeps
# working if the number of feature columns changes.
pca = PCA(n_components = min(XScaled.shape))
pca.fit(XScaled)

In [None]:
# Share of total variance explained by each principal component.
# Bar positions are derived from the fitted PCA rather than a hard-coded range(1, 31),
# and the x-axis is labelled as what it actually is: the component index.
component_index = np.arange(1, len(pca.explained_variance_ratio_) + 1)
plt.bar(component_index, pca.explained_variance_ratio_, alpha = 0.5, align = 'center')
plt.ylabel('Var Explained')
plt.xlabel('Principal component')
plt.show()

In [None]:
# Cumulative share of variance explained as components are added.
# Step positions are derived from the fitted PCA rather than a hard-coded range(1, 31),
# and the x-axis is labelled as what it actually is: the component index.
component_index = np.arange(1, len(pca.explained_variance_ratio_) + 1)
plt.step(component_index, np.cumsum(pca.explained_variance_ratio_), where = 'mid')
plt.ylabel('Cumulative of Var Explained')
plt.xlabel('Principal component')
plt.show()

In [None]:
# Refit PCA keeping 7 components (note: the variable is named pca3 but holds 7 components).
# NOTE(review): 7 was presumably read off the cumulative-variance plot — confirm the threshold used.
pca3 = PCA(n_components = 7)
pca3.fit(XScaled)

# Component loadings (one row per component) and the variance captured by each component.
print(pca3.components_)
print(pca3.explained_variance_)
# Project the standardised features onto the retained components.
Xpca3 = pca3.transform(XScaled)

In [None]:
# Pairwise scatter matrix of the projected components.
sns.pairplot(pd.DataFrame(Xpca3))

# MODELLING AFTER PCA

In [None]:
smo = SMOTE()
X_balanced, Y_balanced = smo.fit_resample(Xpca3, y)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(X_balanced, Y_balanced, test_size = 0.3, random_state = 1)

In [None]:
# Default logistic regression on the PCA-projected data; keep its predictions and the
# score on each split for the report cell below.
model_pca = LogisticRegression()
model_pca.fit(x_train, y_train)
y_predict = model_pca.predict(x_test)
train_score, test_score = (
    model_pca.score(features, labels)
    for features, labels in ((x_train, y_train), (x_test, y_test))
)

In [None]:
# Report the logistic-regression scores, separated by a two-line rule.
divider = '_________' * 7
print("LOGISTIC REGRESSION")
print("In Sample score is : ", train_score)
print(divider, divider, sep='\n')
print("Out-sample score is : ", test_score)

In [None]:
# Default k-nearest-neighbours classifier on the PCA-projected data; keep its predictions
# and the score on each split for the report cell below.
model_pca_kn = KNeighborsClassifier()
model_pca_kn.fit(x_train, y_train)
y_predict = model_pca_kn.predict(x_test)
split_scores = [
    model_pca_kn.score(features, labels)
    for features, labels in ((x_train, y_train), (x_test, y_test))
]
train_score, test_score = split_scores

In [None]:
# Report the k-nearest-neighbours scores, separated by a two-line rule.
divider = '_________' * 7
print('KNearest Neighbours')
print("In Sample score is : ", train_score)
for _ in range(2):
    print(divider)
print("Out-sample score is : ", test_score)

# Final

**Logistic Regression without PCA**
* Score on Training Data = 94%
* Score on Test Data = 95%

**KNeighbours Classifier without PCA**
* Score on Training Data = 93%
* Score on Test Data = 94%

**Logistic Regression with PCA**
* Score on Training Data = 97%
* Score on Test Data = 98%

**KNeighbours Classifier with PCA**
* Score on Training Data = 98%
* Score on Test Data = 96%