In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sk


In [None]:
# Load the breast-cancer CSV into `df`; the path is relative to the notebook's working directory.
df=pd.read_csv('../input/breast-cancer-wisconsin-data/data.csv')

In [None]:
# `df.info` without parentheses only references the bound method and shows nothing
# when it is not the last expression; call it so the column names, non-null counts
# and dtypes are actually printed.
df.info()

# Preview the first rows (last expression of the cell, so it is rendered as output).
df.head()

In [None]:
# Drop the columns that are not needed for the analysis: 'id' and 'Unnamed: 32'.
# A single drop call replaces the per-column loop, and errors='ignore' keeps the
# cell idempotent: re-running it after the columns are gone no longer raises KeyError.
drop=['id','Unnamed: 32']
df=df.drop(columns=drop, errors='ignore')

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

In [None]:
# (rows, columns) of the frame after the unwanted columns were dropped.
df.shape

##### We have a total of 569 rows & 31 columns to be analysed.
#### Our target variable is diagnosis: M represents Malignant & B represents Benign.


In [None]:
# `diagnosis` is the only object-typed column. Encode it numerically:
# 0 = Benign ('B'), 1 = Malignant ('M').

category={'B':0,'M':1}
# Values that are not keys of `category` (e.g. the already-encoded 0/1 when this cell
# is re-run) are passed through unchanged; a plain .map(category) would silently
# turn them into NaN on a second run.
df['diagnosis']=df['diagnosis'].map(lambda label: category.get(label, label))

In [None]:
# Preview the frame again to check the encoded `diagnosis` column.
df.head()

In [None]:
# dtype of the target column after the B/M -> 0/1 mapping.
df['diagnosis'].dtype

In [None]:
# Number of missing values per column.
df.isnull().sum() #author's observation: there are no null values - confirm against the output

In [None]:
# dtype of every column.
df.dtypes

### Univariate Analysis

In [None]:
# Class balance of the target: number of rows per diagnosis value.
df['diagnosis'].value_counts()

In [None]:
# Bar chart of how many rows fall into each diagnosis class.
diagnosis_counts = df['diagnosis'].value_counts()
plt.figure(figsize=(10, 8))
diagnosis_counts.plot(kind='bar', colormap='Blues_r')
plt.title("Plotting the diagnosis")
plt.show()

In [None]:
# Keep the column index in `col` and display the names as a plain Python list.
col = df.columns
list(col)

In [None]:
# Feature columns = every column except the target.
# Derived from df.columns instead of from `col` itself so the cell can be re-run:
# `col = col.drop('diagnosis')` raises KeyError the second time it is executed.
col=df.columns.drop('diagnosis')

In [None]:
# Show the remaining feature column names.
col

In [None]:
# Histogram + KDE of every feature on a 6x5 grid; the legend of each panel shows
# the skewness of that feature.
# - sns.distplot is deprecated; sns.histplot(kde=True, stat='density') is its
#   axes-level replacement and keeps the density-normalised y-axis.
# - The grid is created once with plt.subplots and tight_layout is applied once at
#   the end, instead of before any axes exist and again on every iteration.
fig, axes = plt.subplots(6, 5, figsize=(20, 20))
panels = axes.ravel()
for ax, i in zip(panels, col):
    sns.histplot(df[i], kde=True, stat='density', ax=ax,
                 label='Skewness:{:.2f}'.format(df[i].skew()))
    ax.legend(loc='best')
    ax.set_title(f"Plotting the {i}")
# Hide panels that did not receive a feature (only relevant if there are < 30 features).
for ax in panels[len(col):]:
    ax.set_visible(False)
fig.tight_layout()
plt.show()

#### Most of the distributions are right-skewed (positive skewness), whereas we would prefer approximately normal distributions; we will look into that later. First, let's look at the correlation between the variables, and specifically with the target variable.

### Bivariate Analysis

In [None]:
# Class-conditional KDE of every feature, one figure per feature.
# sns.displot is figure-level and always creates its own figure, so the plt.figure(...)
# call that used to precede the loop only produced an empty, unused figure and has
# been removed; the unused enumerate index is gone as well.
for i in col:
    g=sns.displot(data=df, x=i, hue="diagnosis", kind="kde")
    # displot's figure is the current figure here, so the pyplot calls apply to it.
    plt.suptitle(f"Plotting the {i}")
    plt.tight_layout()

In [None]:
# Pairwise correlation matrix of all columns (target included), drawn as an annotated heatmap.
df_corr = df.corr()
fig, ax = plt.subplots(figsize=(18, 12))
sns.heatmap(df_corr, annot=True, ax=ax)

In [None]:
#correlation with target variable
# The target's self-correlation is removed by label. Slicing off the last sorted
# entry ([:-1]) only removes the right row as long as 'diagnosis' is the unique maximum.
target_corr = df_corr['diagnosis'].drop('diagnosis').sort_values(ascending=True)
plt.figure(figsize=(12,8))
target_corr.plot(kind="barh")
plt.title("Correlation with target variable")
plt.xlabel("Correlation")
plt.tight_layout()
plt.show()

We see multicollinearity in the heatmap, so we will examine the highly correlated variables.


If we observe, radius_worst, perimeter_worst & area_worst have a correlation of about 0.9 with radius_mean. Similarly, if we look at the correlation of the **_worst** features with that of the **_mean** & **_se** features, the correlation is quite high (between 0.7 and 0.95), which indicates that the "worst" columns carry largely the same information as the "mean" columns.


Multicollinearity undermines the significance of the independent variables, so it is important to treat it before we build the model.


We will use PCA to handle the multicollinearity problem. **Principal Component Analysis (PCA)** is a common feature extraction technique in data science that employs matrix factorization to project the data into a lower-dimensional space.

### Dealing with Multicollinearity & feature selection Using PCA

The **PCA** technique is particularly useful for processing data where multicollinearity exists between the features/variables. PCA can also be used when the dimensionality of the input features is high.



In [None]:
#before we begin the PCA, we need to scale the data.
#split the dataset first, so that scaling can be fitted on the training part only
X=df.drop("diagnosis",axis=1)
y=df["diagnosis"]
from sklearn.model_selection import train_test_split
# random_state makes the split (and every score derived from it) reproducible across
# runs; stratify=y keeps the benign/malignant ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

We have to standardize the data before implementing PCA. This is absolutely necessary because PCA calculates a new projection of our data on a new axis using the standard deviation of our data. PCA gives more weight to variables that have higher variances than variables with low variances, so it is important to normalize the data on the same scale to get a reasonable covariance.

In [None]:
#performing scaling
from sklearn.preprocessing import StandardScaler
std=StandardScaler()
# Learn the per-feature mean/variance from the training data only ...
X_train_std=std.fit_transform(X_train)
# ... and apply those same statistics to the test data. Calling fit_transform here
# would re-fit the scaler on the test set, so train and test would be scaled
# differently and test-set statistics would leak into preprocessing.
X_test_std=std.transform(X_test)

In [None]:
#import PCA module
from sklearn.decomposition import PCA

# Fit a PCA that keeps as many components as there are features, so the whole
# variance spectrum can be inspected before choosing how many to keep.
pca=PCA(n_components=X_train_std.shape[1])
pca_data=pca.fit_transform(X_train_std)

# Fraction of the total variance carried by each component, and its running total.
percent_var_explained = pca.explained_variance_/(np.sum(pca.explained_variance_))
cumm_var_explained = np.cumsum(percent_var_explained)

# Plot against 1..n so the x-axis really reads "number of components" (plotting the
# array alone starts at 0, shifting every point by one). The values are fractions
# in [0, 1], not percentages, so the y-label says so.
component_counts = np.arange(1, len(cumm_var_explained) + 1)
plt.plot(component_counts, cumm_var_explained)
plt.grid()
plt.xlabel("n_components")
plt.ylabel("cumulative fraction of variance explained")
plt.show()

In [None]:
# Cumulative fraction of variance explained by the first k components (k = 1..n).
cumm_var_explained

In [None]:
# Variance captured by each individual principal component of the fitted PCA.
pca.explained_variance_

The cumulative share of variance captured by the 1st principal component is 0.43, by the first two is 0.63, by the first three is 0.72 and by the first 17 is 0.9916.

For the individual variance captured: the 1st principal component captures 1.303e+01, the 2nd 6.14522007e+00, the 3rd 2.72676520e+00, and the last 1.31517117e-04 (components are ordered by decreasing variance).

***Since 99.16% of the total variance is captured by the first 17 principal components, we keep only 17 PCA components and compute a correlation heatmap to observe the multicollinearity.***

In [None]:
# Total of the explained-variance ratios of the currently fitted `pca`.
sum(pca.explained_variance_ratio_)

In [None]:
# Re-create the PCA with 17 components (the count chosen from the cumulative explained variance).
pca = PCA(n_components=17)
# Fit on the standardized training data and project it ...
pca_train_data = pca.fit_transform(X_train_std)
# ... then project the test data with the already-fitted components (no re-fit).
pca_test_data = pca.transform(X_test_std)

In [None]:
#correlation after PCA

df_train_pca = pd.DataFrame(pca_train_data)
# y_train still carries the (shuffled) row labels of `df`, whereas df_train_pca has a
# fresh 0..n-1 index. Assigning the Series directly aligns on those labels, which
# pairs components with the wrong diagnosis and leaves NaN where no label matches.
# Assign the values positionally instead.
df_train_pca["diagnosis"] = y_train.to_numpy()

corr = df_train_pca.corr()
plt.figure(figsize=(18,12))

sns.heatmap(corr, annot = True, vmin=-1, vmax=1, cmap="YlGnBu", linewidths=.5)
# The `b=` keyword of plt.grid was renamed to `visible` and later removed from
# Matplotlib; passing the flag positionally works on both old and new versions.
plt.grid(True, color='#f68c1f', alpha=0.1)
plt.show()



Hence, by reducing the dimensionality of the data using PCA, approximately 99% of the variance is preserved and the multicollinearity of the data is removed.

In [None]:
def pca_dec(data, n):
  """Fit a PCA on `data` and return the projected data with the fitted model.

  Parameters
  ----------
  data : array-like handed to ``PCA.fit_transform``.
  n : value passed to ``PCA`` as ``n_components``.

  Returns
  -------
  tuple
      ``(projected_data, fitted_pca)``.
  """
  reducer = PCA(n_components=n)
  projected = reducer.fit_transform(data)
  return projected, reducer

#Decomposing the train set:
pca_train_results, pca_train = pca_dec(X_train_std, 17)

#Projecting the test set with the PCA fitted on the train set.
# Fitting a second, independent PCA on the test data yields components that do not
# correspond to the training components and uses test-set statistics for preprocessing.
pca_test_results = pca_train.transform(X_test_std)
# Kept so the name stays defined for any later use; it is the train-fitted PCA.
pca_test = pca_train

#Creating a table with the explained variance ratio
# The number of labels follows the fitted model instead of a hard-coded range(1, 18).
names_pcas = [f"PCA Component {i}" for i in range(1, pca_train.n_components_ + 1)]
scree = pd.DataFrame({"Component": names_pcas, "Explained Variance Ratio": pca_train.explained_variance_ratio_})
# Bare last expression -> rich table rendering instead of print()'s plain text.
scree

In [None]:
#Sorting the values of the first principal component by how large each one is
df1 = pd.DataFrame({'PCA':pca_train.components_[0], 'Variable Names':list(X_train.columns)})
df1 = df1.sort_values('PCA', ascending=False)

#Sorting the absolute values of the first principal component by magnitude
# .assign builds a new frame, so df1 keeps its signed loadings. Wrapping df1 in
# pd.DataFrame(...) and then overwriting the column may share data with df1,
# depending on the pandas version.
df2 = df1.assign(PCA=df1['PCA'].abs()).sort_values('PCA', ascending=False)

df2.head(17)

Now that we know which features have the most impact, we move on to building the model.

We will compare the accuracy of the model on the raw (standardized) training data and on the PCA-transformed data, to understand whether PCA has helped to increase the accuracy.


### 1. XGBoost Algorithm

#### a. original Dataframe

In [None]:
import xgboost as xgb
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score

In [None]:
# XGBoost classifier with library-default hyperparameters; random_state fixes the model's seed.
xgb_classifier=xgb.XGBClassifier(random_state=42)

In [None]:
# Fit the model on the standardized training split of the original (non-PCA) features.
xgb_classifier.fit(X_train_std, y_train)

In [None]:
# Predict on the standardized test split, then round each prediction to an integer label.
y_pred = xgb_classifier.predict(X_test_std)
predictions = list(map(round, y_pred))

In [None]:
# Share of test rows whose rounded prediction matches the true label, shown as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# Confusion matrix of true vs. predicted labels, drawn as an integer-annotated heatmap.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
sns.heatmap(data=cm, annot=True, fmt='d')

#### b. PCA Dataframe

In [None]:
# Wrap the PCA-projected train and test arrays in DataFrames for the model.
X_train_pca, X_test_pca = (
    pd.DataFrame(projected) for projected in (pca_train_data, pca_test_data)
)

In [None]:
# (rows, components) of the PCA-projected training data.
X_train_pca.shape

In [None]:
# (rows, components) of the PCA-projected test data.
X_test_pca.shape

In [None]:
#fit the model on the PCA-projected training data
# NOTE(review): this reuses the xgb_classifier object that was fitted earlier on the
# original features - presumably the earlier fit is replaced; confirm.

xgb_classifier.fit(X_train_pca, y_train)

In [None]:
# Predict on the PCA-projected test data, then round each prediction to an integer label.
y_pred_1 = xgb_classifier.predict(X_test_pca)
predictions = list(map(round, y_pred_1))

In [None]:
# Accuracy of the PCA-based predictions against the true test labels, as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# Confusion matrix for the PCA-based predictions, drawn as an integer-annotated heatmap.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_1)
sns.heatmap(data=cm, annot=True, fmt='d')

As observed, the accuracy using the PCA dataframe is 93%, whereas for the original dataframe it is 96%. This indicates that using PCA to reduce the multicollinearity has not improved the accuracy.

#### Now let's try another way: dropping the highly correlated features and checking the accuracy score.

Going back to where we computed the correlations: we had identified the highly correlated features. Let's plot the correlation heatmap again for reference.

#### c. Dropping the highly correlated features

In [None]:
# Redraw the correlation heatmap of the full frame for reference.
df_corr = df.corr()
fig, ax = plt.subplots(figsize=(18, 12))
sns.heatmap(df_corr, annot=True, ax=ax)

The features listed below are highly correlated with other features, which means they carry almost the same information.

In [None]:
# Columns removed because they are highly correlated with columns that are kept.
drop_col = [
    'radius_mean', 'perimeter_mean', 'texture_mean', 'area_mean',
    'compactness_mean', 'concave points_mean',
    'texture_worst', 'concave points_worst', 'perimeter_worst',
    'area_worst', 'compactness_worst',
    'area_se', 'radius_se', 'perimeter_se',
]

# New frame without those columns; `df` itself is left untouched.
new_df = df.drop(columns=drop_col)

In [None]:
# (rows, columns) after dropping the highly correlated features.
new_df.shape

In [None]:
# Correlation heatmap of the reduced frame, to check what collinearity remains.
new_df_corr = new_df.corr()
fig, ax = plt.subplots(figsize=(18, 12))
sns.heatmap(new_df_corr, annot=True, ax=ax)

In [None]:
#splitting the new_df
X_new=new_df.drop("diagnosis",axis=1)
y_new=new_df["diagnosis"]
# random_state makes the split reproducible, so the reported accuracy no longer
# changes from run to run; stratify keeps the class ratio equal in both parts.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X_new, y_new, test_size=0.3, random_state=42, stratify=y_new
)

In [None]:
# Standardize the reduced feature set.
std=StandardScaler()
# Fit the scaler on the training data only ...
X_train2_std=std.fit_transform(X_train2)
# ... and reuse the training mean/variance for the test data. fit_transform on the
# test set would re-fit the scaler and scale train and test inconsistently.
X_test2_std=std.transform(X_test2)

In [None]:
# Fit the classifier on the standardized reduced-feature training data.
# NOTE(review): same xgb_classifier object as in the earlier experiments - presumably the previous fit is replaced; confirm.
xgb_classifier.fit(X_train2_std, y_train2)

In [None]:
# Predict on the standardized reduced-feature test data, then round to integer labels.
y_pred_2 = xgb_classifier.predict(X_test2_std)
predictions = list(map(round, y_pred_2))

In [None]:
# Accuracy of the reduced-feature predictions against the matching test labels, as a percentage.
accuracy = accuracy_score(y_true=y_test2, y_pred=predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# Confusion matrix for the reduced-feature model, drawn as an integer-annotated heatmap.
cm = confusion_matrix(y_true=y_test2, y_pred=y_pred_2)
sns.heatmap(data=cm, annot=True, fmt='d')