# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import zipfile
import sys

# IPython shell escape: run pip through the interpreter that is executing this
# notebook (sys.executable), so plotly is installed for the active kernel.
!{sys.executable} -m pip install plotly
import plotly.express as px
# NOTE(review): jupyterthemes is imported without a matching install step in the
# visible cells -- confirm it is already available in the environment.
from jupyterthemes import jtplot
# Apply the 'monokai' jupyterthemes plot style: notebook context, ticks on, grid off.
jtplot.style(theme = 'monokai', context = 'notebook', ticks = True, grid = False) 

# Data Cleaning


In [None]:
# Load the dataset from the CSV file (path is relative to the working directory)
data = pd.read_csv(filepath_or_buffer="cervical_cancer.csv")

# Print the column names, dtypes and non-null counts
data.info()

In [None]:
# Missing values are encoded as '?'; swap them for NaN in place
data.replace(to_replace='?', value=np.nan, inplace=True)
data

In [None]:
# Draw the null mask as a heatmap to spot columns with many missing values
null_mask = data.isnull()
plt.figure(figsize=(20, 20))
sns.heatmap(null_mask)


In [None]:
# Drop the two "time since diagnosis" columns, which had a lot of NaNs
mostly_missing_columns = [
    'STDs: Time since first diagnosis',
    'STDs: Time since last diagnosis',
]
data = data.drop(columns=mostly_missing_columns)


In [None]:
# Re-draw the null-value heatmap to visualise the remaining missing values
plt.figure(figsize=(20, 20))
sns.heatmap(data=data.isnull())

In [None]:
# Coerce every column to a numeric dtype; entries that cannot be parsed become NaN
data = data.apply(lambda column: pd.to_numeric(column, errors='coerce'))
data.info()


In [None]:
# Show the mean of each column (computed down the rows)
data.mean(axis=0)

In [None]:
# Fill each NaN with the mean of its corresponding column.
# NOTE(review): the means are taken over the whole table (every row and every
# column, including the target) -- confirm this is intended.
column_means = data.mean()
data = data.fillna(column_means)

In [None]:
# Sanity check: a uniform heatmap here means all the NaNs have been filled
sns.heatmap(data=data.isna())

In [None]:
# Pairwise Pearson correlation between all columns
corr_matrix = data.corr(method='pearson')
corr_matrix

In [None]:
# Render the correlation matrix as a heatmap with the coefficient written in each cell
plt.figure(figsize=(30, 30))
sns.heatmap(data=corr_matrix, annot=True)
plt.show()

In [None]:
# One histogram per column, 11 bins each, on a 30 x 30 inch figure
data.hist(bins=11, figsize=(30, 30))

# Data Preprocessing

In [None]:
# Separate the label from the features: 'Biopsy' is the target column
input_df = data.drop(columns='Biopsy')
target_df = data['Biopsy']

In [None]:
# Convert the feature table and the target column to float32 NumPy arrays
X = input_df.to_numpy().astype(np.float32)
y = target_df.to_numpy().astype(np.float32)


In [None]:
# standardiszing the input matrix such that mean of each feature (column) is zero and sd is 1 (Z value)
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)
X

In [None]:
# Splitting the data into test set and training set
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)


# Applying XG-Boost

In [None]:
# IPython shell escape: install xgboost with the interpreter that is running
# this notebook (sys.executable), so the package is available to the kernel.
import sys
!{sys.executable} -m pip install xgboost


In [None]:
import xgboost as xgb

# Classifier settings: 100 boosting rounds, trees up to depth 50, learning rate 0.1
xgb_params = {'learning_rate': 0.1, 'max_depth': 50, 'n_estimators': 100}
model = xgb.XGBClassifier(**xgb_params)

# Train the classifier on the training split
model.fit(X_train, y_train)


In [None]:
# Accuracy of the fitted model on the data it was trained on
accuracy_train = model.score(X=X_train, y=y_train)
accuracy_train

In [None]:
# Accuracy of the fitted model on the held-out test split
accuracy_test = model.score(X=X_test, y=y_test)
accuracy_test

In [None]:
# Predict a label for every row of the test split with the fitted model
y_predict = model.predict(X_test)

In [None]:
# Print the per-class metrics of the model on the test split
from sklearn.metrics import confusion_matrix, classification_report

report = classification_report(y_true=y_test, y_pred=y_predict)
print(report)

In [None]:
# Confusion matrix of the test-set predictions.
# Argument order matters: scikit-learn's signature is
# confusion_matrix(y_true, y_pred), giving rows = actual class and
# columns = predicted class. Passing the predictions first would transpose the
# matrix and swap the false-positive and false-negative cells.
cm = confusion_matrix(y_test, y_predict)

# fmt='d' prints the integer counts as-is instead of compact float notation.
ax = sns.heatmap(cm, annot = True, fmt = 'd')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax