In [None]:
import pandas as pd
# Read the BRCA CSV from the relative input directory into `cancer`
# and preview the first rows.
cancer=pd.read_csv('../input/breastcancerdataset/BRCA.csv')
cancer.head()

In [None]:
# Discard rows that have no patient identifier, then show how many
# missing values remain in each column.
cancer = cancer.dropna(subset=["Patient_ID"])
cancer.isna().sum()

In [None]:
# Column dtypes and non-null counts after removing rows without Patient_ID.
cancer.info()

In [None]:
# Summary statistics as produced by DataFrame.describe().
cancer.describe()

In [None]:
# Number of rows per value of the target column.
cancer["Patient_Status"].value_counts()

In [None]:
# Keep only rows whose target (Patient_Status) is known.
# An explicit .copy() makes the filtered frame independent of the original,
# so the column assignments performed later in the notebook neither raise
# SettingWithCopyWarning nor depend on view/copy semantics.
cancer = cancer.loc[cancer["Patient_Status"].notna()].copy()
cancer.shape

In [None]:
# Share of each Patient_Status value as a percentage of all rows,
# rounded to two decimals.
cancer["Patient_Status"].value_counts().mul(100).div(len(cancer)).round(2)

In [None]:
# Where the last-visit date is missing, fall back to the surgery date.
# The previous form called fillna(inplace=True) on a column selected from the
# frame (chained assignment): pandas 2.2 warns about it and under
# Copy-on-Write it silently leaves `cancer` unchanged. Building a new frame
# with assign() works on every pandas version.
cancer = cancer.assign(
    Date_of_Last_Visit=cancer["Date_of_Last_Visit"].fillna(cancer["Date_of_Surgery"])
)
cancer.isnull().sum()

In [None]:
# Re-check dtypes and non-null counts after filling Date_of_Last_Visit.
cancer.info()

In [None]:
# Preview the first rows of the current frame.
cancer.head()

In [None]:
# Parse both date columns from text of the form day-abbreviated month-two digit
# year (format '%d-%b-%y') into datetime values, then confirm the new dtypes.
for date_col in ('Date_of_Surgery', 'Date_of_Last_Visit'):
    cancer[date_col] = pd.to_datetime(cancer[date_col], format='%d-%b-%y')
cancer.info()

In [None]:
# Number of distinct values in each column.
cancer.nunique()

In [None]:
# Remove the 'ER status' and 'PR status' columns and preview the result.
# NOTE(review): presumably dropped because they carry a single value - confirm
# against the nunique() output.
cancer = cancer.drop(columns=['ER status', 'PR status'])
cancer.head()

In [None]:
# Skewness of the numeric columns only. The frame also holds text and datetime
# columns, and since pandas 2.0 DataFrame.skew() no longer drops them silently
# (it raises TypeError), so the restriction has to be explicit.
cancer.skew(numeric_only=True)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# One box plot per numeric feature.
# The earlier `plt.figsize=(10,5)` merely set an unused attribute on the pyplot
# module and never changed the figure size; the size is now passed when the
# figure is created.
for feature in ['Age', 'Protein1', 'Protein2', 'Protein3', 'Protein4']:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(y=cancer[feature], ax=ax)
    ax.set_title(f'Distribution of {feature}')
    plt.show()
    plt.close(fig)  # release the figure so repeated runs do not accumulate open figures

In [None]:
!pip install pandas-profiling

In [None]:
from pandas_profiling import ProfileReport
# Build a profiling report for the cleaned frame; as the last expression of
# the cell it is rendered inline.
ProfileReport(cancer, title="EDA Report")

In [None]:
!pip install pycaret

In [None]:
# Hold back 20% of the rows as "unseen" data: sample 80% with a fixed seed for
# modelling, keep the remainder (selected via the sampled index labels) for the
# final prediction check, and give both frames a fresh 0..n-1 index.
data = cancer.sample(frac=0.8, random_state=42)
data_unseen = cancer.drop(index=data.index).reset_index(drop=True)
data = data.reset_index(drop=True)
print(f'Data for Modeling: {data.shape}')

In [None]:
# Report the size of the held-back frame.
print(f'Unseen Data For Predictions: {data_unseen.shape}')

In [None]:
from pycaret.classification import *

In [None]:
# Dtypes and non-null counts of the modelling split that is passed to setup().
data.info()

In [None]:
# Configure the PyCaret classification experiment on the modelling split.
# The return value gets its own name: it was previously assigned to `cancer`,
# which replaced the cleaned DataFrame and made every earlier cell that uses
# `cancer` fail when re-run after this one.
clf_setup = setup(
    data=data,
    target='Patient_Status',
    session_id=42,  # fixed seed so the experiment is repeatable
    train_size=0.8,
    ignore_features=['Patient_ID'],
    date_features=['Date_of_Surgery', 'Date_of_Last_Visit'],
    numeric_imputation='median',
    normalize=True,
    transformation=True,
    handle_unknown_categorical=True,
    unknown_categorical_method='most_frequent',
    remove_multicollinearity=True,  # drop one of two features that are highly correlated with each other
    ignore_low_variance=True,  # categorical features with statistically insignificant variance are removed
    combine_rare_levels=True,  # levels below rare_level_threshold are combined into a single level
    fix_imbalance=True,
    log_experiment=True,
)

In [None]:
# Create a model for the estimator id 'rf' within the configured experiment.
rf = create_model('rf')

In [None]:
# Show the text representation of the fitted model object.
print(rf)

In [None]:
# Tune the model's hyperparameters, selecting on Precision.
tuned_rf = tune_model(rf,optimize = 'Precision')

In [None]:
# Plot type 'auc' for the tuned model.
plot_model(tuned_rf, plot = 'auc')

In [None]:
# Plot type 'pr' for the tuned model.
plot_model(tuned_rf, plot = 'pr')

In [None]:
# Plot type 'feature' for the tuned model.
plot_model(tuned_rf, plot='feature')

In [None]:
# Plot type 'confusion_matrix' for the tuned model.
plot_model(tuned_rf, plot = 'confusion_matrix')

In [None]:
# Open PyCaret's evaluation view for the tuned model.
evaluate_model(tuned_rf)

In [None]:
# Score the tuned model without passing `data`.
# NOTE(review): per the PyCaret docs this evaluates on the hold-out split
# created by setup() - confirm for the installed version.
predict_model(tuned_rf)

In [None]:
# Apply the tuned model to the held-back rows and preview the returned frame.
unseen_predictions = predict_model(tuned_rf, data=data_unseen)
unseen_predictions.head()

In [None]:
# Mean of the `Score` column over the unseen predictions, to two decimals.
print(f"Confidence Score :   {round(unseen_predictions['Score'].mean(), 2)}")