# STEP 1 : Importing Libraries 

In [None]:
library(naniar)
library(tidyverse)
library(C50)
library(ggcorrplot)
library(ggplot2)
library(caTools)

# STEP 2:  Importing Data 

In [None]:
# Read the breast cancer dataset from a CSV file in the working directory.
cancer_data <- read.csv(file = "breastcancer.csv")

# Open the loaded data frame in the interactive data viewer.
View(cancer_data)

# STEP 3 : Checking for missing values in the data using the following three methods:

## Method 1 

In [None]:
# Total number of missing (NA) cells across the whole data frame.
cancer_data %>%
  is.na() %>%
  sum()

## Method 2

In [None]:
# Graphical overview of where the missing values are located.
cancer_data %>%
  vis_miss()

## Method 3 : Checking columnwise missing values 

In [None]:
# Number of missing (NA) values in each column.
# vapply() is used instead of sapply() so the result is always a named
# integer vector, even for a data frame with no columns.
vapply(cancer_data, function(column) sum(is.na(column)), integer(1))

# STEP 4 : Handling Data 

In [None]:
# Move the dependent variable (diagnosis) so that it sits directly after
# fractal_dimension_worst, keeping it apart from the predictors.
cancer_data <- relocate(
  cancer_data,
  diagnosis,
  .after = fractal_dimension_worst
)

## We can check the correlation between the variables with a correlation plot (ggcorrplot); before that, diagnosis is encoded as a factor and then converted to numeric.
## Feature scaling is also needed, because a few columns have values that are much larger than the others.
## Let's start with columns 2 to 5.

In [None]:
# Standardise (centre and scale) the columns whose raw values are much
# larger than the rest, starting with columns 2 to 5.
cancer_data[, 2:5] <- scale(cancer_data[, 2:5])

# scale() returns a one-column matrix carrying "scaled:center" and
# "scaled:scale" attributes. Assigning that directly to a single column
# would store a matrix inside the data frame, so as.numeric() is used to
# keep each column a plain numeric vector.
cancer_data$perimeter_se <- as.numeric(scale(cancer_data$perimeter_se))
cancer_data$texture_se <- as.numeric(scale(cancer_data$texture_se))
cancer_data$area_se <- as.numeric(scale(cancer_data$area_se))

# Same standardisation for columns 22 to 25.
cancer_data[, 22:25] <- scale(cancer_data[, 22:25])

# Inspect the column names and the scaled data.
colnames(cancer_data)

view(cancer_data)


# Encode the diagnosis as a numeric indicator: "B" becomes 0 and "M"
# becomes 1. match() gives the position (1 or 2) of each value in
# c("B", "M"), or NA for any other value; subtracting 1 yields 0 / 1 / NA.
cancer_data$diagnosis <- match(cancer_data$diagnosis, c("B", "M")) - 1

# Confirm the resulting column types.
str(cancer_data)



# Correlation matrix over all columns of the data frame, computed from
# the rows that have no missing values.
r <- cancer_data %>%
  cor(use = "complete.obs")

# Print the correlations rounded to two decimals.
round(r, digits = 2)

# Visualise the correlation matrix.
ggcorrplot(corr = r)

# STEP 4 :  Splitting data into training set and test set 

In [None]:
# Fix the random seed so the train/test split, and therefore every metric
# reported further down, is reproducible from run to run.
set.seed(123)

# sample.split() returns a logical vector: TRUE marks the rows assigned to
# the training set (60 % of the data, preserving the class ratio of
# diagnosis), FALSE marks the test rows.
split_df <- sample.split(cancer_data$diagnosis, SplitRatio = 0.6)

train_cancer_df <- cancer_data[split_df, ]
test_cancer_df <- cancer_data[!split_df, ]


# STEP 5 : Preparing the C5.0 model: convert the diagnosis column to a factor again so the model can use it, then train the model on the training set

In [None]:
# C5.0 needs a factor outcome, so convert the 0/1 diagnosis back to a factor.
train_cancer_df$diagnosis <- as.factor(train_cancer_df$diagnosis)

str(train_cancer_df$diagnosis)

# `diagnosis ~ .` uses every remaining column as a predictor, so a record
# identifier must be removed first: it carries no diagnostic information and
# lets the tree split on arbitrary ID ranges.
# NOTE(review): assumes the identifier column is named "id" -- confirm against
# colnames(cancer_data). setdiff() makes this a no-op if no such column exists.
train_model_df <- train_cancer_df[
  , setdiff(names(train_cancer_df), "id"),
  drop = FALSE
]

# Fit the decision tree on the training set.
c50_cancer_df <- C5.0(diagnosis ~ ., data = train_model_df)

# Plotting the decision tree
plot(c50_cancer_df)

# Fit the same model as a rule set and view the rules.
train_cancer_df_rules <- C5.0(diagnosis ~ ., data = train_model_df, rules = TRUE)
summary(train_cancer_df_rules)


# STEP 6 : Evaluating the model performance 

In [None]:
# Predicted class labels for the rows the model was trained on.
predict_train <- predict(
  object = c50_cancer_df,
  newdata = train_cancer_df,
  type = "class"
)

# Peek at the first few predictions.
head(predict_train)

# STEP 7 : Making Confusion Matrix 

In [None]:
# Confusion matrix for the training set: observed diagnosis (rows) against
# predicted class (columns). The outcome is selected by name rather than by
# the hard-coded position 32, so this keeps working if columns are added,
# removed or reordered.
cm_train <- table(train_cancer_df$diagnosis, predict_train)

cm_train

# Compare the first few observed and predicted labels side by side.
head(train_cancer_df$diagnosis)

head(predict_train)

# Now making table of confusion matrix with dimension names :
table(
  train_cancer_df$diagnosis,
  predict_train,
  dnn = c("Observed Class", "Predicted Class")
)


# STEP 8  :Now checking test set results 

In [None]:

# Predicted class labels for the held-out test rows.
predict_test <- predict(
  object = c50_cancer_df,
  newdata = test_cancer_df,
  type = "class"
)
head(predict_test)


# STEP 9 : Now making table of confusion matrix with dimension names : 

In [None]:
# Confusion matrix for the test set: observed diagnosis (rows) against
# predicted class (columns). The outcome is selected by name rather than by
# the hard-coded position 32.
cm_test <- table(
  test_cancer_df$diagnosis,
  predict_test,
  dnn = c("Observed Class", "Predicted Class")
)
cm_test

# Test-set accuracy: the share of rows whose predicted label equals the
# observed one. Both sides are compared as character because the observed
# diagnosis is numeric 0/1 while the prediction is a factor.
mean(as.character(predict_test) == as.character(test_cancer_df$diagnosis))

str(test_cancer_df$diagnosis)
str(predict_test)


# CONCLUSION : Accuracy of the model was 95.6 % in the recorded run; the exact value depends on the random train/test split.