# Fraud_check.r
#Decision Tree
#Use decision trees to prepare a model on fraud data
#treating those who have taxable_income <= 30000 as "Risky" and others are "Good"
# Package setup ----
# Packages this script attaches below.
required_pkgs <- c("party", "gmodels", "ggplot2", "dplyr", "caret", "C50")

# Install only the packages that are not available yet, instead of
# re-installing on every run. requireNamespace() returns FALSE (rather than
# raising an error) when a package is missing.
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

library(party)
library(gmodels)
library(ggplot2)
library(dplyr)
library(caret)
library(C50)
# Import and describe the data ----
# file.choose() opens a file picker, so this step needs an interactive session.
fraud <- read.csv(file.choose())
dim(fraud)   # row and column counts
names(fraud) # column names
str(fraud)   # structure of the data frame

# The data has 3 numerical and 3 categorical columns.
# Convert the categorical columns to factors.
fraud$Undergrad <- as.factor(fraud$Undergrad)
fraud$Marital.Status <- as.factor(fraud$Marital.Status)
fraud$Urban <- as.factor(fraud$Urban)

# Per-column summary
summary(fraud)

# Standard deviation of the numeric columns
sd(fraud$Taxable.Income)
sd(fraud$City.Population)
sd(fraud$Work.Experience)

# Variance of the numeric columns
var(fraud$Taxable.Income)
var(fraud$City.Population)
var(fraud$Work.Experience)

# Correlation matrix. cor() only accepts numeric input and `fraud` now holds
# factor columns, so restrict it to the numeric columns.
numeric_cols <- vapply(fraud, is.numeric, logical(1))
cor(fraud[, numeric_cols, drop = FALSE])
# Exploratory plots ----
# Histogram of each numeric column.
ggplot(fraud, aes(Taxable.Income)) +
  geom_histogram(binwidth = 100, fill = "darkgreen") +
  xlab("Taxable Income")
ggplot(fraud, aes(City.Population)) +
  geom_histogram(binwidth = 100, fill = "darkgreen")
ggplot(fraud, aes(Work.Experience)) +
  geom_histogram(binwidth = 0.5, fill = "darkgreen")

# Bar chart of the row count per level of each categorical column.
fraud %>%
  group_by(Undergrad) %>%
  summarise(Count = n()) %>%
  ggplot(aes(Undergrad, Count)) +
  geom_bar(stat = "identity", fill = "green")
fraud %>%
  group_by(Marital.Status) %>%
  summarise(Count = n()) %>%
  ggplot(aes(Marital.Status, Count)) +
  geom_bar(stat = "identity", fill = "green")
fraud %>%
  group_by(Urban) %>%
  summarise(Count = n()) %>%
  ggplot(aes(Urban, Count)) +
  geom_bar(stat = "identity", fill = "green")
# Build the classification target ----
# Rows with Taxable.Income <= 30000 are labelled "Risky", all others "Good".
# The levels are stated explicitly so both classes exist in the factor even
# if one of them happens to be absent from the data.
risk <- factor(
  ifelse(fraud$Taxable.Income <= 30000, "Risky", "Good"),
  levels = c("Good", "Risky")
)

# risk is derived from Taxable.Income, so that column must not be a
# predictor. Drop it by name rather than by position so the script does not
# depend on the column order of the CSV.
fraud1 <- cbind(fraud[, names(fraud) != "Taxable.Income", drop = FALSE], risk)

# Train / test split ----
# Fix the RNG seed so the partition (and everything derived from it) is
# reproducible between runs.
set.seed(123)
# p = 0.70 puts 70% of the rows in the training set; list = FALSE returns the
# selected row indices as a matrix instead of a list.
intraininglocal <- createDataPartition(fraud1$risk, p = 0.70, list = FALSE)
train <- fraud1[intraininglocal, ]
test <- fraud1[-intraininglocal, ]
# Single-split C5.0 decision tree ----
# Fit on the training partition; every column of `train` other than `risk`
# is used as a predictor.
mtree <- C5.0(risk ~ ., data = train)

# Predict for the test partition. The response is removed by name: fraud1 has
# only 6 columns, so the previous `test[, -7]` removed nothing. The predict()
# generic is used instead of calling the S3 method predict.C5.0 directly.
pred <- predict(mtree, newdata = test[, names(test) != "risk", drop = FALSE])

# Confusion table of actual (test$risk) versus predicted classes.
a <- table(test$risk, pred)
sum(diag(a)) / sum(a) # accuracy: correctly classified rows / all test rows
CrossTable(test$risk, pred)

# Plot the fitted tree
plot(mtree)
# Summary of the fitted model
summary(mtree)
# Conditional inference tree on the whole data set ----
# The tree is fitted on every row of fraud1; there is no train/test split in
# this section.
model <- ctree(formula = risk ~ ., data = fraud1)
summary(model)
plot(model)

# predict() is called without newdata, so no separate test set is involved.
pred1 <- predict(model)
# Cross-tabulate the actual labels against the fitted predictions.
CrossTable(risk, pred1)
# Repeated hold-out evaluation of a boosted C5.0 model ----
# Each iteration draws a fresh 70/30 partition, fits a boosted C5.0 model on
# the training part and records its accuracy on the test part.
n_models <- 100
acc <- numeric(n_models) # preallocated: one accuracy value per iteration
for (i in seq_len(n_models)) {
  print(i)
  intraininglocal <- createDataPartition(fraud1$risk, p = 0.70, list = FALSE)
  train <- fraud1[intraininglocal, ]
  test <- fraud1[-intraininglocal, ]

  # The response must be written as `risk`, not `train$risk`: with
  # `train$risk ~ .` the `.` still expands to every column of `train`,
  # including `risk` itself, so the target leaks into the predictors.
  # trials = 25 sets the number of boosting iterations.
  fittree <- C5.0(risk ~ ., data = train, trials = 25)

  # Remove the response by name; fraud1 has only 6 columns, so `test[, -7]`
  # removed nothing.
  pred2 <- predict(
    fittree,
    newdata = test[, names(test) != "risk", drop = FALSE]
  )
  a <- table(test$risk, pred2)

  # Store the accuracy of this iteration's model.
  acc[i] <- sum(diag(a)) / sum(a)
}
summary(acc)
boxplot(acc)
# Only the model from the last iteration is still available here.
summary(fittree)