In [None]:
# This R environment comes with many helpful analytics packages installed
# It is defined by the kaggle/rstats Docker image: https://github.com/kaggle/docker-rstats
# For example, here's a helpful package to load

library(tidyverse) # metapackage of all tidyverse packages

# The read-only "../input/" directory holds the input data files.
# Running this cell (Run button or Shift+Enter) lists what is under that directory.

list.files("../input")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the student study-hours / exam-scores CSV straight from GitHub.
# read.csv() already defaults to header = TRUE and sep = ",", so only the
# string handling needs to be spelled out (TRUE/FALSE instead of T/F, which
# are ordinary variables that can be reassigned).
data <- read.csv(
  "https://raw.githubusercontent.com/AdiPersonalWorks/Random/master/student_scores%20-%20student_scores.csv",
  stringsAsFactors = FALSE
)

In [None]:
# Preview the first rows of the loaded data frame
head(data)

In [None]:
library(Hmisc)

In [None]:
# Per-column summary via describe() (Hmisc is attached in the cell above)
describe(data)

In [None]:
# List the rows of data that contain at least one missing value
data[!complete.cases(data),]

In [None]:
# Box plot of Hours to eyeball its spread and any outliers
boxplot(data$Hours)

In [None]:
library("dplyr")
library("ggpubr")

In [None]:
# Visual normality check: kernel density plot of the Hours column
library(ggpubr)
ggdensity(
  data$Hours,
  xlab = "Study Hours",
  main = "Density plot of No. of study hours"
)

In [None]:
# Q-Q plot of Hours with qqPlot() (car is attached on the next line)
library("car")
qqPlot(data$Hours)

In [None]:
# shapiro.test() runs the Shapiro-Wilk normality test on a single variable (univariate)
shapiro.test(data$Hours)

In [None]:
# Histogram of Hours as a further look at the shape of its distribution
hist(data$Hours)

In [None]:
# Simple linear regression of Scores on Hours, fitted on the full data set.
# NOTE: the fitted model is stored under the name `lm` (same name as the
# fitting function); later cells refer to this object as `lm`, so the name is
# kept. Calls such as lm(...) still resolve to the function.
lm <- lm(formula = Scores ~ Hours, data = data)
summary(lm)  # coefficient table and fit statistics

In [None]:
# Scatter plot of Scores against Hours with the fitted regression line.
# The formula form names the two columns explicitly, so the plot stays a
# Hours/Scores scatter (matching the line drawn by abline) even if the data
# frame gains extra columns or its column order changes.
plot(Scores ~ Hours, data = data, pch = 16, col = "blue")
abline(lm)  # `lm` is the fitted Scores ~ Hours model object

In [None]:
# Residuals of the Scores ~ Hours fit (object `lm`), plotted against their index
plot(lm$residuals, pch = 16, col = "red")

In [None]:
# Scatter plot of Scores against Hours with the fitted regression line on top
plot(Scores~Hours, data = data)
abline(lm)

In [None]:
library(openintro)
library(dplyr)
library(ggplot2)
library(e1071)

In [None]:
# Skewness of Hours (skewness() is presumably the e1071 one attached above)
skewness(data$Hours)

In [None]:
# Log-log model: both Scores and Hours go through log1p()
lm_log.model <- lm(formula = log1p(Scores) ~ log1p(Hours), data = data)

In [None]:
# Coefficients and fit statistics of the log-log model
summary(lm_log.model)

In [None]:
# Natural log of Hours, stored as a helper column for the plots and model below
data$log_hours <- log(data$Hours)

In [None]:
# Histogram of the log-transformed hours
hist(data$log_hours)

In [None]:
# Density of the log-transformed hours; the x axis is labelled as log hours
# because the plotted values are log(Hours), not raw study hours.
ggdensity(data$log_hours,
          main = "Density plot of log transformed hours",
          xlab = "log(Study Hours)")

In [None]:
# Variant of the log model that uses the precomputed log_hours column as predictor
lm_log.model2 <- lm(formula = log1p(Scores) ~ log_hours, data = data)

In [None]:
# Coefficients and fit statistics of this variant
summary(lm_log.model2)

In [None]:
# Drop the log-transformed helper column, as it has no significant effect.
# Removing it by name (rather than by position 3) cannot hit the wrong column
# and is harmless if this cell is run more than once.
data$log_hours <- NULL
head(data)

In [None]:
# Split the data into training and test sets with caret's createDataPartition().
# p = 0.60 puts about 60% of the rows in the training set (a 60/40 split, not
# 80/20); change p to 0.80 if an 80/20 split is what is wanted.
library(caret)
set.seed(99)  # makes the random partition reproducible
index <- createDataPartition(data$Scores, p = 0.60, list = FALSE)
train <- data[index, ]
test <- data[-index, ]

In [None]:
# Baseline model: ordinary least squares fitted on the training split only
set.seed(99)
caret_lm <- lm(formula = Scores ~ ., data = train)
caret_lm_tst_pred <- predict(caret_lm, newdata = test)

# Predicted vs. actual test scores; the red y = x line marks perfect prediction
plot(
  x = caret_lm_tst_pred,
  y = test$Scores,
  main = "Predicted vs Actual: Linear",
  xlab = "Predicted",
  ylab = "Actual",
  pch = 18,
  col = "blue"
)
grid()
abline(a = 0, b = 1, lwd = 2, col = "red")

In [None]:
# Coefficients and fit statistics of the training-set model
summary(caret_lm)

In [None]:
# Test-set performance metrics from caret's postResample()
postResample(pred = caret_lm_tst_pred, obs = test$Scores)

In [None]:
# Same linear model, now fitted through caret's train() with repeated
# 10-fold cross-validation (3 repeats).
set.seed(999)
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# caret's pre-processing argument is spelled `preProcess`. R argument names are
# case-sensitive, so a lower-case `preprocess` is not matched to it and the
# centering, scaling and Box-Cox steps would be skipped.
caret_lm2 <- train(
  Scores ~ .,
  data = train,
  method = "lm",
  trControl = control,
  preProcess = c("center", "scale", "BoxCox")
)
clv_lm_tst_pred2 <- predict(caret_lm2, test)

# Predicted vs. actual test scores; the red y = x line marks perfect prediction
plot(clv_lm_tst_pred2, test$Scores,
    xlab = "Predicted", ylab = "Actual",
    main = "Predicted vs Actual: Linear", col = "blue", pch = 18)
grid()
abline(0, 1, col = "red", lwd = 2)

In [None]:
# Model summary for the caret-trained linear model
summary(caret_lm2)

In [None]:
# Test-set performance metrics from caret's postResample() (predicted vs. actual Scores)
postResample(clv_lm_tst_pred2, test$Scores)

In [None]:
# Random forest regression of Scores on the remaining predictor(s).
library(randomForest)
set.seed(999)
# mtry is a whole number of candidate variables per split, not a fraction:
# use (number of predictors) / 3, rounded down, but never less than 1.
n_predictors <- ncol(train) - 1
rf <- randomForest(
  Scores ~ .,
  data = train,
  mtry = max(floor(n_predictors / 3), 1),
  importance = TRUE,
  # The argument is `ntree`; a misspelled `ntrees` is silently ignored and the
  # forest is grown with the default number of trees instead.
  ntree = 100
)
rf
rf_tst_pred <- predict(rf, test)

# Predicted vs. actual test scores; the red y = x line marks perfect prediction
plot(rf_tst_pred, test$Scores,
    xlab = "Predicted", ylab = "Actual",
    main = "Predicted vs Actual: Random Forest", col = "blue", pch = 18)
grid()
abline(0, 1, col = "red", lwd = 2)

summary(rf)
# Test-set performance metrics from caret's postResample()
postResample(rf_tst_pred, test$Scores)

In [None]:
# Recap: summary of the raw-scale model (Scores ~ Hours) fitted on the full data
summary(lm)

In [None]:
# Recap: summary of the log-log model (log1p(Scores) ~ log1p(Hours))
summary(lm_log.model)

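*Finally, we can conclude that simple linear regression gives a better result than Random Forest. The residual standard error also dropped significantly for the log-transformed model; note, however, that there it is measured on the log scale, so it is not directly comparable with the residual standard error of the untransformed model.*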
**Scores = a + b × Hours**
You can see the values of the intercept and the slope in the model summary above. These “a” and “b” values define the straight line fitted through the points of the data. So in this case, with intercept (a) = 2.4837 and slope (b) = 9.7758, for a student who studies 5 hours per day the model predicts (on average) a score of around 2.4837 + (5 × 9.7758) = 51.3627.