In [1]:
# Load the hospital cost records from the Excel workbook into `data`.
library(readxl)
data <- read_excel(path = "hospitalcosts.xlsx")

### Basic EDA

In [2]:
# Preview the first six records of the dataset.
head(data, n = 6L)

AGE,FEMALE,LOS,RACE,TOTCHG,APRDRG
17,1,2,1,2660,560
17,0,2,1,1689,753
17,1,7,1,20060,930
17,1,1,1,736,758
17,1,1,1,1194,754
17,0,0,1,3305,347


In [3]:
# Preview the last six records of the dataset.
tail(data, n = 6L)

AGE,FEMALE,LOS,RACE,TOTCHG,APRDRG
0,1,3,1,1886,640
0,1,6,1,5881,636
0,1,2,1,1171,640
0,1,2,1,1171,640
0,1,2,1,1086,640
0,0,4,1,4931,640


In [4]:
# Dimensions of the dataset: dim() yields c(rows, columns), from which the
# individual row and column counts are taken.
dims <- dim(data)
obs <- dims[1L]    # number of rows (observations)
nCols <- dims[2L]  # number of columns (features)

# Report rows, then columns, then the combined dimensions.
print(obs)
print(nCols)
print(dims)

[1] 500
[1] 6
[1] 500   6


In [5]:
# List the column (feature) names of the dataset.
colnames(data)

In [6]:
# Compact overview of the dataset: classes, size, and a peek at each column.
str(object = data)

Classes 'tbl_df', 'tbl' and 'data.frame':	500 obs. of  6 variables:
 $ AGE   : num  17 17 17 17 17 17 17 16 16 17 ...
 $ FEMALE: num  1 0 1 1 1 0 1 1 1 1 ...
 $ LOS   : num  2 2 7 1 1 0 4 2 1 2 ...
 $ RACE  : num  1 1 1 1 1 1 1 1 1 1 ...
 $ TOTCHG: num  2660 1689 20060 736 1194 ...
 $ APRDRG: num  560 753 930 758 754 347 754 754 753 758 ...


In [7]:
# Show the class vector of the object returned by read_excel().
class(x = data)

In [8]:
# Explore the AGE variable (age of patients): five-number summary plus mean.
summary(data[["AGE"]])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   0.000   0.000   5.086  13.000  17.000 

In [9]:
# Sample variance of patient age.
var(data[["AGE"]])

In [10]:
# Sample standard deviation of patient age.
sd(data[["AGE"]])

### Data Modelling

In [11]:
# Partition the data into training (80%) and test (20%) sets.
library(caTools)

# Fix the RNG seed so the split is reproducible across runs.
set.seed(1)

# Logical mask from sample.split(): TRUE marks rows assigned to the training set.
# Note: this global shares its name with base R's sample() function.
sample <- sample.split(data$TOTCHG, SplitRatio = 0.80)

train_data <- subset(data, sample)
test_data <- subset(data, !sample)

"package 'caTools' was built under R version 3.6.3"

In [12]:
# Baseline linear regression: total charges against every other column of the
# training set. All predictors enter as numeric, as read from the workbook.
model <- lm(formula = TOTCHG ~ ., data = train_data)
summary(model)


Call:
lm(formula = TOTCHG ~ ., data = train_data)

Residuals:
   Min     1Q Median     3Q    Max 
 -6027   -853   -159    146  42906 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 5123.8780   618.0184   8.291 1.80e-15 ***
AGE          150.1851    21.2746   7.059 7.62e-12 ***
FEMALE      -343.0310   300.4870  -1.142    0.254    
LOS          789.1367    49.0452  16.090  < 2e-16 ***
RACE        -252.1402   273.3765  -0.922    0.357    
APRDRG        -7.8197     0.8176  -9.565  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2841 on 393 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.4897,	Adjusted R-squared:  0.4832 
F-statistic: 75.44 on 5 and 393 DF,  p-value: < 2.2e-16


In [13]:
# Reduced model: keep only the predictors that were significant in the
# baseline fit (AGE, LOS, APRDRG).
newModel <- lm(formula = TOTCHG ~ AGE + LOS + APRDRG, data = train_data)
summary(newModel)


Call:
lm(formula = TOTCHG ~ AGE + LOS + APRDRG, data = train_data)

Residuals:
   Min     1Q Median     3Q    Max 
 -6201   -873   -139    163  42864 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 4801.1658   520.7266   9.220  < 2e-16 ***
AGE          144.7810    20.7931   6.963 1.39e-11 ***
LOS          789.4263    48.9385  16.131  < 2e-16 ***
APRDRG        -7.9746     0.7942 -10.042  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2838 on 396 degrees of freedom
Multiple R-squared:  0.4873,	Adjusted R-squared:  0.4834 
F-statistic: 125.4 on 3 and 396 DF,  p-value: < 2.2e-16


In [14]:
# Predict total charges for the held-out test set using the reduced model.
TOTCHG_pred <- predict(object = newModel, newdata = test_data)

# Wrap the predictions in a one-column data frame (column named TOTCHG_pred).
TOTCHG_pred1 <- data.frame(TOTCHG_pred = TOTCHG_pred)

In [15]:
# Preview the first six predicted charges.
head(TOTCHG_pred1, n = 6L)

TOTCHG_pred
2007.1552
4495.2721
4407.3322
3941.0909
997.4049
388.9155


In [16]:
# Attach the predicted charges to the test-set rows and save the result locally.
final_data <- cbind(test_data, TOTCHG_pred1)

# write.csv() writes row names by default, which adds an unnamed leading index
# column to the file; suppress it so the CSV contains only the data columns.
write.csv(final_data, "final_data.csv", row.names = FALSE)