Load in the auto mpg data set: https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data

In [None]:
# Column names for the data file, which is read without a header row.
header <- c(
  "mpg", "cylinders", "displacement", "horsepower", "weight",
  "acceleration", "modelYear", "origin", "carName"
)

# Read the raw table straight from the UCI repository and label its columns.
auto_mpg <- read.table(
  "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data",
  header = FALSE
)
names(auto_mpg) <- header

Identify all of the categorical variables, all of the numeric variables and all of the binary variables.

In [None]:
# Show the class of every column.
sapply(auto_mpg, class)

# Bucket the column names by storage type.
CatCols  <- names(Filter(is.factor, auto_mpg))
NumCols  <- names(Filter(is.numeric, auto_mpg))
LogiCols <- names(Filter(is.logical, auto_mpg))
CharCols <- names(Filter(is.character, auto_mpg))

# Verify: every column must land in exactly one bucket.
stopifnot(ncol(auto_mpg) == length(c(CatCols, NumCols, LogiCols, CharCols)))

# Recorded result: Categorical: 2; Numerical: 7; Logical: 0; Character: 0
# Discrepancies:
#   - horsepower and carName come in as factors, which neither should be.
#   - Converting those columns may also change any invalid values they hold.
#   - horsepower contains 6 invalid "?" entries (see the missing-data step).
#   - Those entries are recoded to 0: a horsepower of 0 makes no sense for
#     this data, so the recoded rows remain easy to identify.

# Work on a copy so the raw table stays available for the missing-data check.
autompg1 <- auto_mpg

# Convert horsepower to numeric, recoding the "?" placeholder to 0.
# Going through as.character() first matters: when read.table() returns
# factors (the default before R 4.0), ifelse() applied to a factor element
# yields its integer level code rather than the horsepower value.
# %in% is used instead of == so that an NA entry cannot break the assignment.
# NOTE(review): 0 is a sentinel, not a measurement; these rows still feed the
# correlations and regressions below. NA would be the cleaner encoding, but
# the downstream cor() call would then need use = "complete.obs".
hpText <- as.character(autompg1$horsepower)
hpText[hpText %in% "?"] <- "0"
autompg1$horsepower <- as.numeric(hpText)

# carName is free text, not a categorical variable.
autompg1$carName <- as.character(autompg1$carName)

# Repeat the type census on the cleaned copy.
CatCols  <- names(Filter(is.factor, autompg1))
NumCols  <- names(Filter(is.numeric, autompg1))
LogiCols <- names(Filter(is.logical, autompg1))
CharCols <- names(Filter(is.character, autompg1))

# Verify: every column must land in exactly one bucket.
stopifnot(ncol(autompg1) == length(c(CatCols, NumCols, LogiCols, CharCols)))

# Print the size of each bucket.
length(CatCols)
length(NumCols)
length(LogiCols)
length(CharCols)
# After conversion (recorded result):
#   Categorical: 0; Numerical: 8; Logical: 0; Character: 1

Exploratory Data Analysis

In [None]:
# Per-column summary statistics.
summary(autompg1)
# The minimum horsepower of 0 is the reassigned placeholder; it can be ignored.
# The cylinders and origin data are skewed.

# Box plots of every column that is neither character nor logical.
relCols <- setdiff(names(autompg1), c(CharCols, LogiCols))
boxplot(autompg1[, relCols])
# One outlier in mpg; multiple outliers in acceleration.

# Scatterplot matrix of the plotted columns.
pairs(autompg1[, relCols])

# Pairwise correlations between all numeric columns.
corMat <- cor(autompg1[, NumCols])

# Keep each pair once: blank everything that is not strictly above the
# diagonal, flatten to (Var1, Var2, Freq) rows, and drop the blanked cells.
corMatrix <- corMat
corMatrix[!upper.tri(corMatrix)] <- NA
corMatrix <- na.omit(as.data.frame(as.table(corMatrix)))

# Strongest relationships first, regardless of sign; show the top five.
corMatrix <- corMatrix[order(-abs(corMatrix$Freq)), ]
corMatrix[seq_len(5), ]
# displacement-cylinders, displacement-weight and cylinders-weight have a
# strong positive relation; mpg-weight and mpg-displacement have a strong
# negative relation.
#
# Recorded output:
##            Var1         Var2       Freq
## 18    cylinders displacement  0.9507214
## 35 displacement       weight  0.9328241
## 34    cylinders       weight  0.8960168
## 33          mpg       weight -0.8317409
## 17          mpg displacement -0.8042028

# One histogram per numeric predictor/response. The vector maps each column
# to the x-axis label it is drawn with.
histLabels <- c(
  mpg = "mpg",
  cylinders = "cylinders",
  displacement = "displacement",
  horsepower = "horsepower",
  weight = "weight",
  acceleration = "acceleration",
  modelYear = "ModelYear"
)
for (histCol in names(histLabels)) {
  hist(
    autompg1[[histCol]],
    # Same title hist() derives on its own from an `autompg1$<column>` argument.
    main = paste("Histogram of", paste0("autompg1$", histCol)),
    xlab = histLabels[[histCol]]
  )
}

# Observations:
#   - mpg values below 10 are ignored; they are probably invalid.
#   - A cylinder count of 4 has the highest frequency.
#   - acceleration is roughly normally distributed, mean/median around 15.
#   - Model years 70-71 have more observations than the other years.

Correlation matrix for all of the numeric variables.

In [None]:
# corrplot() comes from the corrplot package, which nothing in the visible
# notebook attaches before this point; load it here so the cell runs.
library('corrplot')

# Plot the correlation matrix of the numeric columns (corMat is computed in
# the exploratory step). diag = FALSE hides the trivial self-correlations.
# The top margin is widened because corrplot's default margins of zero clip
# the title.
corrplot(
  corMat,
  title = "Correlation plot for auto mpg",
  diag = FALSE,
  mar = c(0, 0, 2, 0)
)

Identify the columns (if any) with missing data.

In [None]:
# Count suspicious entries per column of the raw table: empty strings, "?",
# NA (real or spelled out), and the common sentinels 999 and 0.
#
# The columns are inspected one at a time rather than through apply():
# apply() first turns the whole data frame into a character matrix, and that
# conversion runs format() on numeric columns, padding values or adding
# decimals (" 0.0", " 999"), so exact comparisons with "0" or "999" could
# silently miss them.
suspectValues <- c("", "?", "NA", "999", "0")
data_issues <- vapply(
  auto_mpg,
  function(column) {
    sum(is.na(column) | trimws(as.character(column)) %in% suspectValues)
  },
  integer(1)
)
data_issues     # data issues are resolved in autompg1
                # horsepower has 6 "?" invalid values

Divide the data into a train/test set (80% and 20% respectively) using stratified sampling

In [None]:
# Fix the random seed so the split is reproducible.
set.seed(18757)
library(caret)

# Share of rows assigned to each split.
trainPct <- 0.8
testPct <- 1 - trainPct

# Draw the training rows with caret, using mpg as the outcome to balance on;
# everything not drawn becomes the test set.
inTrain <- createDataPartition(autompg1$mpg, p = trainPct, list = FALSE)
mpgTrain <- autompg1[inTrain, ]
mpgTest <- autompg1[-inTrain, ]

# The two splits together must account for every row.
stopifnot(nrow(autompg1) == nrow(mpgTrain) + nrow(mpgTest))

Fit a linear model to the data using the numeric variables only. Calculate the R^2 on the test set.

In [None]:
# Response variable, and every other numeric column as a predictor.
yVar <- "mpg"
xVars <- setdiff(NumCols, yVar)

# Build a model formula from variable names.
#
# Arguments:
#   yVar             name of the response variable.
#   xVars            character vector of predictor names, joined with "+".
#   includeIntercept when FALSE, "-1" is appended to drop the intercept.
#
# Returns a formula of the form yVar ~ x1 + x2 + ... (optionally "- 1").
createModelFormula <- function(yVar, xVars, includeIntercept = TRUE) {
  rhsText <- paste(xVars, collapse = "+ ")
  formulaText <- paste(yVar, "~", rhsText)
  if (!includeIntercept) {
    formulaText <- paste(formulaText, -1)
  }
  as.formula(formulaText)
}

# Fit on the training split only. The test split is held out so that the
# R^2 computed below is an out-of-sample figure.
modelForm <- createModelFormula(yVar = yVar, xVars = xVars, includeIntercept = FALSE)
model <- lm(modelForm, data = mpgTrain)
summary(model)

# R^2 on the test set: 1 - SSE / SST, with SST taken around the mean of the
# held-out response.
testPred <- predict(model, newdata = mpgTest)
testObs <- mpgTest[[yVar]]
testR2 <- 1 - sum((testObs - testPred)^2) / sum((testObs - mean(testObs))^2)
testR2
        # NOTE: the previously recorded "Adjusted R-squared: 0.979" came from
        # fitting on mpgTest itself. For a model without an intercept,
        # summary() also measures R^2 relative to zero instead of the mean, so
        # that figure is not comparable; re-run this cell for the test-set value.

Identify and remove the non-significant variables (alpha = .05). Fit a new model with those variables removed.

In [None]:
summary(model)

# Significance level from the task statement.
alpha <- 0.05

# Read the p-values off the fitted model instead of hard-coding the result of
# a visual inspection, so the exclusion list cannot drift from the model.
# which() discards predictors whose p-value is NA or NaN.
coefTable <- summary(model)$coefficients
sigVars <- rownames(coefTable)[which(coefTable[, "Pr(>|t|)"] < alpha)]

# Every numeric predictor that is not significant at alpha is removed.
candidateVars <- NumCols[!NumCols %in% yVar]
excl_list <- candidateVars[!candidateVars %in% sigVars]
excl_list
xVars <- NumCols[!NumCols %in% c(excl_list, yVar)]
stopifnot(length(xVars) > 0)  # nothing left to fit otherwise

# Refit with update() so the reduced model uses the same data as `model`.
modelForm <- createModelFormula(yVar = yVar, xVars = xVars, includeIntercept = FALSE)
model <- update(model, modelForm)
summary(model)
        # Recorded from the original run, where cylinders, displacement,
        # horsepower, acceleration and origin were removed (leaving weight and
        # modelYear): R^2 became 0.9791, a very small increase, and no
        # insignificant attributes remained. Re-check after re-running.

Attempt to fit a model on all of the relevant independent variables (including carName). Explain why this error occurs. Fix this error.

In [None]:
# Attempt: every column except the response as a predictor, carName included.
# The model is fit on the training split; the test split stays held out.
xVars <- setdiff(names(autompg1), yVar)
modelForm <- createModelFormula(yVar = yVar, xVars = xVars, includeIntercept = FALSE)
model <- lm(modelForm, data = mpgTrain)
summary(model)
        # Why this goes wrong: carName is a character column, so lm() expands
        # it into one dummy variable per distinct name.
        # NOTE(review): the names look close to unique per row - confirm with
        # length(unique(autompg1$carName)). If so, the design matrix has
        # nearly as many columns as rows and is (near-)singular, so the
        # coefficients and standard errors cannot be estimated reliably, and
        # predict() cannot score a name that never occurs in the fitting data.

# Fix: carName identifies a row rather than describing it, so drop it.
excl_list <- c('carName')
xVars <- setdiff(names(autompg1), c(excl_list, yVar))
modelForm <- createModelFormula(yVar = yVar, xVars = xVars, includeIntercept = FALSE)
model <- lm(modelForm, data = mpgTrain)
summary(model)
        # Removing carName from xVars resolves the issue.


Determine the relationship between model year and mpg.

In [None]:
        # Model year has a strong effect on mpg (coefficient about 0.5).
        # Model years 70-71 are observed more often than the other years.
        # The scatterplot shows mpg increasing throughout the years.
plot(x = autompg1$modelYear, y = autompg1$mpg)

Build the best linear model possible (as measured by R^2 on the test data)

In [None]:
        # From the models above, weight and modelYear are both significant,
        # so we start from these two attributes and try combinations with
        # the remaining ones.

# Fit a linear model of yVar on a chosen set of predictors and return its
# summary.
#
# Arguments:
#   incl_list        character vector of predictor column names. Predictors
#                    enter the formula in the column order of `data`, not in
#                    the order given here.
#   data             data frame to fit on. Defaults to mpgTest, which is what
#                    this helper has always used.
#                    NOTE(review): an R^2 obtained by fitting on the test
#                    split is an in-sample figure; pass mpgTrain and score on
#                    mpgTest for an out-of-sample comparison.
#   includeIntercept forwarded to createModelFormula(). Defaults to FALSE,
#                    the previous fixed behaviour; summary() then reports R^2
#                    relative to zero rather than to the mean of the response.
#
# Returns the summary.lm object (auto-printed when called at top level).
# Stops if incl_list is empty or names a column that `data` does not have;
# such names used to be dropped silently, so a typo changed the model without
# any warning.
DispModelSummary <- function(incl_list, data = mpgTest, includeIntercept = FALSE) {
  if (length(incl_list) == 0) {
    stop("incl_list must name at least one predictor", call. = FALSE)
  }
  unknownCols <- setdiff(incl_list, names(data))
  if (length(unknownCols) > 0) {
    stop(
      "Unknown column(s) in incl_list: ",
      paste(unknownCols, collapse = ", "),
      call. = FALSE
    )
  }

  xVars <- names(data)[names(data) %in% incl_list]
  modelForm <- createModelFormula(
    yVar = yVar,
    xVars = xVars,
    includeIntercept = includeIntercept
  )
  model <- lm(modelForm, data = data)
  summary(model)
}

# Manual search over predictor subsets. Each call fits the listed predictors
# through DispModelSummary(), which builds a no-intercept formula and fits it
# on mpgTest; the figures in the comments are the outputs recorded for each
# candidate.
incl_list <- c('weight', 'modelYear', 'horsepower', 'acceleration', 'origin', 'cylinders', 'displacement') 
DispModelSummary(incl_list)

        # 5 of the 7 attributes are insignificant.
        # R-squared is high, but it rests on only 2 significant attributes,
        # so the model may tend to overfit.
        # Multiple R-squared:  0.981,	Adjusted R-squared:  0.979 
        # F-statistic: 514.9 on 7 and 70 DF,  p-value: < 2.2e-16

        # Model year and origin do not feel intuitively related to mpg,
        # so try other combinations with weight.

incl_list <- c('weight', 'horsepower', 'acceleration', 'cylinders', 'displacement') 
DispModelSummary(incl_list)
        # horsepower and acceleration are statistically significant,
        # but now weight is not.
        # Multiple R-squared:  0.9519,	Adjusted R-squared:  0.9486 
        # F-statistic: 285.2 on 5 and 72 DF,  p-value: < 2.2e-16

incl_list <- c('weight', 'horsepower', 'acceleration', 'cylinders') 
DispModelSummary(incl_list)
        # Now horsepower, acceleration and weight are significant.
        # Multiple R-squared:  0.9511,	Adjusted R-squared:  0.9484 
        # F-statistic: 354.6 on 4 and 73 DF,  p-value: < 2.2e-16

incl_list <- c('weight', 'horsepower', 'acceleration', 'displacement') 
DispModelSummary(incl_list)
        # horsepower and acceleration are significant. Does not improve the model.
        # Multiple R-squared:  0.9508,	Adjusted R-squared:  0.9481 
        # F-statistic: 352.4 on 4 and 73 DF,  p-value: < 2.2e-16

incl_list <- c('weight', 'horsepower', 'acceleration') 
DispModelSummary(incl_list)
        # Better model: all three attributes are significant,
        # but R-squared does not improve.
        # Multiple R-squared:  0.9506,	Adjusted R-squared:  0.9486 
        # F-statistic: 474.5 on 3 and 74 DF,  p-value: < 2.2e-16

        # Adding model year
incl_list <- c('weight', 'horsepower', 'acceleration', 'modelYear') 
DispModelSummary(incl_list)
        # Only 2 significant attributes, with a high R-squared. Does not improve the model.
        # Multiple R-squared:  0.9801,	Adjusted R-squared:  0.979 
        # F-statistic: 898.6 on 4 and 73 DF,  p-value: < 2.2e-16

        # Adding origin instead of model year
incl_list <- c('weight', 'horsepower', 'acceleration', 'origin') 
DispModelSummary(incl_list)
        # Acceptable model.
        # All 4 attributes are significant. Good R-squared.
        # Multiple R-squared:  0.9598,	Adjusted R-squared:  0.9576 
        # F-statistic: 435.4 on 4 and 73 DF,  p-value: < 2.2e-16

        # Adding both model year and origin
incl_list <- c('weight', 'horsepower', 'acceleration', 'modelYear', 'origin') 
DispModelSummary(incl_list)
        # Same as the model-year run above. Not much help; does not improve the model.
        # Multiple R-squared:  0.9803,	Adjusted R-squared:  0.9789 
        # F-statistic: 714.8 on 5 and 72 DF,  p-value: < 2.2e-16

## Best Model:

        # The best model (second to last above) uses 'weight', 'horsepower', 'acceleration', 'origin'.
        # Acceptably high adjusted R-squared of 0.9576.
        # The attribute combination looks intuitively logical for predicting the mpg of an auto.

        # mpg = 0.0585219 * horsepower + -0.0026494 * weight + 1.4747125 * acceleration + 3.2151618 * origin

        # Multiple R-squared:  0.9598,	Adjusted R-squared:  0.9576 
        # F-statistic: 435.4 on 4 and 73 DF,  p-value: < 2.2e-16