In [None]:
## Load packages
library(ggplot2) # powerful visualization library
library(dplyr) # powerful data wrangling library

In [None]:
## Basic datatypes in R
# List: an ordered collection whose elements may have different types
alist = list(1, 'Name', c('a','b', 'c')) # the third element of the list is an array
# Array (atomic vector): built with c(); here all four elements are numeric
anarray = c(1, 2, 3, 4) # an array with 4 elements
# Matrix: the 4 values of anarray arranged as 2 rows x 2 columns
# (matrix() fills column by column unless byrow = TRUE is given)
amatrix = matrix(anarray, nrow = 2, ncol = 2)

In [None]:
# Display the list slot by slot
print(alist)

In [None]:
# A list whose third element is itself a list (instead of a character vector)
anewlist = list(1, 'Name', list('a','b', 'c'))
print(anewlist)
# str() prints the structure itself and returns NULL invisibly, so it is
# called directly; wrapping it in print() would add a stray "NULL" line.
str(anewlist)

In [None]:
# Modify slot of a list: single-bracket assignment to slot 1
anewlist = list(1, 'Name', c('a','b', 'c'))
anewlist[1] = 10
print(anewlist)

In [None]:
# Modify slot of a list: double-bracket assignment to slot 1
# (compare the printed list with the single-bracket version in the previous cell)
anewlist = list(1, 'Name', c('a','b', 'c'))
anewlist[[1]] = 10
print(anewlist)

In [None]:
# Modify slot of a list: single-bracket assignment to slot 3,
# which holds the character vector c('a','b','c') before the assignment
anewlist = list(1, 'Name', c('a','b', 'c'))
anewlist[3] = 10
print(anewlist)

In [None]:
# Modify slot of a list: double-bracket assignment to slot 3
# (compare the printed list with the single-bracket version in the previous cell)
anewlist = list(1, 'Name', c('a','b', 'c'))
anewlist[[3]] = 10
print(anewlist)

In [None]:
# Display the numeric vector
print(anarray)

In [None]:
# Compact summary of the vector: type, length and values
str(anarray)

In [None]:
# First element of the vector (indexing starts from 1 in R)
anarray[1]

In [None]:
# Display the 2 x 2 matrix built from anarray
print(amatrix)

In [None]:
## Accessing elements of a 1D-array
anarray[1]

## Accessing elements of a 2D-array
amatrix[1, 1] # [row, column]

In [None]:
# Length and dimension
length(alist) # number of slots in the list
length(anarray) # number of elements in the vector
dim(amatrix) # number of rows and columns of the matrix

In [None]:
# Structure of an object
str(alist)
str(anarray)
str(amatrix)

In [None]:
# Accessing elements of a list (indexing starts from 1 in R)
# [ ] returns a list containing the selected slot; [[ ]] returns the slot's content itself
alist[2]
alist[[2]]
str(alist[2]) # a list of length 1
str(alist[[2]]) # the character value stored in slot 2

In [None]:
# Accessing elements of a list (indexing starts from 1 in R)
alist[3] # 3rd element of the list, still wrapped in a list
alist[[3]] # 3rd element of the list, as the character vector itself

In [None]:
str(alist[3]) # structure of 3rd element of the list accessed using []

In [None]:
str(alist[[3]]) # structure of 3rd element of the list accessed using [[]]

In [None]:
# alist[3] is a one-slot list, so [1] selects that same slot again (not the letter 'a')
alist[3][1]

In [None]:
# alist[[3]] is the character vector, so [1] selects its first element
alist[[3]][1]

In [None]:
# A slot is a named, typed attribute/property of an S4 object.
# price, numberDoors, typeEngine and mileage are slots of the class "Car".
# The slots are declared with `slots =`; the setClass() documentation lists the
# older `representation =` argument as deprecated, and the resulting class
# definition is the same.
setClass('Car', slots = c(
   price = 'numeric',
   numberDoors = 'numeric',
   typeEngine = 'character',
   mileage = 'numeric'
))
# Create one Car object, supplying a value for every slot
aCar = new('Car' , price = 20000, numberDoors = 4, typeEngine = 'V6', mileage = 143)
# Auto-print the object (shows the class name and each slot)
aCar

In [None]:
# Structure of the S4 object: its class and the value held in each slot
str(aCar)

In [None]:
# Slots can be accessed using the @ symbol
aCar@price 

In [None]:
# Slots can also be accessed using the in-built 'slot' function
slot(aCar,'price')

In [None]:
# Get the names of the slots
slotNames(aCar)

In [None]:
# So how are classes implemented in R?
# Opens the help page on class definitions (interactive aid; produces no data output)
?Classes_Details

In [None]:
# Create a dataframe
# Column vectors, one entry per row of the data frame
Name = c('A', 'B', 'C')
Age = c(21, 22, 20)
Height = c(165, 170, 164)
# Assemble the columns and label the rows in a single call
sData = data.frame(Name, Age, Height,
                   row.names = c('Ajith', 'Vishnu', 'John'))

In [None]:
# First rows of the data frame (sData only has 3 rows, so all are shown)
head(sData)

In [None]:
# Column types and a preview of each column
str(sData)

In [None]:
# Accessing elements of a data frame using $
sData$Age

In [None]:
# Accessing elements of a data frame using [[]]
sData[['Age']]

In [None]:
# Accessing elements of a data frame using []
# (single brackets keep the result as a one-column data frame)
sData['Age']

In [None]:
## Extract the age of Ajith
sData['Age']['Ajith', 'Age'] # one-column data frame, then [row name, column name]
sData['Age']['Ajith', 1] # same, with the column selected by position
sData['Age'][1, 1] # same, with row and column selected by position
sData$Age[1] # column as a vector, then its first element
sData[['Age']][1] # same, using [[]] to get the vector

In [None]:
# Compare what each accessor returns: $ and [[]] give the column vector, [] gives a data frame
str(sData$Age)
str(sData[['Age']])
str(sData['Age'])

In [None]:
mean(sData$Age)
mean(sData[['Age']])
mean(sData['Age']) # sData['Age'] is a data frame rather than a numeric vector, so mean() cannot average it (expect NA with a warning)

In [None]:
# Accessing multiple columns - what NOT to do:
# with two arguments, [ ] means [row, column], so this looks for a row named 'Age'
# (sData has no such row) in column 'Height' instead of selecting both columns
sData['Age', 'Height']

In [None]:
# Accessing multiple columns: pass the column names as one character vector
sData[c('Age', 'Height')]

In [None]:
# Accessing multiple columns with [[ ]] does not work: [[ ]] extracts a single
# element, and a length-2 index is treated as recursive indexing
# (i.e. sData[['Age']][['Height']]), which raises a "subscript out of bounds" error.
# try() displays that error without aborting a "Run All" of the notebook.
try(sData[[c('Age', 'Height')]])

In [None]:
# Convert the Age and Height columns into a matrix (row names are carried over)
S = as.matrix(sData[c('Age', 'Height')])
print(S)
str(S)

In [None]:
str(as.integer(sData[['Age']])) # not in place: as.integer() returns a new vector and nothing is assigned back to sData

In [None]:
# First column of the matrix S and its mean
S[, 1]
mean(S[, 1])

In [None]:
# Read the food-texture CSV from openmv.net;
# header = TRUE takes column names from the first line, row.names = 1 uses the first column as row names
file = 'http://openmv.net/file/food-texture.csv'
foodData = read.csv(file, header = TRUE, row.names = 1)

In [None]:
# First six rows of the data
head(foodData)

In [None]:
# Dimension of food data frame
dim(foodData)

In [None]:
# Structure of food data frame
str(foodData)

In [None]:
# Make a scatter plot between Density (x-axis) and Oil (y-axis)
p = ggplot(data = foodData, aes(x = Density, y = Oil)) + geom_point(size = 1)
p

In [None]:
# Structure of the Crispy column
str(foodData$Crispy)

In [None]:
# Distinct values that occur in the Crispy column
unique(foodData$Crispy)

In [None]:
# The factor() function
factor(foodData$Crispy) # create levels out of categorical data

In [None]:
# Make a scatter plot between Density (x-axis) and Oil (y-axis),
# colouring each point by its Crispy value treated as a category
p1 = ggplot(data = foodData, aes(x = Density, y = Oil, color = factor(Crispy))) + geom_point(size = 1)
p1

In [None]:
# Pearson correlation between Density and Oil
cor(foodData$Density, foodData$Oil, method = 'pearson')

In [None]:
# Make a scatter plot between Hardness (x-axis) and Oil (y-axis),
# colouring each point by its Crispy value treated as a category
p2 = ggplot(data = foodData, aes(x = Hardness, y = Oil, color = factor(Crispy))) + geom_point(size = 2)
p2

In [None]:
# Pearson correlation between Hardness and Oil
cor(foodData$Hardness, foodData$Oil, method = 'pearson')

In [None]:
# First three rows and the dimensions of the data frame
head(foodData, 3)
dim(foodData)

In [None]:
# Names of the columns
colnames(foodData)

In [None]:
# Functions that can be applied to columns of a data frame
mean(foodData$Density)

In [None]:
# Functions that can be applied to columns of a data frame
# apply(..., 2, f) calls f on every column (MARGIN = 2)
apply(foodData, 2, mean)

In [None]:
# Functions that can be applied to columns of a data frame
# Same as above, restricted to the Oil and Density columns
apply(foodData[c('Oil', 'Density')], 2, mean)

In [None]:
# Structure of the apply() result (compare with the lapply() result below)
str(apply(foodData, 2, mean))

In [None]:
# Functions that can be applied to columns of a data frame
# lapply() calls mean on each column and returns the results as a list
lapply(foodData, mean)
str(lapply(foodData, mean))

In [None]:
# Exercise using apply() and lapply()
# Predict what each line returns before running it:
# [ ] keeps the container (vector or list), [[ ]] extracts the value itself
(apply(foodData, 2, mean))['Oil'] # select by name from the apply() result
(apply(foodData, 2, mean))[1] # select by position (presumably Oil is the first column - verify with colnames above)
(lapply(foodData, mean))[1] # one-slot list
(lapply(foodData, mean))[[1]] # content of the first slot
(lapply(foodData, mean))['Oil'] # one-slot list, selected by name
(lapply(foodData, mean))[['Oil']] # content of the slot named 'Oil'

In [None]:
# Building a data matrix from a data frame
X = as.matrix(foodData)
dim(X)
str(X)

In [None]:
str(foodData) # compare with str(X) above

In [None]:
# Display the whole matrix (every row is printed)
X

In [None]:
# Accessing elements of matrix
X[, 1] # elements in the first column of matrix X
X[1, ] # elements in the first row of matrix X

In [None]:
# Structure of a single column and of a single row extracted from X
str(X[, 1])
str(X[1, ])

In [None]:
## Convert the 'Crispy' column into a factor (categorical) column
# NOTE: this attempt does not achieve the conversion. The result is only displayed,
# not assigned back, so foodData is unchanged; and per the apply() documentation
# factor results are coerced to a character array rather than kept as a factor.
apply(foodData['Crispy'], 2, as.factor)

In [None]:
## Convert the 'Crispy' column into a factor (categorical) column
#foodData$Crispy = as.factor(foodData$Crispy)

In [None]:
#str(foodData)

In [None]:
## Re-assign 'Crispy' column to numeric
#foodData$Crispy = as.integer(foodData$Crispy)

In [None]:
# Make a histogram of crispy values (categorical)
# NOTE(review): geom_histogram() bins a numeric axis; no bins/binwidth is given here,
# so ggplot2's default binning is used - confirm this is the intended view of Crispy
p3 = ggplot(data = foodData, aes(x = Crispy)) + geom_histogram(color = 'black', fill = 'blue')
p3

In [None]:
# Make a histogram of oil values (continuous)
# (p3 is reassigned here, replacing the Crispy histogram object from the previous cell)
p3 = ggplot(data = foodData, aes(x = Oil)) + geom_histogram(color = 'black', fill = 'blue')
p3

In [None]:
# We are going to modify data frame
# Only the last statement in this cell is active; the others are left commented
# out as examples to uncomment one at a time. None of them assigns its result,
# except the rename example, which would overwrite foodData.
# Rename Oil column to OilPercentage
#foodData = foodData %>% rename(OilPercentage = Oil)

# Get sample values for which Crispy > 13 (filtering samples)
#foodData %>% filter(Crispy > 13) 

# Get sample values for which Crispy > 13 and Oil < 18 (filtering samples)
#foodData %>% filter(Crispy > 13 & Oil < 18) 

# Get all samples values for only Density and Crispy columns (selecting columns)
#foodData %>% select(c('Density', 'Crispy'))

# Get all samples values excluding Density and Crispy columns (selecting columns)
foodData %>% select(-c('Density', 'Crispy'))

In [None]:
# Exercise: keep the samples whose Crispy value lies in the closed range [10, 13]
# and show only their Oil, Crispy, and Hardness columns.
# between() is inclusive on both ends, i.e. 10 <= Crispy <= 13.
foodData %>%
  filter(between(Crispy, 10, 13)) %>%
  select(Oil, Crispy, Hardness)

In [None]:
# Select all features except Hardness
#foodData %>% select(-Hardness)

# Select all features except Fracture and Hardness
# (a minus sign in front of c(...) drops the listed columns)
foodData %>% select(-c(Hardness, Fracture))

In [None]:
# Filter samples with Crispy 7, 8, 9, 14 and 15
# (%in% is TRUE for rows whose Crispy value appears in the given set)
foodData %>% filter(Crispy %in% c(7, 8, 9, 14, 15))

In [None]:
# Remove samples with Crispy 7, 8, 9, 14 and 15
# (negating the %in% test keeps every other row)
foodData %>% filter(!(Crispy %in% c(7, 8, 9, 14, 15)))

In [None]:
# Label every sample by its Crispy value:
# 'low' (Crispy <= 10), 'medium' (10 < Crispy <= 13) or 'high' (Crispy > 13).
# if/else accepts only a single TRUE/FALSE condition, but foodData$Crispy is a
# whole column, so the vectorised ifelse() is used to label all samples at once.
# (At top level, an `else` placed on a new line after `}` is also a parse error in R.)
print(ifelse(foodData$Crispy <= 10, 'low',
             ifelse(foodData$Crispy <= 13, 'medium', 'high')))

In [None]:
# We want to modify Crispy to 'low' (7 <= Crispy <= 11) or 'high' (12 <= Crispy <= 15)
# Any value outside 7..11 is labelled 'high'; the result is only displayed,
# foodData itself is not reassigned here.
foodData %>% mutate(Crispy = ifelse(Crispy >= 7 & Crispy <= 11, 'low', 'high'))

In [None]:
 # Simulate 10 tosses of a fair coin: draw from {'H', 'T'} with replacement, each with probability 0.5
 # (no seed is set, so the outcome differs from run to run)
 simulatedData = sample(c('H', 'T'), size = 10, replace = TRUE, prob = c(0.5, 0.5))
 #print(simulatedData)
 # Fraction of tosses that came up 'T' (mean of a logical vector = proportion of TRUE)
 print(mean(simulatedData == 'T'))

In [None]:
# The raw tosses, the count of each outcome, and the same counts as a data frame
simulatedData
table(simulatedData)
as.data.frame(table(simulatedData))

In [None]:
# The integers 1 to 6
c(1:6)

In [None]:
 # Simulate one million draws from 1..6 with replacement, each value with probability 1/6
 simulatedData = sample(c(1:6), size = 1e06, replace = TRUE, prob = c(1/6, 1/6, 1/6, 1/6, 1/6, 1/6))
 #print(simulatedData)
 # Fraction of draws equal to 1
 print(mean(simulatedData == 1))

In [None]:
# Sampling- (as well as sample-) space for a single fair-coin toss
S = c('H', 'T')

# Corresponding probabilities
p = c(0.5, 0.5)

# Samples representing outcomes of the experiment of selecting 1 object from
# the sampling space with replacement - this corresponds to generating outcomes
# of the random experiment of tossing a single fair coin
#set.seed(1)
nsamples = 1e04
simulatedData = sample(S, size = nsamples, replace = TRUE, prob = p)

# Create dataframe comprising frequency of each possible outcome in the sample space
df= as.data.frame(table(simulatedData))
colnames(df) = c('Toss', 'Frequency')
print(df)

# Frequency plot: one bar per outcome, with the count written above each bar
# NOTE: from here on p holds the plot object, no longer the probability vector
p = ggplot(data = df, aes(x = Toss, y = Frequency)) +
  geom_bar(stat = 'identity', fill = "steelblue") +
  geom_text(aes(label = Frequency), vjust = -0.3, size = 5) +
  theme_minimal()

p

In [None]:
# Sampling- (as well as sample-) space for rolling a single fair die
S = c(1:6)

# Corresponding probabilities
p = (1/6)*c(1, 1, 1, 1, 1, 1)

# Repeat 10 times: draw 2 values from S with replacement (two die rolls).
# The result is indexed below as a matrix with one row per roll and one column per repetition.
simulatedData = replicate(10, sample(S, size = 2, replace = TRUE, prob = p))
print(simulatedData)

# Sum of the two rolls in each repetition
simulatedData[1, ] + simulatedData[2, ] 

# TRUE where the two rolls sum to 7
(simulatedData[1, ] + simulatedData[2, ]) == 7

# Fraction of repetitions in which the two rolls sum to 7
mean((simulatedData[1, ] + simulatedData[2, ]) == 7)

In [None]:
# Sampling- (as well as sample-) space for rolling a single fair die
S = c(1:6)

# Corresponding probabilities
p = (1/6)*c(1, 1, 1, 1, 1, 1)

# Samples representing outcomes of the experiment of selecting 2 objects from
# the sampling space with replacement - this corresponds to generating outcomes
# of the random experiment of rolling two fair dice
nsamples = 1e04
simulatedData = replicate(nsamples, sample(S, size = 2, replace = TRUE, prob = p))
#print(simulatedData)

# Fraction of outcomes where the sum of the rolls is equal to 7
mean(simulatedData[1, ] + simulatedData[2, ] == 7)

# Fraction of outcomes where the sum of the rolls is at least 7
mean(simulatedData[1, ] + simulatedData[2, ] >= 7)

In [None]:
# Sampling space for rolling a single fair die
S = c(1:6)

# Corresponding probabilities
p = (1/6)*c(1, 1, 1, 1, 1, 1)

# Samples representing outcomes of the experiment of selecting 6 objects from
# the sampling space with replacement - this corresponds to generating outcomes
# of the random experiment of rolling a fair die 6 times
# (the experiment is repeated samplesize times; no seed is set, so results vary between runs)
samplesize = 1e06
simulatedData = replicate(samplesize, sample(S, size = 6, replace = TRUE, prob = p))

# checkEvent: report whether every distinct value in `data` occurs exactly twice.
#   data: vector of outcomes from one experiment
#   returns: 1 if each distinct value appears exactly two times, otherwise 0
checkEvent = function(data){
    # Number of occurrences of each distinct value, in order of first appearance
    occurrences = vapply(unique(data), function(v) sum(v == data), integer(1))
    for (count in occurrences){
        # The first value that does not occur exactly twice rules the event out
        if (count != 2) return(0)
    }
    1
}

# Apply checkEvent to every column of simulatedData (MARGIN = 2), count the
# columns for which it returned 1, and divide by the number of experiments
approximate_probability = sum(apply(simulatedData, 2, checkEvent)) / samplesize
cat("Approximate probability of 3 numbers appearing twice each is ", approximate_probability, '\n')