# Introduction to DataFrames

In [1]:
# What's a data frame?
# You may remember from the chapter about matrices that all the elements that you put in a matrix should be of the 
# same type. Back then, your data set on Star Wars only contained numeric elements.

# When doing a market research survey, however, you often have questions such as:

# 'Are you married?' or other 'yes/no' questions (logical)
# 'How old are you?' (numeric)
# 'What is your opinion on this product?' or other 'open-ended' questions (character)
# ...
# The output, namely the respondents' answers to the questions formulated above, is a data set of different data types. 
# You will often find yourself working with data sets that contain different data types instead of only one.

# A data frame has the variables of a data set as columns and the observations as rows. This will be a familiar
# concept for those coming from other statistical software packages such as SAS or SPSS.

In [2]:
# Print the mtcars data frame. str(mtcars) further down reports 32 observations,
# so the table recorded below shows only the first rows of the full output.
mtcars

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1
Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


In [3]:
# Preview mtcars: show only its first six rows
head(mtcars, n = 6L)

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2
Valiant,18.1,6,225,105,2.76,3.46,20.22,1,0,3,1


In [4]:
# Inspect the structure of the data frame: str() reports the number of
# observations and variables, then each column's type and first values
str(mtcars)

'data.frame':	32 obs. of  11 variables:
 $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
 $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
 $ disp: num  160 160 108 258 360 ...
 $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
 $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
 $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
 $ qsec: num  16.5 17 18.6 19.4 17 ...
 $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
 $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
 $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
 $ carb: num  4 4 1 1 2 1 4 2 2 4 ...


In [6]:
# Column vectors describing the eight planets, one element per planet
name <- c(
  "Mercury", "Venus", "Earth", "Mars",
  "Jupiter", "Saturn", "Uranus", "Neptune"
)
# The first four planets are terrestrial, the last four are gas giants
type <- rep(c("Terrestrial planet", "Gas giant"), each = 4)
diameter <- c(0.382, 0.949, 1, 0.532, 11.209, 9.449, 4.007, 3.883)
rotation <- c(58.64, -243.02, 1, 1.03, 0.41, 0.43, -0.72, 0.67)
# Only the last four planets have rings
rings <- rep(c(FALSE, TRUE), each = 4)

# Combine the vectors into a data frame; each vector becomes a column
# carrying the vector's own name
planets_df <- data.frame(
  name = name,
  type = type,
  diameter = diameter,
  rotation = rotation,
  rings = rings
)

planets_df

name,type,diameter,rotation,rings
Mercury,Terrestrial planet,0.382,58.64,False
Venus,Terrestrial planet,0.949,-243.02,False
Earth,Terrestrial planet,1.0,1.0,False
Mars,Terrestrial planet,0.532,1.03,False
Jupiter,Gas giant,11.209,0.41,True
Saturn,Gas giant,9.449,0.43,True
Uranus,Gas giant,4.007,-0.72,True
Neptune,Gas giant,3.883,0.67,True


In [7]:
# Check the structure of planets_df
# NOTE: the output recorded below lists name and type as factors, i.e. it comes from
# a session where data.frame() converted strings to factors (the default before R 4.0);
# newer R versions keep them as character columns unless stringsAsFactors = TRUE is set.
str(planets_df)

'data.frame':	8 obs. of  5 variables:
 $ name    : Factor w/ 8 levels "Earth","Jupiter",..: 4 8 1 3 2 6 7 5
 $ type    : Factor w/ 2 levels "Gas giant","Terrestrial planet": 2 2 2 2 1 1 1 1
 $ diameter: num  0.382 0.949 1 0.532 11.209 ...
 $ rotation: num  58.64 -243.02 1 1.03 0.41 ...
 $ rings   : logi  FALSE FALSE FALSE FALSE TRUE TRUE ...


In [8]:
# Selection of data frame elements
# Similar to vectors and matrices, you select elements from a data frame with the help of square brackets [ ].
# By using a comma, you can indicate what to select from the rows and the columns respectively. For example:

# my_df[1,2] selects the value at the first row and second column in my_df.
# my_df[1:3,2:4] selects rows 1, 2, 3 and columns 2, 3, 4 in my_df.
# Sometimes you want to select all elements of a row or column. For example, my_df[1, ] 
# selects all elements of the first row. Let us now apply this technique on planets_df!

In [9]:
# Diameter of Mercury: the value in row 1, column 3
planets_df[1, 3]

# All data for Mars: the whole fourth row (column index left empty)
planets_df[4, ]

Unnamed: 0,name,type,diameter,rotation,rings
4,Mars,Terrestrial planet,0.532,1.03,False


In [10]:
# First five values of the diameter column, selected by column name
planets_df[1:5, "diameter"]

In [11]:
# Only planets with rings
# You will often want to select an entire column, namely one specific variable from a data frame.
# If you want to select all elements of the variable diameter, for example, both of these will do the trick:

# planets_df[,3]
# planets_df[,"diameter"]
# However, there is a short-cut. If your columns have names, you can use the $ sign:

# planets_df$diameter

In [12]:
# Display planets_df, the data frame created in an earlier cell
planets_df

# Extract the rings column with the $ shortcut
rings_vector <- planets_df$rings

# Show the extracted logical vector
rings_vector

name,type,diameter,rotation,rings
Mercury,Terrestrial planet,0.382,58.64,False
Venus,Terrestrial planet,0.949,-243.02,False
Earth,Terrestrial planet,1.0,1.0,False
Mars,Terrestrial planet,0.532,1.03,False
Jupiter,Gas giant,11.209,0.41,True
Saturn,Gas giant,9.449,0.43,True
Uranus,Gas giant,4.007,-0.72,True
Neptune,Gas giant,3.883,0.67,True


In [14]:
# Adapt the code to select all columns for planets with rings
# Starting point: the logical rings_vector picks the rows, but only the name column is kept
planets_df[rings_vector, "name"]
# Adapted version: leaving the column index empty keeps every column
planets_df[rings_vector, ]

Unnamed: 0,name,type,diameter,rotation,rings
5,Jupiter,Gas giant,11.209,0.41,True
6,Saturn,Gas giant,9.449,0.43,True
7,Uranus,Gas giant,4.007,-0.72,True
8,Neptune,Gas giant,3.883,0.67,True


In [15]:
# Keep only the rows whose diameter is below 1
subset(planets_df, subset = diameter < 1)

Unnamed: 0,name,type,diameter,rotation,rings
1,Mercury,Terrestrial planet,0.382,58.64,False
2,Venus,Terrestrial planet,0.949,-243.02,False
4,Mars,Terrestrial planet,0.532,1.03,False


In [20]:
# order() is called here without any vector to rank; the recorded output is NULL.
# The next cell passes it the diameter column to obtain sorting positions.
order()

NULL

In [21]:
# order() gives the row positions that arrange diameter from smallest to largest
positions <- order(planets_df$diameter)

# Index the rows by those positions to get planets_df sorted by diameter
planets_df[positions, ]


Unnamed: 0,name,type,diameter,rotation,rings
1,Mercury,Terrestrial planet,0.382,58.64,False
4,Mars,Terrestrial planet,0.532,1.03,False
2,Venus,Terrestrial planet,0.949,-243.02,False
3,Earth,Terrestrial planet,1.0,1.0,False
8,Neptune,Gas giant,3.883,0.67,True
7,Uranus,Gas giant,4.007,-0.72,True
6,Saturn,Gas giant,9.449,0.43,True
5,Jupiter,Gas giant,11.209,0.41,True
