# Data Frame

Data frame é uma estrutura de dados bidimensional em R. É um caso especial de lista cujos componentes têm todos o mesmo tamanho. Cada componente forma uma coluna.

In [3]:
# Build a two-column data frame (ids 1..3 and their ages), then display it.
x <- data.frame(id = 1:3, idade = c(32, 21, 30))
x

id,idade
1,32
2,21
3,30


In [5]:
data.frame("id"=1:3, "idade"= c(32, 23, 30))

id,idade
1,32
2,23
3,30


In [4]:
typeof(x)

In [5]:
class(x)

## Funções do Data Frame

In [8]:
names(x)

In [14]:
ncol(x)

In [15]:
nrow(x)

In [13]:
length(x)

## Criando um Data Frame

In [6]:
# Build a data frame that includes a text column and inspect its structure.
# stringsAsFactors = TRUE is spelled out because the default became FALSE in
# R 4.0.0; with the explicit flag `nome` is stored as a factor on every R
# version, which is the behaviour this example is meant to show.
x <- data.frame(
  id = 1:2,
  idade = c(21, 15),
  nome = c("José Lima", "Dória Silva"),
  stringsAsFactors = TRUE
)
str(x)

'data.frame':	2 obs. of  3 variables:
 $ id   : int  1 2
 $ idade: num  21 15
 $ nome : Factor w/ 2 levels "Dória Silva",..: 2 1


In [7]:
# Same data frame, but the text column is kept as plain character strings
# instead of being converted to a factor.
x <- data.frame(
  id = 1:2,
  idade = c(21, 15),
  nome = c("José Lima", "Dória Silva"),
  stringsAsFactors = FALSE
)
str(x)

'data.frame':	2 obs. of  3 variables:
 $ id   : int  1 2
 $ idade: num  21 15
 $ nome : chr  "José Lima" "Dória Silva"


## Lidando com valores nulos

In [8]:
# Sample data with one missing age (NA), used to show how to fill the gap
# (the counterpart of pandas' fillna).
x <- data.frame(
  id = 1:3,
  idade = c(21, NA, 22),
  nome = c("José Lima", "Dória Silva", "Raphael Campos")
)
str(x)

'data.frame':	3 obs. of  3 variables:
 $ id   : int  1 2 3
 $ idade: num  21 NA 22
 $ nome : Factor w/ 3 levels "Dória Silva",..: 2 1 3


In [14]:
# Naive mean of the age column. Nothing here drops missing values, so a single
# NA in the column makes sum() - and therefore the whole expression - NA.
sum(x$idade) / nrow(x)

In [16]:
# Logical mask: TRUE for the rows whose age is present.
!is.na(x$idade)
# Mean age computed over the non-missing values only.
mean(x$idade[!is.na(x$idade)])

In [31]:
# Mean age ignoring missing values; na.rm = TRUE drops the NAs before averaging.
mean(x$idade, na.rm = TRUE)

In [17]:
# Fill every missing age with the mean of the ages that are present.
x$idade[is.na(x$idade)] <- mean(x$idade, na.rm = TRUE)

In [18]:
x

id,idade,nome
1,21.0,José Lima
2,21.5,Dória Silva
3,22.0,Raphael Campos


### Lendo Data Frame de um Arquivo CSV

In [19]:
?read.csv

In [56]:
# Load the iris data set. header = FALSE treats the first line as data, so
# read.csv names the columns V1..V5. stringsAsFactors = TRUE is explicit
# because the default became FALSE in R 4.0.0; as a factor, the species column
# is reported by summary() as per-level counts on every R version.
df <- read.csv(
  "../dados/iris-dataset.csv",
  header = FALSE,
  stringsAsFactors = TRUE
)

In [22]:
head(df, n=10)

V1,V2,V3,V4,V5
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
5.0,3.6,1.4,0.2,Iris-setosa
5.4,3.9,1.7,0.4,Iris-setosa
4.6,3.4,1.4,0.3,Iris-setosa
5.0,3.4,1.5,0.2,Iris-setosa
4.4,2.9,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa


In [23]:
tail(df, n=10)

Unnamed: 0,V1,V2,V3,V4,V5
141,6.7,3.1,5.6,2.4,Iris-virginica
142,6.9,3.1,5.1,2.3,Iris-virginica
143,5.8,2.7,5.1,1.9,Iris-virginica
144,6.8,3.2,5.9,2.3,Iris-virginica
145,6.7,3.3,5.7,2.5,Iris-virginica
146,6.7,3.0,5.2,2.3,Iris-virginica
147,6.3,2.5,5.0,1.9,Iris-virginica
148,6.5,3.0,5.2,2.0,Iris-virginica
149,6.2,3.4,5.4,2.3,Iris-virginica
150,5.9,3.0,5.1,1.8,Iris-virginica


In [24]:
# Per-column descriptive statistics (the pandas counterpart is df.describe()).
summary(df)

       V1              V2              V3              V4       
 Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
 1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
 Median :5.800   Median :3.000   Median :4.350   Median :1.300  
 Mean   :5.843   Mean   :3.054   Mean   :3.759   Mean   :1.199  
 3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
 Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
               V5    
 Iris-setosa    :50  
 Iris-versicolor:50  
 Iris-virginica :50  
                     
                     
                     

### Alterando o nome das colunas

In [57]:
# Replace the automatic V1..V5 column names with descriptive ones, then preview.
names(df) <- c(
  "sepal_length",
  "sepal_width",
  "petal_length",
  "petal_width",
  "species"
)
head(df)

sepal_length,sepal_width,petal_length,petal_width,species
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
5.0,3.6,1.4,0.2,Iris-setosa
5.4,3.9,1.7,0.4,Iris-setosa


### Indexação

#### Acessando como uma lista

In [28]:
# Single brackets with a column name keep the data frame container: the result
# is a one-column data frame, not a bare vector.
head(df["sepal_length"])
typeof(df["sepal_length"])
class(df["sepal_length"])

sepal_length
5.1
4.9
4.7
4.6
5.0
5.4


In [35]:
# Double brackets extract the column itself rather than a one-column data frame.
typeof(df[['sepal_length']])
# First four values of the extracted column.
df[['sepal_length']][1:4]

In [36]:
df$sepal_length
typeof(df$sepal_length)
class(df$sepal_length)

In [24]:
df[["sepal_length"]]
typeof(df[["sepal_length"]])

#### Acessando como uma Matriz

In [37]:
head(df, n = 10)

sepal_length,sepal_width,petal_length,sepal_width.1,species
5.1,3.5,1.4,0.2,Iris-setosa
4.9,3.0,1.4,0.2,Iris-setosa
4.7,3.2,1.3,0.2,Iris-setosa
4.6,3.1,1.5,0.2,Iris-setosa
5.0,3.6,1.4,0.2,Iris-setosa
5.4,3.9,1.7,0.4,Iris-setosa
4.6,3.4,1.4,0.3,Iris-setosa
5.0,3.4,1.5,0.2,Iris-setosa
4.4,2.9,1.4,0.2,Iris-setosa
4.9,3.1,1.5,0.1,Iris-setosa


In [38]:
# Matrix-style indexing: rows 1..5 and the listed columns, in the order given.
# 'petal_length' is requested twice; the repeated column comes back renamed
# with a numeric suffix (petal_length.1).
df[1:5, c('species', 'petal_length', 'petal_length')]

species,petal_length,petal_length.1
Iris-setosa,1.4,1.4
Iris-setosa,1.4,1.4
Iris-setosa,1.3,1.3
Iris-setosa,1.5,1.5
Iris-setosa,1.4,1.4


In [48]:
# Keep the rows whose sepal_length is below 5.0 and select only petal_width.
# drop = FALSE keeps the result a one-column data frame instead of a vector.
df[df$sepal_length < 5.0, 'petal_width', drop=FALSE]

Unnamed: 0,petal_width
2,0.2
3,0.2
4,0.2
7,0.3
9,0.2
10,0.1
12,0.2
13,0.1
14,0.1
23,0.2


In [56]:
df[1:3, 2]

In [57]:
df[1:3, 'sepal_width']

In [61]:
df[1:3, 2, drop = FALSE]

sepal_width
3.5
3.0
3.2


In [49]:
# Show the first row, overwrite a single cell addressed by row index and
# column name, then show the row again to confirm the change.
head(df, n = 1)
df[1L, "petal_length"] <- 2
head(df, n = 1)

sepal_length,sepal_width,petal_length,petal_width,species
5.1,3.5,1.4,0.2,Iris-setosa


sepal_length,sepal_width,petal_length,petal_width,species
5.1,3.5,2,0.2,Iris-setosa


### Adicionando componentes

In [50]:
# Prepend one observation as a new row. rbind() is the row-wise bind; the list
# supplies one value per column, in column order. cbind() would instead bind
# the five values on as extra columns.
df <- rbind(list(5.06, 3.6, 1.0, 0.25, "Iris-setosa"), df)
head(df)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
2,5.06,3.6,1.0,0.25,Iris-setosa
210,5.1,3.5,2.0,0.2,Iris-setosa
3,4.9,3.0,1.4,0.2,Iris-setosa
4,4.7,3.2,1.3,0.2,Iris-setosa
5,4.6,3.1,1.5,0.2,Iris-setosa
6,5.0,3.6,1.4,0.2,Iris-setosa


In [59]:
c(4, 5, 6, 7) + 4 # c(4, 4, 4, 4)

In [60]:
c(1:50, 1:50, 1:50)

In [61]:
# Add a column that repeats 1..50 down the data frame. Assigning a bare 1:50
# relies on implicit recycling, which errors unless nrow(df) is a multiple of
# 50; rep(..., length.out = ) produces the same repeated values and works for
# any number of rows.
df$nova <- rep(1:50, length.out = nrow(df))
df

sepal_length,sepal_width,petal_length,petal_width,species,nova
5.1,3.5,1.4,0.2,Iris-setosa,1
4.9,3.0,1.4,0.2,Iris-setosa,2
4.7,3.2,1.3,0.2,Iris-setosa,3
4.6,3.1,1.5,0.2,Iris-setosa,4
5.0,3.6,1.4,0.2,Iris-setosa,5
5.4,3.9,1.7,0.4,Iris-setosa,6
4.6,3.4,1.4,0.3,Iris-setosa,7
5.0,3.4,1.5,0.2,Iris-setosa,8
4.4,2.9,1.4,0.2,Iris-setosa,9
4.9,3.1,1.5,0.1,Iris-setosa,10


#### Removendo componentes

In [67]:
# Left commented out: assigning NULL to `df` itself would discard the whole
# data frame, not a single column.
# NOTE(review): the saved output below has no `species` or `nova` column, so a
# column removal (e.g. `df$species = NULL`) presumably ran before it - confirm
# and restore the intended statement.
#df = NULL
tail(df)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
145,6.7,3.3,5.7,2.5
146,6.7,3.0,5.2,2.3
147,6.3,2.5,5.0,1.9
148,6.5,3.0,5.2,2.0
149,6.2,3.4,5.4,2.3
150,5.9,3.0,5.1,1.8
