In [1]:
# This is a notebook with codes/scripts used during the classes of
# data analysis with R
# Rodolfo Souza - rodolfomssouza@gmail.com

In [2]:
# Introduction to R

In [3]:
# Elements in R -----------------------------------------------------------
# Vector
x1 = c(1, 2, 3, 4, 5, 6, 7, 8, 9)
x1

In [4]:
# Sequence
x2 = seq(1, 4.5, 0.5) # same of x2 = seq(from = 1, to = 4.5, by = 0.5)
x2

In [5]:
# Repetition
x3 = rep('a', 5)
x3

In [6]:
x4 = rep(x1, 3)
x4

In [7]:
x5 = rep(x1, each = 3)
x5

In [8]:
# Matrix
m1 = matrix(ncol = 5, nrow = 4)
m1

0,1,2,3,4
,,,,
,,,,
,,,,
,,,,


In [9]:
m2 = matrix(ncol = 5, nrow = 4, data = seq(1, 5, length=20))
m2

0,1,2,3,4
1.0,1.842105,2.684211,3.526316,4.368421
1.210526,2.052632,2.894737,3.736842,4.578947
1.421053,2.263158,3.105263,3.947368,4.789474
1.631579,2.473684,3.315789,4.157895,5.0


In [10]:
m3 = matrix(ncol = 5, nrow = 4, byrow = T, data = seq(1, 5, length=20))
m3

0,1,2,3,4
1.0,1.210526,1.421053,1.631579,1.842105
2.052632,2.263158,2.473684,2.684211,2.894737
3.105263,3.315789,3.526316,3.736842,3.947368
4.157895,4.368421,4.578947,4.789474,5.0


In [11]:
# Work directory ----------------------------------------------------------
# Show work directory
getwd()

In [12]:
# Change work directory
setwd('~/')                         # Returns to the root directory
setwd('Programming/R/Notebooks')    # Enter in a new work direcory

In [13]:
# Packages ----------------------------------------------------------------
# Install packages
install.packages('fBascis', dep = T)

Installing package into ‘/home/rodolfo/R/x86_64-redhat-linux-gnu-library/3.5’
(as ‘lib’ is unspecified)
“package ‘fBascis’ is not available (for R version 3.5.3)”

In [14]:
# Load packages
library('fBasics')          # library and require have the same result!
require('hydroGOF')

Loading required package: timeDate
Loading required package: timeSeries
Loading required package: hydroGOF
Loading required package: zoo

Attaching package: ‘zoo’

The following object is masked from ‘package:timeSeries’:

    time<-

The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric



In [15]:
# Data in R ---------------------------------------------------------------
# Create random data following a normal distribution
xr1 = rnorm(n = 30, mean = 10, sd = 2)
xr1

In [16]:
# Read a table
df1 = read.table('GEOR_2016_120_Data.txt', h = T)

In [17]:
head(df1[1:9], n = 5)    # Show the first five elements of the colunms 1 to 9

X,Y,ThetaS,ThetaAvb,PhiMicro,PhiMeso,PhiMacro,Ds,S0
436550,9106770,0.33,0.106,62.65,11.56,25.79,1.51,0.3288
436629,9106770,0.35,0.117,56.85,13.35,29.8,1.57,0.4254
436709,9106752,0.48,0.141,80.11,8.22,11.67,1.39,0.2916
436790,9106760,0.5,0.124,79.04,7.17,13.79,1.21,0.5414
436861,9106760,0.49,0.11,79.09,6.48,14.43,1.35,0.2831


In [18]:
# Access variable from dataframe
df1$ThetaS    # Use the "$" symbol to access a column by its name

In [19]:
df1[3]    # A different way to access the same column

ThetaS
0.33
0.35
0.48
0.5
0.49
0.36
0.42
0.43
0.46
0.51


In [20]:
# Names of all columns of the dataframe
names(df1)

In [21]:
# Number of columns of the dataframe
length(df1)

In [22]:
# Attach data from dataframe to R
attach(df1)

In [23]:
ThetaS    # Now "ThetaS" exists in R

In [24]:
# Pivot table -------------------------------------------------------------

In [1]:
# Load data at hour scale
df2 = read.table('Data_met_ST_hourly.txt', h = T)

In [3]:
head(df2, n = 5)

Day,Month,Year,Hour,temp_inst,temp_max,temp_min,rH_inst,rH_max,rH_min,dew_point_inst,dew_point_max,dew_point_min,pressure,pressure_max,pressure_min,wind_direction,windspeed,radiation,rain
3,7,2017,0,21.7,22.4,21.7,73,73,68,16.7,16.7,16.3,962.7,962.7,962.0,170,3.1,-3.04,0
3,7,2017,1,21.5,21.8,21.5,76,76,73,17.2,17.2,16.6,962.6,962.7,962.6,162,2.0,-2.18,0
3,7,2017,2,20.5,21.6,20.5,84,84,76,17.7,17.7,17.1,963.0,963.0,962.6,158,2.4,-2.62,0
3,7,2017,3,20.9,21.0,20.5,76,84,76,16.6,17.6,16.6,963.0,963.0,962.9,165,3.7,-2.2,0
3,7,2017,4,20.6,20.9,20.6,78,78,76,16.6,16.7,16.5,962.7,963.0,962.7,166,3.4,-1.97,0


In [5]:
tair_mnt = tapply(df2$temp_inst, list(Month=df2$Month, Year = df2$Year), mean, na.rm = T)

In [6]:
tair_mnt

2017,2018
,27.3582
,26.2689
,25.61492
,24.40847
,24.58642
,24.86169
21.30891,
23.7629,
24.18986,
26.74503,


In [9]:
# Filtering data ----------------------------------------------------------
id1 = which(df2$Day==1 & df2$Year==2017) # Take the first days of all months in 2017
id1                                      # The positions of first days

In [11]:
mean(df2$temp_inst[id1])    # The mean of the temperature of all first days

In [12]:
# Summarize data
summary(df2)

      Day            Month             Year           Hour      
 Min.   : 1.00   Min.   : 1.000   Min.   :2017   Min.   : 0.00  
 1st Qu.: 8.00   1st Qu.: 3.000   1st Qu.:2017   1st Qu.: 5.00  
 Median :15.00   Median : 7.000   Median :2017   Median :11.00  
 Mean   :15.52   Mean   : 6.553   Mean   :2017   Mean   :11.49  
 3rd Qu.:23.00   3rd Qu.:10.000   3rd Qu.:2018   3rd Qu.:17.00  
 Max.   :31.00   Max.   :12.000   Max.   :2018   Max.   :23.00  
   temp_inst        temp_max        temp_min        rH_inst     
 Min.   :15.80   Min.   :16.20   Min.   :14.80   Min.   :10.00  
 1st Qu.:22.30   1st Qu.:22.80   1st Qu.:21.90   1st Qu.:40.00  
 Median :24.90   Median :25.70   Median :24.20   Median :59.00  
 Mean   :25.57   Mean   :26.26   Mean   :24.92   Mean   :56.26  
 3rd Qu.:28.70   3rd Qu.:29.60   3rd Qu.:27.80   3rd Qu.:73.00  
 Max.   :37.00   Max.   :37.90   Max.   :36.30   Max.   :91.00  
     rH_max          rH_min      dew_point_inst  dew_point_max  
 Min.   :11.00   Min.   :