#### Config

In [2]:
# Replace with the actual URL for the data file
download_acs <- "https://drive.google.com/uc?export=download&id=14ClBTG0m1UFO-iRYbvom_REKZWmhTmOY" 
download_img <- "https://drive.google.com/uc?export=download&id=1DGR2sAC90UqRkMQZe7LdUQ4MnT5yk_Pk"
download_xml <- "https://drive.google.com/uc?export=download&id=1B2IUgntYGeEY47gcheZe3edOtCdJDgF2"
download_csv <- "https://drive.google.com/uc?export=download&id=1i8bFt76FiFXhizBwyofX-19JBQdmBOAP"
download_edu <- "https://drive.google.com/uc?export=download&id=1xG_DNc6-NNIhsvi3H45l4BVsnQRWS-Bq"
file_name_edu <- "edu.csv"
file_name_csv <- "gdp.csv"
file_name_acs <- "idaho_housing.csv"
file_name_img <- "img.jpg"
file_name_xml <- "Q8_data.xml"

data_dir <- "data/"

if (!dir.exists(data_dir)) {
  dir.create(data_dir)
}

First we download the data we are going to work with.

In [None]:
# uploading the file with data to the host
download.file(download_acs, file.path(data_dir, file_name_acs), mode = "wb")

#### Task 1
**How many properties are worth $1,000,000 or more?** <br>
According to the code book, the variable VAL represents the property value. In this survey, properties with VAL == 24 are those valued at $1,000,000 or more.

In [None]:
housing <- read.csv(file.path(data_dir, file_name_acs))
high_value <- housing$VAL == 24
count_high_value <- sum(high_value, na.rm = TRUE)
print(count_high_value)

[1] 53


#### Task 2
Create a logical vector that identifies the households on greater than 10 acres who sold more than $10,000 worth of agriculture products. Assign that logical vector to the variable agricultureLogical. Apply the which() function to identify the rows of the data frame where the logical vector is TRUE. <br>
**What are the first 3 values that result?**

In [None]:
# Create a logical vector:
agricultureLogical <- (housing$ACR == 3) & (housing$AGS == 6)

# Use which() to get the indices where the condition is TRUE:
resultIndices <- which(agricultureLogical)

# Get the first three indices:
resultIndices[1:3]


#### Task 3
**Apply strsplit() to split all the names of the data frame on the characters "wgtp". What is the value of the 123 element of the resulting list?**

In [None]:
splitNames <- strsplit(names(housing), "wgtp")
splitNames[[123]]

#### Task 4
Using the jpeg package read in the following picture into R.
Use the parameter native=TRUE. What are the 30th and 80th quantiles of the resulting data? (some Linux systems may produce an answer 638 for the 30th quantile which is different from correct answer).

In [None]:
download.file(download_img, file.path(data_dir, file_name_img), mode = "wb")

In [None]:
if (!require("jpeg")) {
    install.packages("jpeg")
}

Loading required package: jpeg



In [None]:
library(jpeg)
img <- readJPEG(file.path(data_dir, file_name_img), native=TRUE)
quantile(img, probs = c(0.3, 0.8))


#### Task 5
Read the XML data on Baltimore restaurants from [here](https://drive.google.com/uc?export=download&id=1B2IUgntYGeEY47gcheZe3edOtCdJDgF2).
How many restaurants have zipcode 21231?

In [None]:
download.file(download_xml, file.path(data_dir, file_name_xml), mode = "wb")

In [None]:
if (!require("xml2")) {
    install.packages("xml2")
}

“installation of package ‘xml2’ had non-zero exit status”
Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done



In [17]:
library(xml2)
doc <- read_xml(file.path(data_dir, file_name_xml))

print(xml_text(xml_find_all(doc, "//name")[1:5]))

[1] "410"                   "1919"                  "SAUTE"                
[4] "#1 CHINESE KITCHEN"    "#1 chinese restaurant"


In [18]:
print(xml_text(xml_find_all(doc, "//zipcode")[1:5]))

[1] "21206" "21231" "21224" "21211" "21223"


In [20]:
restaurants_with_zip <- xml_find_all(doc, "//zipcode[text()='21231']")
num_restaurants <- length(restaurants_with_zip)
cat("Number of restaurants with zipcode 21231:", num_restaurants, "\n")


Number of restaurants with zipcode 21231: 127 


#### Task 6
Download the Gross Domestic Product data for the 190 ranked countries from [here](https://drive.google.com/uc?export=download&id=1i8bFt76FiFXhizBwyofX-19JBQdmBOAP).
Remove the commas from the GDP numbers in millions of dollars and average them. **What is the average?**

In [None]:
download.file(download_csv, file.path(data_dir, file_name_csv), mode = "wb")

In [4]:
gdp_data <- read.csv(file.path(data_dir, file_name_csv ), stringsAsFactors = FALSE)

# Suppose the GDP numbers (in millions of dollars) are in a column named "GDP"
# Remove commas and convert to numeric
gdp_data$GDP <- as.numeric(gsub(",", "", gdp_data$GDP))

# Compute the average GDP
average_gdp <- mean(gdp_data$GDP, na.rm = TRUE)
cat("Average GDP:", average_gdp, "\n")


Average GDP: 377652.4 


#### Task 7
Use the data you loaded from Question 6. Download the educational data from [here](https://drive.google.com/uc?export=download&id=1xG_DNc6-NNIhsvi3H45l4BVsnQRWS-Bq). Match the data based on the country shortcode (column CountryCode).
**Of the countries for which the end of the fiscal year is available (column Special.Notes), how many end in June?**

In [29]:
# Download the educational data from the provided URL
download.file(download_edu, destfile = file.path(data_dir, file_name_edu), mode = "wb")


In [5]:
edu_data <- read.csv(file.path(data_dir, file_name_edu), stringsAsFactors = FALSE)

# Merge the two datasets on CountryCode
merged_data <- merge(gdp_data, edu_data, by = "CountryCode")

# Look at the 'Special.Notes' column for fiscal year end information.
# We assume that if the fiscal year end is available, the Special.Notes column contains a phrase with "June".
june_rows <- grep("June", merged_data$Special.Notes, ignore.case = TRUE)
num_june <- length(june_rows)

cat("Number of countries for which the fiscal year ends in June:", num_june, "\n")


Number of countries for which the fiscal year ends in June: 16 


#### Task 8
Use merged data frame you created in the previous problem.
**How many countries are Lower middle income (column Income.Group) but among the 38 nations with highest GDP?**

In [11]:
# Sort by the Rank column (which holds the GDP ranking)
merged_sorted <- merged_data[order(as.numeric(merged_data$Rank)), ]

# Subset the top 38 countries (i.e. those with the highest GDP)
top38 <- merged_sorted[1:38, ]


# Count how many of these have Income.Group equal to "Lower middle income"
num_lower_middle <- sum(top38$Income.Group == "Lower middle income", na.rm = TRUE)

cat("Number of Lower middle income countries among the top 38 GDP nations:", num_lower_middle, "\n")


Number of Lower middle income countries among the top 38 GDP nations: 5 


### Task 9
Use merged data frame you created in the previous problem.<br>
**What is the average GDP ranking (column Rank) for the "High income: OECD" and "High income: nonECD" group (column Income.Group)?**

In [12]:
# Subset for "High income: OECD"
oecd <- merged_data[merged_data$Income.Group == "High income: OECD", ]
mean_oecd <- mean(as.numeric(oecd$Rank), na.rm = TRUE)

# Subset for "High income: nonOECD"
non_oecd <- merged_data[merged_data$Income.Group == "High income: nonOECD", ]
mean_non_oecd <- mean(as.numeric(non_oecd$Rank), na.rm = TRUE)

cat("Average GDP ranking for High income: OECD:", mean_oecd, "\n")
cat("Average GDP ranking for High income: nonOECD:", mean_non_oecd, "\n")


Average GDP ranking for High income: OECD: 32.96667 
Average GDP ranking for High income: nonOECD: 91.91304 


#### Task 10
Use the quantmod package to get historical stock prices for publicly traded companies on the NASDAQ and NYSE. Use the following code to download data on Amazon's stock price and get the times the data was sampled.
```python
library(quantmod)
amzn <- getSymbols("AMZN", auto.assign=FALSE)
sampleTimes <- index(amzn)
```

**How many values were collected in 2012? How many values were collected on Mondays in 2012?**

In [13]:
if (!require("quantmod")) {
    install.packages("quantmod")
}

Loading required package: quantmod

Loading required package: xts

Loading required package: zoo


Attaching package: ‘zoo’


The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric


Loading required package: TTR

Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 



In [14]:
library(quantmod)
amzn <- getSymbols("AMZN", auto.assign = FALSE)
sampleTimes <- index(amzn)

# Extract dates in 2012
dates2012 <- sampleTimes[format(sampleTimes, "%Y") == "2012"]
num2012 <- length(dates2012)

# Among those, count how many are Mondays
mondays2012 <- dates2012[format(dates2012, "%A") == "Monday"]
numMondays2012 <- length(mondays2012)

cat("Number of values collected in 2012:", num2012, "\n")
cat("Number of values collected on Mondays in 2012:", numMondays2012, "\n")


Number of values collected in 2012: 250 
Number of values collected on Mondays in 2012: 47 
