In [2]:
library(data.table)
myDF <- fread("/anvil/projects/tdm/data/election/itcont1980.txt", quote="")
names(myDF) <- c("CMTE_ID", "AMNDT_IND", "RPT_TP", "TRANSACTION_PGI", "IMAGE_NUM", "TRANSACTION_TP", "ENTITY_TP", "NAME", "CITY", "STATE", "ZIP_CODE", "EMPLOYER", "OCCUPATION", "TRANSACTION_DT", "TRANSACTION_AMT", "OTHER_ID", "TRAN_ID", "FILE_NUM", "MEMO_CD", "MEMO_TEXT", "SUB_ID")

In [4]:
tapply(myDF$TRANSACTION_AMT, myDF$STATE, sum)

In [6]:
video_games <- read.csv("/anvil/projects/tdm/data/grouping/vgchartz-2024.csv")

In [7]:
# Average Critic Score by Genre
avg_critic <- tapply(video_games$critic_score, video_games$genre, mean, na.rm = TRUE)

In [8]:
# View all genres with their average critic scores
avg_critic

In [9]:
# What genre has the highest score 
highest_avg_critic <- avg_critic[which.max(avg_critic)]
highest_avg_critic

In [None]:
# The video game genre that had the highest average critic score is sandbox

In [10]:
# Find the sum of total sales by console
sales_by_console <- tapply(video_games$total_sales, video_games$console, sum, na.rm = TRUE)

In [11]:
# Sort the results in decreasing order
sales_sorted <- sort(sales_by_console, decreasing = TRUE)

In [12]:
sales_sorted[1]

In [None]:
# The PS2 has the highest number of total sales among consoles

In [None]:
# The tapply function applies a calculation to parts of a dataset that are split into different groups. It can find the average critic score for each video game genre or the total sales for each console. This function makes grouping data fast and keeps the code shorter

In [14]:
library(dplyr)
avg_scores <- video_games %>%
group_by(genre) %>%
summarize(avg_critic_score = mean(critic_score, na.rm =TRUE)) %>%
arrange(desc(avg_critic_score)) 


Attaching package: 'dplyr'


The following objects are masked from 'package:data.table':

    between, first, last


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union




In [16]:
library(dplyr)

# calculate both average critic score and number of games per genre
genre_summary <- video_games %>%
group_by(genre) %>%
summarize(
    avg_critic_score = mean(critic_score, na.rm = TRUE),
    num_games = n()
    ) %>%
arrange(desc(avg_critic_score))
genre_summary

genre,avg_critic_score,num_games
<chr>,<dbl>,<int>
Sandbox,9.2,20
Visual Novel,8.5,493
Music,7.992593,297
MMO,7.781818,115
Education,7.75,35
Action-Adventure,7.678378,1877
Role-Playing,7.47092,5721
Strategy,7.431733,3685
Fighting,7.392424,2367
Adventure,7.341058,6260


In [None]:
# I prefer to use dplyer because it is easier to read. The pipe %>% clearly shows each step of grouping, summarizing, and storing.

In [17]:
library(dplyr)
# Group by console and sum of total sales
console_sales <- video_games %>%
group_by(console) %>%
summarize(total_sales = sum(total_sales, na.rm = TRUE)) %>%
arrange(desc(total_sales))

# View the top 20 consoles
head(console_sales, n=20)

# Console with the most sales
top_console <- console_sales$console[1]
top_console

console,total_sales
<chr>,<dbl>
PS2,1027.76
X360,859.79
PS3,839.7
PS,546.25
PS4,539.92
Wii,459.44
DS,458.17
XOne,268.96
PSP,245.29
XB,232.05


In [18]:
# Count number of games per genre
genre_title_counts <- video_games %>%
summarize(total_titles = n()) %>%
arrange(total_titles)

# View the genre with number of titles 
head(genre_title_counts, n = 1)

# Save the genre with the least titles 
least_genre <- genre_title_counts$genre[1]
least_genre

Unnamed: 0_level_0,total_titles
Unnamed: 0_level_1,<int>
1,64016


NULL

In [21]:
# Total sales per genre and console
genre_console_sales <- video_games %>%
  group_by(genre, console) %>%
  summarize(total_sales = sum(total_sales, na.rm = TRUE)) %>%
  arrange(desc(total_sales))

# View the top 20 combos
head(genre_console_sales, n = 20)genre_console_sales <- video_games %>%
  group_by(genre, console) %>%
  summarize(total_sales = sum(total_sales, na.rm = TRUE)) %>%
  arrange(desc(total_sales))

head(genre_console_sales, n = 20) 

[1m[22m`summarise()` has grouped output by 'genre'. You can override using the
`.groups` argument.


genre,console,total_sales
<chr>,<chr>,<dbl>
Sports,PS2,266.38
Shooter,X360,231.35
Action,PS3,191.66
Shooter,PS3,189.53
Action,PS2,171.01
Action,X360,161.9
Sports,PS3,153.41
Shooter,PS4,144.95
Sports,X360,141.09
Racing,PS2,129.79


In [22]:
genre_console_sales <- video_games %>%
  group_by(genre, console) %>%
  summarize(total_sales = sum(total_sales, na.rm = TRUE)) %>%
  arrange(desc(total_sales))

head(genre_console_sales, n = 20) 

[1m[22m`summarise()` has grouped output by 'genre'. You can override using the
`.groups` argument.


genre,console,total_sales
<chr>,<chr>,<dbl>
Sports,PS2,266.38
Shooter,X360,231.35
Action,PS3,191.66
Shooter,PS3,189.53
Action,PS2,171.01
Action,X360,161.9
Sports,PS3,153.41
Shooter,PS4,144.95
Sports,X360,141.09
Racing,PS2,129.79


In [23]:
# Load the dataset
app_store <- read.csv("/anvil/projects/tdm/data/grouping/googleplaystore.csv")


In [24]:
# Check the columns
head(app_store[, c("Rating", "Reviews")])

Unnamed: 0_level_0,Rating,Reviews
Unnamed: 0_level_1,<dbl>,<chr>
1,4.1,159
2,3.9,967
3,4.7,87510
4,4.5,215644
5,4.3,967
6,4.4,167


In [25]:
# Remove the commas 
app_store$Clean_Reviews <- gsub(",", "", app_store$Reviews)

# Convert to numeric
app_store$Numeric_Reviews <- as.numeric(app_store$Clean_Reviews)

# Check for problematic values
app_store[is.na(app_store$Numeric_Reviews) & !is.na(app_store$Reviews), "Reviews"]

"NAs introduced by coercion"


In [28]:
# Load dplyr for data manipulation
library(dplyr)

# Read in the Google Play Store dataset
app_store <- read.csv('/anvil/projects/tdm/data/grouping/googleplaystore.csv')

# Create Numeric_Reviews using basic R (for reference)
app_store$Clean_Reviews <- gsub(",", "", app_store$Reviews)
app_store$Numeric_Reviews <- as.numeric(app_store$Clean_Reviews)

# Check for problematic values
problem_values <- app_store[is.na(app_store$Numeric_Reviews) & !is.na(app_store$Reviews), "Reviews"]
problem_values  # Should show "3.0M"

# Clean the dataset using dplyr, remove problematic rows, and create numeric column
app_store_cleaned <- app_store %>%
  mutate(
    Clean_Reviews = gsub(",", "", Reviews),
    Numeric_Reviews = suppressWarnings(as.numeric(Clean_Reviews))  # suppress warnings from coercion
  ) %>%
  filter(!is.na(Numeric_Reviews))  # remove problematic "3.0M" value

# Verify no problematic values remain
app_store_cleaned %>%
  filter(is.na(Numeric_Reviews) & !is.na(Reviews)) %>%
  select(Reviews)  # Should return zero rows

# Create rating_per_review column
app_store_cleaned <- app_store_cleaned %>%
  mutate(
    rating_per_review = Rating / Numeric_Reviews
  )

# View the first few values of rating_per_review
head(app_store_cleaned$rating_per_review)

"NAs introduced by coercion"


Reviews
<chr>


In [29]:
library(dplyr)

app_store_cleaned <- app_store_cleaned %>%
  mutate(
    rating_per_review = Rating / Numeric_Reviews
  )

In [30]:
head(app_store_cleaned$rating_per_review)

In [31]:
# Use sapply to get class of each column
column_classes_sapply <- sapply(app_store_cleaned, class)
column_classes_sapply

In [32]:
# Number of characters using lapply (returns a list)
char_count_lapply <- lapply(app_store_cleaned$Content.Rating, nchar)
head(char_count_lapply, n = 30)

In [None]:
# The appple() function is used to apply a function to the rows or columns of a dataframe. It is helpful when performing calculations on every row or column without needing to write a loop. This makes it easier to calculate things like row sums, means, or other summary items