# Introduction

The objective of this exploratory data analysis (EDA) is to gain insights into the trends of power generation in the United States from 2001 to 2021, with a focus on renewable energy. I aim to analyze the datasets, which include information on power generation across different states, types of producers, and energy sources. The primary questions I seek to answer are:

##### 1. Temporal Trends:

-   How has the overall power generation evolved over the years?
-   How has the overall renewable power generation evolved over the years, and how does it compare with non-renewable power generation?

##### 2. Energy Type Comparison:

-   What is the total generation of different types of renewable energy sources?
-   How does the contribution of different types of renewable energy sources vary over time? Are certain types of energy sources gaining prominence compared to others?

##### 3. Producer Sector Analysis:

-   What is the total energy generated by the type of producer? Is there any noticeable trend?
-   What are the noticeable sub-sector trends for the major type of producer?

##### 4. State-wise Analysis:

-   Which are the top 5 states in renewable generation?
-   What are the trends in renewable power generation for each of the top states? Are there states that stand out in terms of consistent growth or fluctuations?
-   What is the major renewable generation source for each of the top states?

### Data Source

The dataset was downloaded from Kaggle in CSV format.

## Data Cleaning and Exploration


```{r}
# Load required libraries
library(tidyverse)

```

```{r}
# Read the two input CSV files (paths are relative to the working directory).
# gen_load: power generation data.
gen_load <- read_csv("organised_Gen.csv")
# state: state lookup table, reduced to the State (name) and Code columns.
state <- select(read_csv("states.csv"), State, Code)
```

```{r}
# Preview the first rows of the raw generation data
gen_load %>% head()
```

```{r}
# Show the column types and structure of the raw generation data
gen_load %>% str()
```

```{r}
# Attach the state lookup columns by matching STATE against the lookup's Code
# column, then drop the STATE code column. inner_join() keeps only the rows
# whose STATE value has a match in `state`.
gen <- select(
  inner_join(gen_load, state, by = c("STATE" = "Code")),
  -STATE
)
```

```{r}
# Count the NA entries in every column of the joined dataset
gen %>%
  is.na() %>%
  colSums()
```

```{r}
# Rename the columns whose names contain spaces or parentheses to syntactic
# names so they can be referenced without backticks in later chunks.
# The existing names are referenced directly with backticks: wrapping a string
# literal in trimws() changes neither the literal nor the data's column names.
gen_rename <- gen %>%
  rename(
    TYPE_OF_PRODUCER = `TYPE OF PRODUCER`,
    ENERGY_SOURCE = `ENERGY SOURCE`,
    GENERATION_Megawatthours = `GENERATION (Megawatthours)`
  )
```

```{r}
# Drop the aggregate rows: every producer type whose label begins with "Total"
gen_filter <- filter(
  gen_rename,
  !grepl("^Total.*", TYPE_OF_PRODUCER)
)
```

```{r}
# Drop the rows whose energy source is the aggregate "Total"
gen_filter_source <- filter(gen_filter, ENERGY_SOURCE != "Total")
```

```{r}
# Break TYPE_OF_PRODUCER at ", " into its first piece (energy_producer) and
# its second piece (energy_company), then drop the original column
producer_split <- gen_filter_source %>%
  mutate(energy_producer = word(TYPE_OF_PRODUCER, start = 1, sep = ", ")) %>%
  mutate(energy_company = word(TYPE_OF_PRODUCER, start = 2, sep = ", ")) %>%
  select(-TYPE_OF_PRODUCER)
```

```{r}
# Label every row as renewable or non-renewable and express generation in
# terawatt hours. Any source not in the non-renewable list below is labelled
# "Renewable".
generation <- producer_split %>%
  mutate(
    type_of_energy_source = if_else(
      ENERGY_SOURCE %in% c(
        "Coal", "Natural Gas", "Petroleum",
        "Other Gases", "Nuclear", "Other"
      ),
      "Non_Renewable",
      "Renewable"
    )
  ) %>%
  # 1 TWh = 1e6 MWh
  mutate(generation_TWh = GENERATION_Megawatthours / 1e6)

```

```{r}
# Preview the first rows of the final dataset
generation %>% head()

```

```{r}
# Column-wise summary of the final dataset
generation %>% summary()

```

```{r}
# Summary statistics of generation (TWh) for each energy source:
# minimum, mean, quartiles, median and maximum
summarise(
  group_by(generation, ENERGY_SOURCE),
  Min = min(generation_TWh),
  Mean_gen = mean(generation_TWh),
  `1st_quantile` = quantile(generation_TWh, probs = 0.25),
  Median = quantile(generation_TWh, probs = 0.50),
  `3rd_quantile` = quantile(generation_TWh, probs = 0.75),
  Max = max(generation_TWh)
)
```


### Data Analysis

1.  Temporal Trend


```{r}
# Line chart of total annual generation (TWh), excluding Pumped Storage.
# Year 2022 is left out of the trend (presumably because it is incomplete in
# the data -- TODO confirm).
generation %>%
  # Row-level conditions, so filter before grouping
  filter(YEAR != 2022, ENERGY_SOURCE != "Pumped Storage") %>%
  group_by(YEAR) %>%
  summarise(total_generation_TWh = sum(generation_TWh)) %>%
  ggplot(aes(x = YEAR, y = total_generation_TWh)) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0)
  geom_line(color = "royalblue", linewidth = 0.8) +
  labs(
    title = "Annual Energy Generation Over Years",
    x = "Year", y = "Total Generation (TWh)"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
```

```{r}
# Line chart of annual generation (TWh) split into renewable and non-renewable
# sources, excluding Pumped Storage. Year 2022 is left out of the trend
# (presumably because it is incomplete in the data -- TODO confirm).
generation %>%
  # Row-level conditions, so filter before grouping
  filter(YEAR != 2022, ENERGY_SOURCE != "Pumped Storage") %>%
  group_by(YEAR, type_of_energy_source) %>%
  # Drop the grouping explicitly: avoids the "grouped output" message and
  # hands an ungrouped table to ggplot
  summarise(total_generation_TWh = sum(generation_TWh), .groups = "drop") %>%
  ggplot(aes(
    x = YEAR, y = total_generation_TWh,
    color = type_of_energy_source
  )) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0)
  geom_line(linewidth = 0.8) +
  labs(
    title = "Annual Energy Generation Over Years by Energy Source",
    x = "Year", y = "Total Generation (TWh)", color = "Energy Source"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
```

```{r}
# Subset reused by the later charts: rows labelled "Renewable", with
# year 2022 and Pumped Storage removed
renewable <- generation %>%
  filter(type_of_energy_source == "Renewable") %>%
  filter(YEAR != 2022) %>%
  filter(ENERGY_SOURCE != "Pumped Storage")
```


2.  Energy Type Comparison


```{r}
# Horizontal bar chart of total renewable generation (TWh) per energy source,
# with each bar labelled by its value.
renewable %>%
  group_by(ENERGY_SOURCE) %>%
  summarise(total_generation = sum(generation_TWh)) %>%
  # Bar order comes from reorder(); no separate arrange() is needed
  ggplot(aes(
    x = reorder(ENERGY_SOURCE, total_generation),
    y = total_generation
  )) +
  geom_col(fill = "royalblue") +
  coord_flip() +
  geom_text(aes(label = scales::number(total_generation, accuracy = 0.01)),
            hjust = -0.1, vjust = 0, size = 3.5, color = "black") +
  # Labels follow the aesthetics, not the flipped screen position:
  # x is the energy source, y is the generation total
  labs(
    title = "Total Renewable Energy Generation by Energy Source",
    x = "Energy Source", y = "Total Generation (TWh)"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
```

```{r}
# Line chart of annual renewable generation (TWh), one line per energy source
# One manual colour per energy source in `renewable`
color_palette <- c(
  "blue1", "darkcyan", "cyan3", "bisque4", "brown", "cornflowerblue"
)
renewable %>%
  group_by(YEAR, ENERGY_SOURCE) %>%
  # Drop the grouping explicitly: avoids the "grouped output" message and
  # hands an ungrouped table to ggplot
  summarise(total_generation = sum(generation_TWh), .groups = "drop") %>%
  ggplot(aes(x = YEAR, y = total_generation, color = ENERGY_SOURCE)) +
  # `linewidth` replaces `size` for lines (deprecated since ggplot2 3.4.0)
  geom_line(linewidth = 0.5) +
  scale_color_manual(values = color_palette) +
  labs(
    title = "Renewable Energy Generation Over Years by Energy Source",
    x = "Year", y = "Total Generation (TWh)", color = "Energy Source"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

```


3.  Producer Sector Analysis


```{r}
# Bar chart of total renewable generation (TWh) per energy producer,
# bars ordered from largest to smallest and labelled with their value
ggplot(
  data = summarise(
    group_by(renewable, energy_producer),
    total_generation = sum(generation_TWh)
  ),
  mapping = aes(
    x = reorder(energy_producer, -total_generation),
    y = total_generation
  )
) +
  geom_col(fill = "royalblue") +
  geom_text(
    aes(label = scales::number(total_generation, accuracy = 0.1)),
    hjust = 0.5, vjust = -0.3, size = 3.5, color = "black"
  ) +
  labs(
    title = "Total Renewable Energy Generation by Energy Producer",
    x = "Energy Producer",
    y = "Total Generation (TWh)"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
```

```{r}
# Line chart of annual renewable generation (TWh), one line per energy producer
renewable %>%
  group_by(YEAR, energy_producer) %>%
  # Drop the grouping explicitly: avoids the "grouped output" message and
  # hands an ungrouped table to ggplot
  summarise(total_generation = sum(generation_TWh), .groups = "drop") %>%
  ggplot(aes(x = YEAR, y = total_generation, color = energy_producer)) +
  geom_line() +
  labs(
    title = "Total Renewable Energy Generation by Energy Producer Over Years",
    x = "Year", y = "Total Generation (TWh)", color = "Energy Producer"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
```

```{r}
# Line chart of annual renewable generation (TWh) for the "Electric Generators"
# producer only, one line per energy company
renewable %>%
  # Row-level condition, so filter before grouping
  filter(energy_producer == "Electric Generators") %>%
  group_by(YEAR, energy_company) %>%
  # Drop the grouping explicitly: avoids the "grouped output" message and
  # hands an ungrouped table to ggplot
  summarise(total_generation = sum(generation_TWh), .groups = "drop") %>%
  ggplot(aes(x = YEAR, y = total_generation, color = energy_company)) +
  geom_line() +
  labs(
    title = "Total Renewable Energy Generation by Energy Company (Electric Generators) Over Years",
    x = "Year", y = "Total Generation (TWh)", color = "Energy Company"
  ) +
  theme_minimal() +
  # Centre the title, consistent with the other charts in this document
  theme(plot.title = element_text(hjust = 0.5))

```


4.  State-wise Analysis


```{r}
# Bar chart of the five states with the largest total renewable generation
# (TWh), bars ordered from largest to smallest and labelled with their value
renewable %>%
  group_by(State) %>%
  summarise(total_generation = sum(generation_TWh)) %>%
  # slice_max() names the ranking column explicitly; top_n() is superseded and
  # silently picked the last column as its weight
  slice_max(total_generation, n = 5) %>%
  ggplot(aes(x = reorder(State, -total_generation), y = total_generation)) +
  geom_col(fill = "royalblue") +
  labs(
    title = "Top 5 States by Renewable Energy Generation",
    x = "State", y = "Total Generation (TWh)"
  ) +
  geom_text(aes(label = scales::number(total_generation, accuracy = 0.1)),
            hjust = 0.5, vjust = -0.5, size = 3.5, color = "black") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
```

```{r}
# Annual renewable generation (TWh) per state, restricted to five states.
# NOTE(review): the state list is hard-coded; it is presumably the top 5 from
# the "Top 5 States" chart -- confirm it still matches if the data changes.
renewable_summary <- renewable %>%
  group_by(YEAR, State) %>%
  # Drop the grouping explicitly so the stored table is not left grouped by
  # YEAR (also avoids the "grouped output" message)
  summarise(total_generation = sum(generation_TWh), .groups = "drop") %>%
  arrange(desc(total_generation)) %>%
  filter(State %in% c("Washington", "Texas", "Oregon", "New York",
                      "California"))

# One manual colour per state in the filter above
color_palette3 <- c("blue3", "darkcyan", "cyan2", "darkorchid", "darkred")
ggplot(renewable_summary, aes(x = YEAR, y = total_generation, color = State)) +
  geom_line() +
  scale_color_manual(values = color_palette3) +
  labs(
    title = "Renewable Energy Generation Over the Years",
    x = "Year", y = "Total Generation (TWh)"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
```

```{r}
# Stacked bar chart of total renewable generation (TWh) for five states,
# each bar split by energy source.
# NOTE(review): the state list is hard-coded; it is presumably the top 5 from
# the "Top 5 States" chart -- confirm it still matches if the data changes.
# One manual colour per energy source in `renewable`
color_palette2 <- c(
  "blue1", "darkcyan", "cyan3", "chocolate", "brown2", "cornflowerblue"
)
renewable %>%
  filter(State %in% c("Washington", "Texas", "Oregon", "New York",
                      "California")) %>%
  group_by(State, ENERGY_SOURCE) %>%
  # Drop the grouping explicitly: avoids the "grouped output" message and
  # hands an ungrouped table to ggplot
  summarise(total_generation = sum(generation_TWh), .groups = "drop") %>%
  # Each state has several rows (one per source). reorder() defaults to the
  # mean of those rows; FUN = sum orders the bars by full stack height.
  ggplot(aes(
    x = reorder(State, -total_generation, FUN = sum),
    y = total_generation,
    fill = ENERGY_SOURCE
  )) +
  geom_col() +
  scale_fill_manual(values = color_palette2) +
  labs(
    title = "Top 5 Renewable Energy Generation by State and Source",
    x = "State", y = "Total Generation (TWh)",
    fill = "Energy Source"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
```