In [None]:
# Install the titanic data package only when it is not already available, so
# re-running the notebook does not trigger a fresh download every time.
if (!requireNamespace("titanic", quietly = TRUE)) {
  install.packages("titanic")
}

In [None]:
library(ggplot2)
library(titanic)

In [None]:
# Show the structure (columns and their types) of the mpg dataset.
str(mpg)

In [None]:
# Preview the first rows of mpg.
head(mpg)

Murder: Murder arrests (per 100,000)

Assault: Assault arrests (per 100,000)

UrbanPop: Percent urban population

Rape: Rape arrests (per 100,000)

In [None]:
# Print the whole USArrests data frame.
USArrests

In [None]:
# Open the help page for the USArrests dataset.
?USArrests

In [None]:
# Preview the first rows of the Titanic training data.
head(titanic_train)


In [None]:
# NOTE(review): this repeats the head(titanic_train) preview from the previous
# cell -- confirm whether a different summary was intended here.
head(titanic_train)

# Visualizing Quantities
## Bar Plots

In [None]:
# Number of vehicles per manufacturer, drawn as vertical bars.
ggplot(mpg, aes(x = manufacturer)) +
  theme_minimal() +
  geom_bar(fill = "steelblue", width = 0.7)

In [None]:
# Same manufacturer counts, with the axes flipped so the bars run horizontally.
ggplot(mpg, aes(x = manufacturer)) +
  geom_bar(fill = "steelblue", width = 0.7) +
  coord_flip() +
  theme_minimal()

## Stacked Bar Chart
The black outline is ugly and unnecessary, but just displaying that option

In [None]:
# Stacked bars: each manufacturer's bar is split by vehicle class, with a black
# outline around every segment.
ggplot(mpg, aes(x = manufacturer, fill = class)) +
  geom_bar(colour = "black", width = 0.7) +
  coord_flip() +
  theme_minimal()

## Grouped Bar Chart
Use position_dodge()

In [None]:
# Grouped bars: classes are placed side by side within each manufacturer.
ggplot(mpg, aes(x = manufacturer, fill = class)) +
  geom_bar(position = "dodge", colour = "black", width = 0.7) +
  coord_flip() +
  theme_minimal()

## Changing the Fill Colors and Adding Counts
Using RColorBrewer
https://www.nceas.ucsb.edu/~frazier/RSpatialGuides/colorPaletteCheatsheet.pdf

In [None]:
# Stacked bars per manufacturer, coloured by class with a ColorBrewer palette,
# and each segment labelled with its count.
ggplot(data = mpg, aes(x = manufacturer, fill = class)) +
  geom_bar(width = 0.7) +
  # after_stat(count) replaces the deprecated ..count.. notation.
  geom_text(
    aes(label = after_stat(count)),
    stat = "count",
    position = position_stack(0.5) # centre each label inside its segment
  ) +
  theme_minimal() +
  coord_flip() +
  scale_fill_brewer(palette = "Set3")


# Faceting
Data can be split up by one or two variables that vary in the horizontal and/or vertical direction.

This is done by giving a formula to facet_grid(), of the form vertical ~ horizontal.

In [None]:
# Manufacturer counts, one panel per vehicle class (panels laid out
# horizontally). No fill aesthetic is mapped, so the fill scale that used to be
# added here had no effect and has been dropped.
ggplot(data = mpg, aes(x = manufacturer)) +
  geom_bar(width = 0.7) +
  coord_flip() +
  facet_grid(. ~ class)

In [None]:
# Manufacturer counts in a grid of panels: rows by cylinder count, columns by
# vehicle class. No fill aesthetic is mapped, so the fill scale that used to be
# added here had no effect and has been dropped.
ggplot(data = mpg, aes(x = manufacturer)) +
  geom_bar(width = 0.7) +
  coord_flip() +
  facet_grid(cyl ~ class)

## Sorting by Frequency
```
forcats::fct_infreq(category)
```

In [None]:
# Manufacturer counts with the bars ordered by frequency via fct_infreq().
ggplot(mpg, aes(x = forcats::fct_infreq(manufacturer))) +
  geom_bar(fill = "steelblue", width = 0.7) +
  coord_flip() +
  theme_minimal()

## Cleveland Dot Plot

In [None]:
# One point per state: Murder value on x, state name (the row name) on y.
ggplot(USArrests, aes(y = row.names(USArrests), x = Murder)) +
  geom_point()

In [None]:
# Same dot plot, with the states ordered by their Murder value.
ggplot(
  USArrests,
  aes(y = reorder(row.names(USArrests), Murder), x = Murder)
) +
  geom_point()

In [None]:
# Ordered dot plot of the Murder column with descriptive labels.
# Murder holds murder *arrests* per 100,000, so the axis label and title say
# "arrests" rather than "murders".
ggplot(USArrests, aes(x = Murder, y = reorder(row.names(USArrests), Murder))) +
  geom_point(size = 2.5, color = "steelblue") +
  xlab("Murder arrests per 100,000 residents") +
  ylab("State") +
  ggtitle("Murder Arrests by State in 1973") +
  theme_minimal()

## Visualizing Distributions

In [None]:
# Histogram of passenger age using 20 bins.
ggplot(titanic_train, aes(x = Age)) +
  geom_histogram(bins = 20, colour = "white", fill = "steelblue") +
  theme_minimal()

In [None]:
# Kernel density estimate of passenger age.
# geom_density() has no `bins` parameter (it was ignored with a warning), so it
# is no longer passed.
ggplot(data = titanic_train) +
  geom_density(mapping = aes(x = Age), fill = "steelblue", color = "white") +
  theme_minimal()

In [None]:
# Frequency polygon of passenger age (20 bins), x axis limited to 0-85.
ggplot(titanic_train, aes(x = Age)) +
  geom_freqpoly(bins = 20) +
  xlim(0, 85) +
  theme_minimal()

In [None]:
# Area chart of binned passenger-age counts (20 bins), x axis limited to 0-85.
# The fill is a constant colour, so the conflicting `fill = Age` mapping has
# been removed; after_stat(count) replaces the deprecated ..count.. notation.
ggplot(data = titanic_train) +
  geom_area(
    aes(x = Age, y = after_stat(count)),
    stat = "bin", bins = 20, fill = "steelblue"
  ) +
  theme_minimal() +
  xlim(0, 85)

In [None]:
# Age histogram (20 bins) with the bars filled by Sex.
ggplot(titanic_train, aes(x = Age, fill = Sex)) +
  geom_histogram(bins = 20) +
  theme_minimal()

In [None]:
# Area chart of binned age counts (20 bins), filled and outlined by Sex.
# after_stat(count) replaces the deprecated ..count.. notation.
ggplot(data = titanic_train) +
  geom_area(
    aes(x = Age, y = after_stat(count), fill = Sex, color = Sex),
    stat = "bin", bins = 20
  ) +
  theme_minimal() +
  xlim(0, 85)

In [None]:
# Overlaid, semi-transparent age densities, one per Sex.
ggplot(data = titanic_train, mapping = aes(x = Age, fill = Sex)) +
  geom_density(alpha = 0.3)

In [None]:
# Stacked bar chart of Survived, split by Sex, with a count label per segment.
# Survived is converted to a factor so it is drawn on a discrete axis rather
# than a continuous one; the axis title is set back to "Survived".
ggplot(data = titanic_train, mapping = aes(x = factor(Survived), fill = Sex)) +
  geom_bar() +
  # after_stat(count) replaces the deprecated ..count.. notation.
  geom_text(
    aes(label = after_stat(count)),
    stat = "count",
    position = position_stack(0.5)
  ) +
  labs(x = "Survived") +
  theme_minimal()

In [None]:
# Population-pyramid style plot: female age counts go in the positive
# direction, male counts are negated so they go the opposite way.
ggplot(data = titanic_train, aes(x = Age, fill = Sex)) +
  geom_bar(
    data = subset(titanic_train, Sex == "female"),
    stat = "bin", bins = 25
  ) +
  geom_bar(
    data = subset(titanic_train, Sex == "male"),
    stat = "bin", bins = 25,
    # after_stat() replaces the deprecated ..count.. notation.
    aes(y = after_stat(-count))
  ) +
  # Show absolute values on the count axis so both sides read as positive.
  scale_y_continuous(breaks = seq(-40, 40, 10), labels = abs(seq(-40, 40, 10))) +
  coord_flip() +
  theme_minimal()

## Cumulative Distributions

In [None]:
# Reproducible sample: 200 normal draws (mean 60, sd 15), rounded to whole
# numbers and stored in a one-column data frame.
set.seed(1234)
df <- data.frame(
  height = round(rnorm(n = 200, mean = 60, sd = 15), digits = 0)
)
head(df, n = 6)

In [None]:
# Empirical CDF of height, first drawn as points ...
ggplot(data = df, mapping = aes(x = height)) +
  stat_ecdf(geom = "point")
# ... and then as a step function.
ggplot(data = df, mapping = aes(x = height)) +
  stat_ecdf(geom = "step")

In [None]:
# Empirical CDF of height drawn as a step line with the points overlaid.
# The title now reads "Distribution": an ECDF is the empirical cumulative
# distribution function, not a density.
ggplot(df, aes(height)) +
  stat_ecdf(geom = "step") +
  stat_ecdf(geom = "point", color = "steelblue") +
  labs(
    title = "Empirical Cumulative \n Distribution Function",
    y = "F(height)", x = "Height in inch"
  ) +
  expand_limits(x = 0, y = 0) + # make both axes include the origin
  theme_classic()

In [None]:
library(datasets)

In [None]:
# Load airquality and turn the numeric Month column into a labelled factor.
# The levels are given explicitly so each label is tied to its month number
# rather than to whatever sorted values happen to be present in the data.
data(airquality)
airquality$Month <- factor(
  airquality$Month,
  levels = 5:9,
  labels = c("May", "Jun", "Jul", "Aug", "Sep")
)

In [None]:
# Box plot of Ozone for each Month.
ggplot(data = airquality, mapping = aes(y = Ozone, x = Month)) +
  geom_boxplot()

In [None]:
# Styled box plot of Ozone by Month, stored in `p` so later cells can add
# layers to it.
fill <- "#4271AE" # box fill colour
line <- "#1F3552" # outline colour, also used for the outlier points
p <- ggplot(airquality, aes(x = Month, y = Ozone)) +
  geom_boxplot(
    fill = fill, color = line, alpha = 0.7,
    outlier.color = line, outlier.shape = 20
  ) +
  scale_y_continuous(
    name = "Mean ozone in\nparts per billion",
    breaks = seq(0, 175, 25),
    limits = c(0, 175)
  ) +
  scale_x_discrete(name = "Month") +
  ggtitle("Boxplot of mean ozone by month") +
  theme_bw()
p

In [None]:
# Overlay the individual observations on the plot stored in `p`.
# height = 0 restricts the jitter to the horizontal direction so the plotted
# Ozone values are not displaced vertically.
p + geom_jitter(height = 0)

In [None]:
# Violin plot of Ozone by Month; the result replaces the plot stored in `p`.
fill <- "#4271AE" # violin fill colour
line <- "#1F3552" # violin outline colour
p <- ggplot(airquality, aes(x = Month, y = Ozone)) +
  geom_violin(fill = fill, color = line, alpha = 0.7) +
  scale_y_continuous(
    name = "Mean ozone in\nparts per billion",
    breaks = seq(0, 175, 25),
    limits = c(0, 175)
  ) +
  scale_x_discrete(name = "Month") +
  ggtitle("Violin Plot of mean ozone by month") +
  theme_bw()
p

In [None]:
## Scatterplot

In [None]:
# Scatter plot of hwy against displ, with points coloured by class.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy, colour = class)) +
  geom_point()

## Review Exercises
R contains a number of built-in datasets in addition to those installed by various packages.  You can view the list of available datasets by executing ```data()```.  Typing a dataset name by itself shows you a preview of the dataframe. You can execute ```str(datasetName)``` and ```summary(datasetName)``` to get more information about the dataset.

Select one or more of these datasets and then create visualizations similar to those shown in this notebook.

As an example, the ```iris``` dataset is often used to demonstrate relationships.  Create a scatter plot where the data is ```iris```, the x axis is ```Sepal.Length```, the y axis is ```Petal.Width```, and the color is ```Species```.