# Setup Notebook

Load libraries

In [None]:
library(ggplot2)
library(plotly)
library(ggthemes)

## Load Data

Load the metabolomic dataset for Papua New Guinea subjects from the infants metabolomic dataset.

In [None]:
# Fetch the metabolomics CSV only when it is not already in the working
# directory, so re-running this cell does not download it again.
if (!file.exists("png_metabolomics.725331.csv")) {
  print("Downloading Data")
  # No `method` argument: forcing method = "wget" fails on machines that do
  # not have the external wget binary installed. R's default method ("auto")
  # supports https URLs in current R releases.
  download.file(
    url = "https://storage.googleapis.com/bakar-data/10k/png_metabolomics.725331.csv",
    destfile = "png_metabolomics.725331.csv"
  )
  print("Done")
}

In [None]:
# Read the downloaded CSV into a data frame. The first line is the header,
# fields are comma-separated, and text columns stay character vectors
# instead of being converted to factors.
guinea_mass_spec <- read.csv(
  file = "png_metabolomics.725331.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)

# Basic Data Analysis

## Showing Data

In [None]:
# Number of rows and columns in the loaded table.
dim(guinea_mass_spec)

In [None]:
# Preview the top-left corner of the table: first 10 rows, first 10 columns.
guinea_mass_spec[seq_len(10), seq_len(10)]

Basic data info

In [None]:
# Display the SAMPLE.NAME column (one entry per row of the table).
guinea_mass_spec$SAMPLE.NAME

In [None]:
# Display the DAY column; it is used further down as the x-axis ("Days Alive")
# and as the colour grouping of the scatter plot.
guinea_mass_spec$DAY

In [None]:
# Display the TREATMENT column (group label for each row).
guinea_mass_spec$TREATMENT

In [None]:
# Display the SEX column.
guinea_mass_spec$SEX

Print column names

In [None]:
# List every column name; the analyte lookup further down matches against
# exactly these strings.
colnames( guinea_mass_spec )

Looks like the data is **semi-normalized**: the mean is 0, but the SD is not 1.

In [None]:
# Spot-check how the data are scaled by printing the mean and standard
# deviation of two columns (9 and 15).
# The labels say "column" because `[, j]` selects column j of the data frame;
# the earlier wording ("row") did not match what is computed.
print("mean and sd for some column")
print(mean(guinea_mass_spec[, 9]))
print(sd(guinea_mass_spec[, 9]))

print("mean and sd for some other column")
print(mean(guinea_mass_spec[, 15]))
print(sd(guinea_mass_spec[, 15]))

In [None]:
# summary(guinea_mass_spec)

## Checking Vaccinations <font color="green">Exciting :)</font>

Let's see how many people are <font color="blue">Vaccinated</font>

In [None]:
# Display the TREATMENT labels that the next cell compares against
# 'VaccinatedGroup'.
guinea_mass_spec$TREATMENT

Below we see that about half of the group had delayed vaccinations

In [None]:
# Fraction of TREATMENT entries equal to "VaccinatedGroup". The mean of a
# logical vector is the proportion of TRUE values.
mean(guinea_mass_spec$TREATMENT == "VaccinatedGroup")

In [None]:
# Preview the first 10 rows of columns 7 through 17.
guinea_mass_spec[seq_len(10), 7:17]

# <font color="orange">Test: Plot data for 1 Column</font>

<font color="red">Note: Currently I don't know how to handle the column names</font>

As a first test we'll try to print the data for some given column name saved as the variable `analyte`. This simulates how the 10k website will create the plot.

Given some variable called analyte, we find its corresponding column

In [None]:
# Name of the analyte to plot; it must equal a column name of
# guinea_mass_spec exactly.
mass_spec_analyte <- "cholesterol"

# Index of the first column with that name. match() returns NA when no column
# matches, so stop here with a clear message instead of letting the NA index
# fail later when it is used to subset the data frame.
col_num <- match(mass_spec_analyte, colnames(guinea_mass_spec))
if (is.na(col_num)) {
  stop(
    "No column named '", mass_spec_analyte, "' in guinea_mass_spec",
    call. = FALSE
  )
}
col_num

In [None]:
# Jittered x positions: each row's DAY plus Gaussian noise (mean 0, sd 0.15),
# presumably so points recorded on the same day do not sit on top of each
# other in the scatter plot.
days <- guinea_mass_spec$DAY +
  rnorm(nrow(guinea_mass_spec), mean = 0, sd = .15)

# Plotting table: the selected analyte column, DAY as a factor (used for
# colouring), and the jittered day values.
to_graph <- data.frame(
  vals = guinea_mass_spec[, col_num],
  groups = as.factor(guinea_mass_spec$DAY)
)
to_graph$days <- days

In [None]:
# Scatter plot of the selected analyte against the jittered day, coloured by
# the DAY factor.
# The title is built with paste()'s default single-space separator; the
# previous literal started with its own space, which produced a double space
# ("cholesterol  Expression in Newborns").
# fill = NULL / colour = NULL in labs() remove those legend titles.
p <- ggplot(to_graph, aes(x = days, y = vals, color = groups)) +
  geom_point() +
  labs(
    x = "Days Alive",
    y = "Molecule Expression",
    title = paste(mass_spec_analyte, "Expression in Newborns"),
    fill = NULL,
    colour = NULL
  ) +
  theme_gdocs()

In [None]:
# Render the ggplot object `p`.
print(p)

## <font color="gray">Gambia RNA example code</font>

```r
        num = which(gambia_rna$hgnc==analyte() )
        gene_name = gambia_rna[num,2]
        a_row = gambia_rna[num,3:dim(gambia_rna)[2]]
        
        # Put data in data frame
        cols = list()
        to_graph= data.frame( counts = as.numeric(a_row) )
        
        # groups data by age 
        groups = c()
        for( i in names(a_row) ){
          groups= c( groups, substr(i, start=nchar(i)-1, stop=nchar(i) )) 
        }
        to_graph$groups = as.factor( groups )
        
        # create X variable for scatter
        `%+=%` = function(e1,e2) eval.parent(substitute(e1 <- e1 + e2))
        to_graph$X = rnorm( length(a_row) , mean = 0, sd = .15 )
        to_graph$X[ grep("*D0", names(a_row))  ] %+=% 0
        to_graph$X[ grep("*D1", names(a_row))  ] %+=% 1
        to_graph$X[ grep("*D3", names(a_row))  ] %+=% 3
        to_graph$X[ grep("*D7", names(a_row))  ] %+=% 7
        
        p = ggplot( to_graph, aes(x=X, y=counts, color=groups) ) + 
          geom_point() +
          labs(x='Days Alive', y="Gene Expression (counts)", title=paste(gene_name," Expression in Newborns"), fill = NULL, colour = NULL ) +
          theme_gdocs()    
```

# <font color="brown">Scratch Paper</font>