In [4]:
import numpy as np
import pandas as pd

# Read in the sample and run metadata using the pandas function read_csv() into a variable called sampleruns
sampleruns = pd.read_csv('./ltee_sampleruns.csv')

In [5]:
# Check the number of rows and columns
print(sampleruns.shape)

(264, 11)


In [6]:
print(sampleruns.index)

RangeIndex(start=0, stop=264, step=1)


In [7]:
print(sampleruns.columns)

Index(['Population', 'Generation', 'Strain ID', 'Mutator', 'Reference',
       'Facility', 'Accession', 'Read Type', 'Read Length', 'Sequencing Depth',
       'Analysis Notes'],
      dtype='object')


In [8]:
# View the first few rows
print(sampleruns.head())

  Population  Generation Strain ID Mutator   Reference   Facility   Accession  \
0      Ara+1         500   REL768A    None  This study  Genoscope  SRR2584408   
1      Ara+1         500   REL768B    None  This study  Genoscope  SRR2584409   
2      Ara+1        1000   REL958A    None  This study  Genoscope  SRR2584410   
3      Ara+1        1000   REL958B    None  This study  Genoscope  SRR2584411   
4      Ara+1        1500  REL1062A    None  This study  Genoscope  SRR2584438   

  Read Type  Read Length  Sequencing Depth Analysis Notes  
0    single           36         70.241277            NaN  
1    single           36         41.136778            NaN  
2    single           36         33.226052            NaN  
3    single           36         34.589932            NaN  
4    single           36         40.342099            NaN  


In [9]:
# Look at the column data types and names
print(sampleruns.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Population        264 non-null    object 
 1   Generation        264 non-null    int64  
 2   Strain ID         264 non-null    object 
 3   Mutator           264 non-null    object 
 4   Reference         264 non-null    object 
 5   Facility          264 non-null    object 
 6   Accession         264 non-null    object 
 7   Read Type         264 non-null    object 
 8   Read Length       264 non-null    int64  
 9   Sequencing Depth  264 non-null    float64
 10  Analysis Notes    13 non-null     object 
dtypes: float64(1), int64(2), object(8)
memory usage: 22.8+ KB
None


In [10]:

# Extract the column ‘Strain ID’ from the sampleruns variable, using square brackets
strain_id = sampleruns['Strain ID']
print(strain_id)

0       REL768A
1       REL768B
2       REL958A
3       REL958B
4      REL1062A
         ...   
259    REL10448
260    REL11005
261    REL11006
262    REL11389
263    REL11390
Name: Strain ID, Length: 264, dtype: object


In [11]:
# Set the sampleruns index (row names) to ‘Strain ID’
sampleruns_ind = sampleruns.set_index('Strain ID')
print(sampleruns_ind.head())

          Population  Generation Mutator   Reference   Facility   Accession  \
Strain ID                                                                     
REL768A        Ara+1         500    None  This study  Genoscope  SRR2584408   
REL768B        Ara+1         500    None  This study  Genoscope  SRR2584409   
REL958A        Ara+1        1000    None  This study  Genoscope  SRR2584410   
REL958B        Ara+1        1000    None  This study  Genoscope  SRR2584411   
REL1062A       Ara+1        1500    None  This study  Genoscope  SRR2584438   

          Read Type  Read Length  Sequencing Depth Analysis Notes  
Strain ID                                                          
REL768A      single           36         70.241277            NaN  
REL768B      single           36         41.136778            NaN  
REL958A      single           36         33.226052            NaN  
REL958B      single           36         34.589932            NaN  
REL1062A     single           36      

In [12]:
# Use .loc to extract the row ‘REL768B’ and column ‘Accession’
rel768b_accession = sampleruns_ind.loc['REL768B','Accession']
print(rel768b_accession)

SRR2584409


In [13]:
# Use .loc again to subset the same row, but all columns
rel768b = sampleruns_ind.loc['REL768B', :]
print(rel768b)

Population               Ara+1
Generation                 500
Mutator                   None
Reference           This study
Facility             Genoscope
Accession           SRR2584409
Read Type               single
Read Length                 36
Sequencing Depth     41.136778
Analysis Notes             NaN
Name: REL768B, dtype: object


In [14]:
# Use .loc to subset rows ‘REL768A’ and ‘REL958A’ and columns ‘Read Type’ and ‘Read Length’
rels_type_and_length = sampleruns_ind.loc[['REL768A', 'REL958A'], ['Read Type','Read Length']]
print(rels_type_and_length)

          Read Type  Read Length
Strain ID                       
REL768A      single           36
REL958A      single           36


In [15]:
# Use .iloc to subset row 1, column 9
print(sampleruns_ind.iloc[0,8])


70.24127718360918


In [16]:
# Use .iloc to subset rows 2-4, column 9
print(sampleruns_ind.iloc[1:4, 8])

Strain ID
REL768B    41.136778
REL958A    33.226052
REL958B    34.589932
Name: Sequencing Depth, dtype: float64


In [22]:
# Run the following command and see which rows/column are subset:
# sampleruns.iloc[0:5, [True, False, True, True, False, False, False, False, False, False, False]]
print(sampleruns.iloc[0:5, [True, False, True, True, False, False, False, False, False, False, False]])


  Population Strain ID Mutator
0      Ara+1   REL768A    None
1      Ara+1   REL768B    None
2      Ara+1   REL958A    None
3      Ara+1   REL958B    None
4      Ara+1  REL1062A    None


In [23]:

# Calculate and report the following metrics to 2 decimal places on the sampleruns
# “Sequencing Depth” column:
# a. Mean
# b. Median
# d. Minimum
# e. Maximum
report = sampleruns['Sequencing Depth'].agg([np.max, np.min, np.mean, np.median])
print(report)

amax      1026.249751
amin        27.803535
mean       115.774514
median      79.629418
Name: Sequencing Depth, dtype: float64


In [24]:

# Read in the cell size dataset using the pandas function read_csv() with the appropriate delimiter into a variable called cellsize?
cellsize = pd.read_csv('./ltee_cell_size.tsv', sep='\t')

In [25]:
# Check column names of this data frame, we will want to convert the column names for easier merging with the sampleruns data frame.
print(cellsize.columns)

Index(['Gen', 'A-1', 'A-2', 'A-3', 'A-4', 'A-5', 'A-6', 'A+1', 'A+2', 'A+3',
       'A+4', 'A+5', 'A+6'],
      dtype='object')


In [26]:
# Convert the column names using this command:
# cellsize.columns = (['Generation'] + ['Ara-{}'.format(n) for n in range(1,7)] + ['Ara+{}'.format(n) for n in range(1,7)])
cellsize.columns = (['Generation'] + ['Ara-{}'.format(n) for n in range(1,7)] + ['Ara+{}'.format(n) for n in range(1,7)])
print(cellsize)

   Generation   Ara-1   Ara-2   Ara-3   Ara-4   Ara-5   Ara-6   Ara+1   Ara+2  \
0           0  0.3725  0.3725  0.3725  0.3725  0.3725  0.3725  0.3725  0.3725   
1         500  0.4850  0.5100  0.4500  0.5150  0.5300  0.4550  0.4800  0.4700   
2        1000  0.5150  0.6250  0.6250  0.5800  0.6300  0.5150  0.4950  0.5550   
3        2000  0.6000  0.6600  0.6300  0.7000  0.7050  0.5450  0.5800  0.6750   
4        5000  0.6650  0.9450  0.7000  0.7650  0.9150  0.6550  0.5950  0.8900   
5        8000  0.6700  0.9400  0.6950  0.8150  0.8700  0.6750  0.6950  0.8950   
6        9000  0.6850  0.9750  0.6850  0.8100  0.8800  0.6600  0.7350  0.9200   
7        9500  0.6700  0.9350  0.6850  0.7350  0.8500  0.6850  0.7200  0.9400   
8       10000  0.6700  0.8450  0.7000  0.7600  0.7750  0.6550  0.7050  1.1000   

    Ara+3   Ara+4   Ara+5   Ara+6  
0  0.3725  0.3725  0.3725  0.3725  
1  0.4700  0.4850  0.5250  0.4600  
2  0.5750  0.4950  0.5850  0.5850  
3  0.6450  0.6750  0.6750  0.6400  
4  0.8500

In [27]:
# Convert your cellsize dataframe from wide to long format and rename the columns with these commands:
# cellsize = pd.melt(cellsize, id_vars=cellsize.columns[0], value_vars=cellsize.columns[1:14].tolist())
# cellsize.columns=["Generation", "Population", "Cell Size"]
cellsize = pd.melt(cellsize, id_vars=cellsize.columns[0], value_vars=cellsize.columns[1:14].tolist())
cellsize.columns=["Generation", "Population", "Cell Size"]
print(cellsize)

     Generation Population  Cell Size
0             0      Ara-1     0.3725
1           500      Ara-1     0.4850
2          1000      Ara-1     0.5150
3          2000      Ara-1     0.6000
4          5000      Ara-1     0.6650
..          ...        ...        ...
103        5000      Ara+6     0.7350
104        8000      Ara+6     1.0800
105        9000      Ara+6     1.0200
106        9500      Ara+6     0.9650
107       10000      Ara+6     1.0150

[108 rows x 3 columns]


In [28]:
# Merge the cellsize data frame with the sampleruns data frame on the ‘Generation’ and ‘Population’ variables to a variable called ltee
