# 03_FileManipulation

## Read in Fasta sequences

In [13]:
########
# Read in fasta DNA sequence file
# Discard header lines (lines that start with ">")
# Keep only DNA sequence lines
#
# Compute the length of each DNA sequence
# Find min, max, and average sequence length
#
# Make sure "shortReads.fa" is in your current working directory
# or define the path to the file, e.g. "Desktop/shortReads.fa"
#
# NOTE: every non-header line is treated as its own sequence, so a
# record whose sequence wraps over several lines is counted once per line
########

# open shortReads.fa for reading
# the "with" block closes the file automatically, even if reading fails
with open("shortReads.fa", "r") as fasta:
    # read lines from file into variable
    lines = fasta.readlines()

# list to store DNA sequences
sequences = []

# loop through each line to find only DNA sequences
for line in lines:

    # strip newline character (and any trailing whitespace) from each line
    entry = line.rstrip()

    # skip blank lines, e.g. an empty line at the end of the file
    # (asking for entry[0] on an empty string would raise an IndexError)
    if not entry:
        continue

    # check for > at start of each entry
    if entry.startswith(">"):
        # if true, this is a header line: skip entry
        continue

    # otherwise add entry to sequence list
    sequences.append(entry)
# END OF FOR LOOP

# stop with a clear message if the file held no sequence lines at all
# (min, max, and the average cannot be computed from an empty list)
if not sequences:
    raise ValueError("No DNA sequences found in shortReads.fa")

# list to store sequence lengths
lengths = []

# loop through each sequence and compute the length
for sequence in sequences:

    # compute the length of each sequence, store in list
    lengths.append(len(sequence))
# END OF FOR LOOP

# find minimum length sequence within list of lengths
minLength = min(lengths)
# find maximum length sequence
maxLength = max(lengths)
# compute average length of sequences in list
avgLength = sum(lengths) / len(lengths)

# output the calculated values
print("Total Number of Sequences: {}".format(len(sequences)))
print("Average Sequence Length: {} bp".format(avgLength))
print("Maximum Sequence Length: {} bp".format(maxLength))
print("Minimum Sequence Length: {} bp".format(minLength))



Total Number of Sequences: 10
Average Sequence Length: 25.0 bp
Maximum Sequence Length: 25 bp
Minimum Sequence Length: 25 bp


## Dictionaries (Hash Tables)

list = [0, 1, 2, three]  

dictionary = {'key' : value}

In [4]:
# build a new dictionary: each animal (key) is paired with its sound (value)
animals = dict(dog='woof', cat='meow', cow='moo')
# "look-up" the value stored under the key 'cat' and print it
sound = animals['cat']
print(sound)
# show the whole dictionary (the last expression in a cell is displayed)
# the display below lists the keys alphabetically, but the dictionary
# itself keeps the keys in the order in which they were added
animals

meow


{'cat': 'meow', 'cow': 'moo', 'dog': 'woof'}

In [7]:
# print all keys in the dictionary, then all values,
# each on its own line
print(animals.keys(), animals.values(), sep="\n")

dict_keys(['dog', 'cat', 'cow'])
dict_values(['woof', 'meow', 'moo'])


In [8]:
# a for loop over a dictionary visits its keys one at a time
for animal in animals:
    # use the current key to look up and print its value
    print(animals[animal])

woof
meow
moo


In [10]:
# dictionaries can hold more than simple values:
# here every value is itself a list
complexDict = dict(
    numbers=[1, 2, 3],
    letters=['a', 'b', 'c'],
)
# look up the list stored under the key 'numbers' and add a number to it
complexDict['numbers'].append(4)
# print out the values of the dictionary
# (the 'numbers' list now ends with the 4 that was just added)
print(complexDict.values())

dict_values([[1, 2, 3, 4], ['a', 'b', 'c']])


## File Manipulation with Dictionaries

Counting all the different bases in a fasta file

In [27]:
########
# Read in fasta DNA sequence file
# Discard header lines (lines that start with ">")
# Keep only DNA sequence lines
#
# Compute the length of each DNA sequence
# Find min, max, and average sequence length
#
# Count how often each base (A, T, C, G) appears across all sequences
#
# Make sure "shortReads.fa" is in your current working directory
# or define the path to the file, e.g. "Desktop/shortReads.fa"
#
# NOTE: every non-header line is treated as its own sequence, so a
# record whose sequence wraps over several lines is counted once per line
########

# open shortReads.fa for reading
# the "with" block closes the file automatically, even if reading fails
with open("shortReads.fa", "r") as fasta:
    # read lines from file into variable
    lines = fasta.readlines()

# list to store DNA sequences
sequences = []

# loop through each line to find only DNA sequences
for line in lines:

    # strip newline character (and any trailing whitespace) from each line
    entry = line.rstrip()

    # skip blank lines, e.g. an empty line at the end of the file
    # (asking for entry[0] on an empty string would raise an IndexError)
    if not entry:
        continue

    # check for > at start of each entry
    if entry.startswith(">"):
        # if true, this is a header line: skip entry
        continue

    # otherwise add entry to sequence list
    sequences.append(entry)
# END OF FOR LOOP

# stop with a clear message if the file held no sequence lines at all
# (min, max, and the average cannot be computed from an empty list)
if not sequences:
    raise ValueError("No DNA sequences found in shortReads.fa")

# list to store sequence lengths
lengths = []

# loop through each sequence and compute the length
for sequence in sequences:

    # compute the length of each sequence, store in list
    lengths.append(len(sequence))
# END OF FOR LOOP

# find minimum length sequence within list of lengths
minLength = min(lengths)
# find maximum length sequence
maxLength = max(lengths)
# compute average length of sequences in list
avgLength = sum(lengths) / len(lengths)

# output the calculated values
print("Total Number of Sequences: {}".format(len(sequences)))
print("Average Sequence Length: {} bp".format(avgLength))
print("Maximum Sequence Length: {} bp".format(maxLength))
print("Minimum Sequence Length: {} bp".format(minLength))

#################################################
# New code starts here!
#################################################

# dictionary to hold counts for each base
# (only these four uppercase letters are counted)
bases = {'A':0, 'T':0, 'C':0, 'G':0}

# a loop to go through each sequence in the list 'sequences'
for sequence in sequences:

    # count the number of times 'A' appears in this sequence, add to dictionary value at key 'A'
    bases['A'] += sequence.count('A')
    # repeat for 'T', 'C', and 'G'
    bases['T'] += sequence.count('T')
    bases['C'] += sequence.count('C')
    bases['G'] += sequence.count('G')

# END OF FOR LOOP

print("Number of bases across all sequences:")
print(bases)

Total Number of Sequences: 10
Average Sequence Length: 25.0 bp
Maximum Sequence Length: 25 bp
Minimum Sequence Length: 25 bp
Number of bases across all sequences:
{'A': 63, 'T': 50, 'C': 66, 'G': 71}


In [23]:
#################################################
# Another way to update dictionary values
#################################################

# start every base (the dictionary keys) with a count of zero
bases = dict.fromkeys('ATCG', 0)

# visit each sequence in the list 'sequences'
for sequence in sequences:

    # visit each key of the dictionary 'bases'
    for base in bases:
        # the key doubles as the letter to search for:
        # count it in this sequence and add the result to the running total
        bases[base] = bases[base] + sequence.count(base)

# END OF FOR LOOP

print("Another way to count bases across all sequences:")
print(bases)

Another way to count bases across all sequences:
{'A': 63, 'T': 50, 'C': 66, 'G': 71}


In [24]:
# A nice way of reporting values to the user:
# the named placeholders {base} and {count} are filled in by format()
report = "Number of {base}'s: {count}"
print(report.format(base='A', count=bases['A']))

Number of A's: 63


In [25]:
# we don't want to repeat that line for every base in the dictionary,
# so let a loop fill in the placeholders for each key and its count
for base, count in bases.items():
    print("Number of {base}'s: {count}".format(base=base, count=count))

Number of A's: 63
Number of T's: 50
Number of C's: 66
Number of G's: 71


In [26]:
# the same report once more: the loop variable can have any name (here 'key'),
# and looping over the dictionary itself visits its keys
for key in bases:
    line = "Number of {base}'s: {count}".format(base=key, count=bases[key])
    print(line)

Number of A's: 63
Number of T's: 50
Number of C's: 66
Number of G's: 71


## Functions

input -> myFunction -> output

The inputs to a function are also known as parameters.

In [29]:
# a really easy function to add two numbers
a, b = 2, 3

# a function definition begins with the keyword 'def',
# followed by the name of the function ('add'),
# then the input parameters inside parentheses,
# and finally a colon to end the define statement!
def add(first_variable, second_variable):
    """Return the sum of the two input parameters."""
    # do the work and hand the result back to the user in one step:
    # 'return' is the special statement that gives output to the caller
    return first_variable + second_variable
# END OF FUNCTION DEFINITION

# call our function inside a print statement
print(add(a, b))

5


Let's write a function to count the bases in a given sequence.

In [31]:
mySequence = "AAATTTGGGCCC"

# count each base on its own with the string method count()
# only the value of the last expression in a cell is displayed,
# so just the count of 'C' appears as the output
mySequence.count("A")
mySequence.count("T")
mySequence.count("G")
mySequence.count("C")

3

In [36]:
mySequence = "AAATTTGGGCCC"

# using a dictionary: one key per base, every count starts at zero
bases = dict.fromkeys(('A', 'T', 'G', 'C'), 0)

# visit each key in the dictionary and add how often it occurs in mySequence
for key in bases:
    bases[key] = bases[key] + mySequence.count(key)

# print the dictionary
print(bases)

{'A': 3, 'T': 3, 'G': 3, 'C': 3}


In [40]:
# wrap everything up into a function!

def countBases(sequence):
    """Count how often each of the bases A, T, G and C occurs in a sequence.

    sequence: the sequence to search, e.g. a DNA string such as 'AAATTT'
    returns:  a dictionary with the keys 'A', 'T', 'G', 'C' (in that order),
              each paired with the number of times it was found in sequence
    """
    # build the dictionary in one step:
    # every base becomes a key and its count in the sequence becomes the value
    return {base: sequence.count(base) for base in ('A', 'T', 'G', 'C')}
# END OF FUNCTION DEFINITION

In [43]:
# let's use the function!

# pass a sequence in as the input parameter and keep the returned dictionary
sequenceCounts = countBases(sequence='AAAATTTTGGGGCCCC')

# print the counts that the function gave back
print(sequenceCounts)

{'A': 4, 'T': 4, 'G': 4, 'C': 4}
