Problem 1

In [89]:
def count_degrees(csv_file_name):
    """
    Returns dictionary containing counts of degrees held by all people in source file
    ----
    argument: filename/path of sourcefile; the first row is skipped as a
              header and the second column is read as a whitespace-separated
              list of degree abbreviations
    return: dictionary (defaultdict(int)) of degrees and counts
    """
    import csv
    import re
    from collections import defaultdict

    # (pattern, canonical spelling) pairs, tried in order. Raw strings keep
    # "\." a regex escape rather than an (invalid) Python string escape.
    degree_patterns = (
        (re.compile(r"ph\.?d\.?", re.IGNORECASE), 'Ph.D.'),
        (re.compile(r"sc\.?d\.?", re.IGNORECASE), 'Sc.D.'),
        (re.compile(r"m\.?s\.?", re.IGNORECASE), 'M.S.'),
        (re.compile(r"m\.?a\.?", re.IGNORECASE), 'M.A.'),
    )

    def standardize_degree(text):
        """
        Returns standardized version of text for a degree passed to it.
        ----
        argument: string representing degree abbreviation
        returns: standardized string, or text unchanged if it is not one
                 of the recognized abbreviations
        """
        for pattern, canonical in degree_patterns:
            # fullmatch, not match: a prefix match would turn e.g. 'MSc'
            # into 'M.S.' because only the leading 'MS' is inspected.
            if pattern.fullmatch(text):
                return canonical
        return text

    degrees_dict = defaultdict(int)
    # 'with' closes the file even if a row raises; newline='' is what the
    # csv module expects so it can handle line endings itself.
    with open(csv_file_name, newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # the first row contains headers
        for row in reader:
            if len(row) < 2:
                # blank or truncated line: no degree column to read
                continue
            # split() with no argument ignores repeated/leading/trailing
            # spaces, so no empty-string "degree" is ever counted
            for degree in row[1].split():
                degrees_dict[standardize_degree(degree)] += 1
    return degrees_dict

In [90]:
# Pretty-print the degree counts returned by count_degrees for faculty.csv.
import pprint as pp
pp.pprint(count_degrees('faculty.csv'))

defaultdict(<class 'int'>,
            {'0': 1,
             'B.S.Ed.': 1,
             'JD': 1,
             'M.A.': 1,
             'M.S.': 2,
             'MD': 1,
             'MPH': 2,
             'Ph.D.': 31,
             'Sc.D.': 6})


In [91]:
# Look up the count stored under the standardized 'Ph.D.' key.
count_degrees('faculty.csv')['Ph.D.']

31

Problem 2

In [98]:
def count_titles(csv_file_name):
    """
    Returns dictionary containing counts of titles held by people in
    the source file
    ----
    argument: filename/path of the source file; the first row is skipped
              as a header and the title is read from the third column
    return: dictionary (defaultdict(int)) of titles and counts
    """
    import csv
    from collections import defaultdict

    def standardize_title(text):
        """
        Returns standardized version of text for a title passed to it.
        ----
        argument: string representing title
        returns: standardized title string with surrounding whitespace
                 removed and ' is ' replaced by ' of '
        """
        # NOTE(review): the ' is ' -> ' of ' rewrite presumably repairs
        # entries such as 'Professor is Biostatistics' -- confirm against data.
        return text.strip().replace(' is ', ' of ')

    titles_dict = defaultdict(int)
    # 'with' guarantees the file is closed even if a row raises
    with open(csv_file_name, newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # the first row contains headers
        for row in reader:
            if len(row) < 3:
                # blank or truncated line: no title column to read
                continue
            titles_dict[standardize_title(row[2])] += 1
    return titles_dict

In [99]:
# Pretty-print the title counts returned by count_titles for faculty.csv.
import pprint as pp
pp.pprint(count_titles('faculty.csv'))

defaultdict(<class 'int'>,
            {'Assistant Professor of Biostatistics': 12,
             'Associate Professor of Biostatistics': 12,
             'Professor of Biostatistics': 13})


In [102]:
def emails(csv_file_name):
    """
    Returns the list of email addresses in the source file
    ----
    argument: filename/path of the source file; the first row is skipped
              as a header and the email is read from the fourth column
    return: list of emails, in file order, with surrounding whitespace removed
    """
    import csv

    email_list = []
    # 'with' guarantees the file is closed even if a row raises
    with open(csv_file_name, newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # the first row contains headers
        for row in reader:
            if len(row) < 4:
                # blank or truncated line: no email column to read
                continue
            email_list.append(row[3].strip())
    return email_list

In [103]:
# Display the list of email addresses read from faculty.csv.
emails('faculty.csv')

['bellamys@mail.med.upenn.edu',
 'warren@upenn.edu',
 'bryanma@upenn.edu',
 'jinboche@upenn.edu',
 'sellenbe@upenn.edu',
 'jellenbe@mail.med.upenn.edu',
 'ruifeng@upenn.edu',
 'bcfrench@mail.med.upenn.edu',
 'pgimotty@upenn.edu',
 'wguo@mail.med.upenn.edu',
 'hsu9@mail.med.upenn.edu',
 'rhubb@mail.med.upenn.edu',
 'whwang@mail.med.upenn.edu',
 'mjoffe@mail.med.upenn.edu',
 'jrlandis@mail.med.upenn.edu',
 'liy3@email.chop.edu',
 'mingyao@mail.med.upenn.edu',
 'hongzhe@upenn.edu',
 'rlocalio@upenn.edu',
 'nanditam@mail.med.upenn.edu',
 'knashawn@mail.med.upenn.edu',
 'propert@mail.med.upenn.edu',
 'mputt@mail.med.upenn.edu',
 'sratclif@upenn.edu',
 'michross@upenn.edu',
 'jaroy@mail.med.upenn.edu',
 'msammel@cceb.med.upenn.edu',
 'shawp@upenn.edu',
 'rshi@mail.med.upenn.edu',
 'hshou@mail.med.upenn.edu',
 'jshults@mail.med.upenn.edu',
 'alisaste@mail.med.upenn.edu',
 'atroxel@mail.med.upenn.edu',
 'rxiao@mail.med.upenn.edu',
 'sxie@mail.med.upenn.edu',
 'dxie@upenn.edu',
 'weiyang@mail.m

Problem 4
First figure out the regex to grab the domain and test it:

In [127]:
import re
def extractEmails(text):
    """
    Returns the domain of the first email address found in text.
    Despite the name, the value returned is the domain, not the address.
    ----
    argument: string containing an email address
    return: the run of letters, digits, '_', '.' and '-' that follows the
            first '@' having at least one such character after it
    raises: AttributeError if text contains no such '@'
    """
    # Raw string so "\w" is a regex class rather than an invalid Python
    # escape; '-' is allowed because host names may contain hyphens and
    # would otherwise be cut off at the first one.
    return re.search(r"@([\w.-]+)", text).group(1)

# Exercise the helper defined above on the real data (instead of repeating
# its regex inline) and show the distinct domains it yields.
{extractEmails(email) for email in emails('faculty.csv')}

{'cceb.med.upenn.edu', 'email.chop.edu', 'mail.med.upenn.edu', 'upenn.edu'}

In [129]:
def unique_domains(emails):
    """
    Returns a unique collection of email domains contained
    within the emails list
    ----
    argument: a list of emails; entries with no '@' followed by a domain
              character are ignored
    return: a set of unique domains in the list of emails
    """
    import re

    # Raw string so "\w" is a regex class rather than an invalid Python
    # escape; '-' is allowed so hyphenated host names are not truncated.
    domain_pattern = re.compile(r"@([\w.-]+)")
    matches = (domain_pattern.search(email) for email in emails)
    # search() returns None when an entry has no domain; skip those rather
    # than failing on None.group()
    return {match.group(1) for match in matches if match}

Problem 5

In [139]:
def write_to_csv(list_of_emails, csv_file_name='emails.csv'):
    """
    Writes the list of emails to a file (named 'emails.csv' by default),
    one per line, under a 'list_of_emails' header line. An existing file
    at that path is overwritten.
    ----
    argument: list of emails
    argument: csv_file_name, optional output filename/path
    return: None
    """
    # 'with' guarantees the file is flushed and closed even if a write fails
    with open(csv_file_name, 'w') as f:
        f.write('list_of_emails\n')
        f.writelines(email + '\n' for email in list_of_emails)

In [140]:
# Write the emails read from faculty.csv out to emails.csv.
write_to_csv(emails('faculty.csv'))

Problem 6

In [219]:
def get_dict(csv_file_name='faculty.csv'):
    """
    Returns a dictionary from 'faculty.csv' with key = last name
    and value = collection (list) of the rest of the row for
    each faculty with that last name.
    ----
    argument: csv_file_name, optional filename/path of the source file
              (defaults to 'faculty.csv'); the first row is skipped as a header
    return: dictionary mapping last name to a list of rows, where each row
            is the list of fields after the name, exactly as they appear
            in the file (surrounding whitespace is not removed)
    """
    import csv

    faculty_dict = {}
    # 'with' guarantees the file is closed even if a row raises
    with open(csv_file_name, newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # the first row contains headers
        for row in reader:
            if not row:
                # blank line: would otherwise add a bogus '' key
                continue
            # Take the last 'word' of the name as the lastname.
            # This won't work for compound last names!
            # split() with no argument ignores stray spaces, so a trailing
            # space after the name does not yield an empty last name.
            name_parts = row[0].split()
            lastname = name_parts[-1] if name_parts else ''
            # Start a new list the first time a lastname is seen, then
            # append the rest of this row to it
            faculty_dict.setdefault(lastname, []).append(row[1:])
    return faculty_dict

In [221]:
# Display the dictionary keyed by last name.
get_dict()

{'Bellamy': [[' Sc.D.',
   'Associate Professor of Biostatistics',
   'bellamys@mail.med.upenn.edu']],
 'Bilker': [['Ph.D.', 'Professor of Biostatistics', 'warren@upenn.edu']],
 'Bryan': [[' PhD',
   'Assistant Professor of Biostatistics',
   'bryanma@upenn.edu']],
 'Chen': [[' Ph.D.',
   'Associate Professor of Biostatistics',
   'jinboche@upenn.edu']],
 'Ellenberg': [[' Ph.D.', 'Professor of Biostatistics', 'sellenbe@upenn.edu'],
  [' Ph.D.', 'Professor of Biostatistics', 'jellenbe@mail.med.upenn.edu']],
 'Feng': [[' Ph.D',
   'Assistant Professor of Biostatistics',
   'ruifeng@upenn.edu']],
 'French': [[' PhD',
   'Associate Professor of Biostatistics',
   'bcfrench@mail.med.upenn.edu']],
 'Gimotty': [[' Ph.D', 'Professor of Biostatistics', 'pgimotty@upenn.edu']],
 'Guo': [[' Ph.D', 'Professor of Biostatistics', 'wguo@mail.med.upenn.edu']],
 'Hsu': [[' Ph.D.',
   'Assistant Professor of Biostatistics',
   'hsu9@mail.med.upenn.edu']],
 'Hubbard': [[' PhD',
   'Associate Professor of 

Problem 7

In [224]:
def get_dict(csv_file_name='faculty.csv'):
    """
    Returns a dictionary from 'faculty.csv' with key = tuple of name components
    and value = the rest of the row for that faculty person

    NOTE: this reuses the name get_dict from the Problem 6 cell, so running
    this cell replaces that last-name-keyed version in the kernel.
    ----
    argument: csv_file_name, optional filename/path of the source file
              (defaults to 'faculty.csv'); the first row is skipped as a header
    return: dictionary mapping a tuple of the words in the name to the list
            of fields after the name, exactly as they appear in the file.
            If the same name occurs more than once, the last row wins.
            A row whose name field has no words is keyed by the empty tuple.
    """
    import csv

    faculty_dict = {}
    # 'with' guarantees the file is closed even if a row raises
    with open(csv_file_name, newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # the first row contains headers
        for row in reader:
            if not row:
                # blank line: would otherwise add a bogus ('',) key
                continue
            # Split the name into a tuple of words; split() with no argument
            # ignores repeated or trailing spaces, so the tuple never
            # contains empty strings
            name = tuple(row[0].split())
            faculty_dict[name] = row[1:]
    return faculty_dict

In [225]:
# Display the dictionary keyed by the tuple of name components.
get_dict()

{('Scarlett', 'L.', 'Bellamy'): [' Sc.D.',
  'Associate Professor of Biostatistics',
  'bellamys@mail.med.upenn.edu'],
 ('Warren', 'B.', 'Bilker'): ['Ph.D.',
  'Professor of Biostatistics',
  'warren@upenn.edu'],
 ('Matthew', 'W', 'Bryan'): [' PhD',
  'Assistant Professor of Biostatistics',
  'bryanma@upenn.edu'],
 ('Jinbo', 'Chen'): [' Ph.D.',
  'Associate Professor of Biostatistics',
  'jinboche@upenn.edu'],
 ('Susan', 'S', 'Ellenberg'): [' Ph.D.',
  'Professor of Biostatistics',
  'sellenbe@upenn.edu'],
 ('Jonas', 'H.', 'Ellenberg'): [' Ph.D.',
  'Professor of Biostatistics',
  'jellenbe@mail.med.upenn.edu'],
 ('Rui', 'Feng'): [' Ph.D',
  'Assistant Professor of Biostatistics',
  'ruifeng@upenn.edu'],
 ('Benjamin', 'C.', 'French'): [' PhD',
  'Associate Professor of Biostatistics',
  'bcfrench@mail.med.upenn.edu'],
 ('Phyllis', 'A.', 'Gimotty'): [' Ph.D',
  'Professor of Biostatistics',
  'pgimotty@upenn.edu'],
 ('Wensheng', 'Guo'): [' Ph.D',
  'Professor of Biostatistics',
  'wguo@