In [None]:
import pandas as pd
import xml.etree.ElementTree as ET
from urllib.request import urlopen

### How to get the URL:

- Perform a search on UniProt
- Click on the "Download" button, select "XML" as format, and select "Preview first 10"
- Copy that URL and delete the part that reads "&limit=10", so that the request is no longer limited to the first 10 results

In [None]:
# Query URL for the XML download -- adapt this to your own search.
# NOTE(review): this looks like the legacy `www.uniprot.org/uniprot/?query=...`
# URL form -- confirm it is still served before relying on it.
url = "https://www.uniprot.org/uniprot/?query=*&format=xml&fil=organism:%22Homo%20sapiens%20(Human)%20[9606]%22%20AND%20reviewed:yes"

# Prefix -> namespace URI mapping used by every find()/findall() call below.
# Leave it alone unless you know exactly what you are doing!
namespaces = dict(d='http://uniprot.org/uniprot')

In [None]:
# This might take a while... sit back and relax
# Download the XML document behind `url` and parse it into an ElementTree.
# The response is opened in a `with` block so the HTTP connection is closed
# as soon as parsing finishes, or fails, instead of being left open.
with urlopen(url) as response:
    tree = ET.parse(response)

In [None]:
# Root element of the parsed XML tree; its `d:entry` children are iterated below.
root = tree.getroot()

In [None]:
# Build one [accession, evidence type, location] row for every
# (subcellular location, evidence) pair found in the downloaded entries.
locations_list = list()

for protein in root.findall('d:entry', namespaces=namespaces):
    # The first <accession> child is used as the identifier of the entry.
    accession = protein.find('d:accession', namespaces=namespaces).text

    # Index this entry's <evidence> elements by their `key` attribute once,
    # instead of running an XPath search for every evidence reference.
    # setdefault() keeps the first element seen for a key, which is the one
    # find() would have returned.
    evidence_types = dict()
    for evidence_element in protein.findall('d:evidence', namespaces=namespaces):
        evidence_types.setdefault(evidence_element.get('key'), evidence_element.get('type'))

    comments = protein.findall('d:comment[@type="subcellular location"]', namespaces=namespaces)

    for sub_cell_loc_comment in comments:
        subcellularLocations = sub_cell_loc_comment.findall('d:subcellularLocation', namespaces=namespaces)

        for subcellularLocation in subcellularLocations:
            location = subcellularLocation.find('d:location', namespaces=namespaces)
            if location is None:
                # No <location> child: there is nothing to record for this element.
                # (Compare with None explicitly -- childless Elements are falsy.)
                continue

            location_text = location.text
            evidences = location.get('evidence')

            # The attribute holds whitespace-separated evidence keys. split()
            # without an argument ignores repeated/leading/trailing whitespace,
            # so no empty keys are produced.
            evidence_keys = evidences.split() if evidences else []

            if evidence_keys:
                for evidence in evidence_keys:
                    # A key that does not resolve to an <evidence> element gives
                    # None as evidence type instead of aborting the whole run.
                    locations_list.append([accession, evidence_types.get(evidence), location_text])
            else:
                # Location without any evidence reference.
                locations_list.append([accession, None, location_text])

# Rows that are identical in all three columns are collapsed into one.
locations_dataframe = pd.DataFrame(locations_list, columns=['accession', 'evidence', 'location']).drop_duplicates()

In [None]:
# Write the table as tab-separated text to "locations.tsv" (path relative to the
# current working directory). `index` is left at its pandas default, so the
# DataFrame index is written as an unnamed first column.
locations_dataframe.to_csv("locations.tsv", sep='\t')