# Cleanup address and group them by the city


We have an `address_list = ['Pohlgasse 3\n14256 Berlin']` which contains string with the full address `street`, `street-number`, `postcode` and `city`.

As we want to convert the string-address into our internal structure, which is a dict.

```python
internal_address = {'street': 'Pohlgasse', 'street_no': '3', 'postcode': '14256', 'city': 'Berlin'},
```

**Be careful**
- A street and city can contain spaces.
- Also the data quality is not a good one and contains leading and trailing spaces.


**Goal**: we want to group them by the city.
```python
result = {
    'Berlin': [
        {'street': 'Pohlgasse', 'street_no': '3', 'postcode': '14256', 'city': 'Berlin'},
        {'street': 'Alberto Franke Ring', 'street_no': '13A', 'postcode': '10256', 'city': 'Berlin'}
    ],
    'Mainz Kostheim': [
        # address
    ]
    # ...more cities and address grouped by cities
}
```

What you'll need to solve this: `list`, `object`, `loop`, `strings`.

**Your task**
- [x] Write the function `group_addresss_by_city`
- [x] Try make it run first.
- [x] Make sure it covers all edge cases.
  - [x] Streets and cities doesn't starts.
  - [x] Streets and cities doesn't end with spaces.
  - [x] Streets and cities doesn't contains multiple spaces.
  - [x] Streets and cities doesn't contains street-number or postcode.
- [x] Make use of divide and conquer.
- [ ] Refactor so it is readable and maintainable.
- [x] Avoid one-does-all-function.

**Bonus**:  For a more sophisticated solution you'll need knowledge about: `generator`
- [x] Try to keep your runtime O(n): So just one iteration. -> prevent to loop multiple times through the data while maintaining readability.
- [ ] Use only string operations to extract the data.


In [3]:
raw_addresses = [
  'Dirk Bauer Weg 9/8\n97995 Berlin ',
  'Sigmund Wulff Gasse 936\n27247  München',
  'Edelgard Ruppert Gasse 466\n45556  München ',
  ' Katarzyna Gehringer Platz  847\n43743 Berlin ',
  'Dippelstr. 785\n 25561 München',
  'Kambsweg  180\n75157  Berlin',
  'Schaafplatz 8\n07706 München',
  'Nancy  Beyer Gasse 7/0\n80939 Mainz  Kostheim',
  'Kai Uwe   Kitzmann  Platz 30\n97024   Mainz  Kostheim ',
  '     Eckbauergasse 4/0\n40115 Mainz Kostheim',
  'Schollweg 8\n01984 Aue',
  'Ernst-Dieter-Reinhardt-Straße 36\n11397 Mainz Kostheim',
  'Anatoli Geisel Ring 09\n12077 Aue',
  'Karzplatz 844\n20113 Aue',
]

In [4]:
expected_addresses = {
    'Berlin': [
        {'street': 'Dirk Bauer Weg', 'street_no': '9/8', 'postcode': '97995', 'city': 'Berlin'},
        {'street': 'Katarzyna Gehringer Platz', 'street_no': '847', 'postcode': '43743', 'city': 'Berlin'},
        {'street': 'Kambsweg', 'street_no': '180', 'postcode': '75157', 'city': 'Berlin'},
    ],
    'München': [
        {'street': 'Sigmund Wulff Gasse', 'street_no': '936', 'postcode': '27247', 'city': 'München'},
        {'street': 'Edelgard Ruppert Gasse', 'street_no': '466', 'postcode': '45556', 'city': 'München'},
        {'street': 'Dippelstr.', 'street_no': '785', 'postcode': '25561', 'city': 'München'},
        {'street': 'Schaafplatz', 'street_no': '8', 'postcode': '07706', 'city': 'München'},
    ],
    'Mainz Kostheim': [
        {'street': 'Nancy Beyer Gasse', 'street_no': '7/0', 'postcode': '80939', 'city': 'Mainz Kostheim'},
        {'street': 'Kai Uwe Kitzmann Platz', 'street_no': '30', 'postcode': '97024', 'city': 'Mainz Kostheim'},
        {'street': 'Eckbauergasse', 'street_no': '4/0', 'postcode': '40115', 'city': 'Mainz Kostheim'},
        {'street': 'Ernst-Dieter-Reinhardt-Straße', 'street_no': '36', 'postcode': '11397', 'city': 'Mainz Kostheim'},
    ],
    'Aue': [
        {'street': 'Schollweg', 'street_no': '8', 'postcode': '01984', 'city': 'Aue'},
        {'street': 'Anatoli Geisel Ring', 'street_no': '09', 'postcode': '12077', 'city': 'Aue'},
        {'street': 'Karzplatz', 'street_no': '844', 'postcode': '20113', 'city': 'Aue'},
    ],
}

In [1]:
import re

In [32]:
def all_addresses(addresses):    
    '''
    using the auxillary functions, parses the address lines by street, street number, post code, and city
    '''
    total_addresses = []
    for address_line in addresses:
        address_line.strip()
        address_line = re.sub(' +', ' ', address_line) # replace multiple spaces with one
        city = get_city(address_line)
        whole_street = get_whole_street(address_line)
        street = get_street(whole_street)
        street_no = get_street_no(whole_street)
        postcode = get_postcode(address_line)
        line_dict = {'street':street, 'street_no': street_no, 'postcode':postcode, 'city': city}
        total_addresses.append(line_dict)
    
    return total_addresses

def get_unique_cities(parsed_addresses):
    cities = set()
    for line in parsed_addresses:
        cities.add(line['city'])
    # get the unique cities
    return cities

def group_addresss_by_city(parsed_addresses, cities):
    city_dict = dict()
    for city in cities:
        city_list = [address for address in parsed_addresses if address['city'] == city]
        city_dict[city] = city_list
    return city_dict

def get_city(address_line):
    city = re.findall('\n.[0-9]+ (.*)',address_line)[0]
    return city.strip()
    #'\n[0-9]+ (.*)' does not work for 'Dippelstr. 785\n 25561 München', bc. of extra space between \n and postalcode

def get_whole_street(address_line):
    street = re.findall('(.*)\n|\.[0-9]+ ',address_line)
    return street[0].strip()
    
def get_street(whole_street):
    street = whole_street.rsplit(' ', 1)[0]
    return street.strip()

def get_street_no(whole_street):
    street_no = whole_street.rsplit(' ', 1)[1]
    return street_no

def get_postcode(address_line):
    postcode = re.findall('\n.([0-9]*)', address_line)[0]
    return postcode

# Call your code
parsed_addresses = all_addresses(raw_addresses)

unq_cities = get_unique_cities(parsed_addresses)

expected_addresses = group_addresss_by_city(parsed_addresses, unq_cities)
print(expected_addresses)

{'München': [{'street': 'Sigmund Wulff Gasse', 'street_no': '936', 'postcode': '7247', 'city': 'München'}, {'street': 'Edelgard Ruppert Gasse', 'street_no': '466', 'postcode': '5556', 'city': 'München'}, {'street': 'Dippelstr.', 'street_no': '785', 'postcode': '25561', 'city': 'München'}, {'street': 'Schaafplatz', 'street_no': '8', 'postcode': '7706', 'city': 'München'}], 'Aue': [{'street': 'Schollweg', 'street_no': '8', 'postcode': '1984', 'city': 'Aue'}, {'street': 'Anatoli Geisel Ring', 'street_no': '09', 'postcode': '2077', 'city': 'Aue'}, {'street': 'Karzplatz', 'street_no': '844', 'postcode': '0113', 'city': 'Aue'}], 'Mainz Kostheim': [{'street': 'Nancy Beyer Gasse', 'street_no': '7/0', 'postcode': '0939', 'city': 'Mainz Kostheim'}, {'street': 'Kai Uwe Kitzmann Platz', 'street_no': '30', 'postcode': '7024', 'city': 'Mainz Kostheim'}, {'street': 'Eckbauergasse', 'street_no': '4/0', 'postcode': '0115', 'city': 'Mainz Kostheim'}, {'street': 'Ernst-Dieter-Reinhardt-Straße', 'street_n

In [31]:
# Feel free to add more checks here 👇

# Check
assert result == expected_addresses, 'Oops something is wrong'
print('Yeahhh you made it!')

# The reason check is not passed is:
# the expected_addresses list has an additional comma ',' after the list item.
# the result of my function does not include that. Which makes sense since no comma is necessary after the last item?
# Also, the expected_addresses is not the same as the goal defined in the beginning.
# The function to group by addresses is not here yet. I will hopefully do it soon :)

AssertionError: Oops something is wrong