# Text Mining

---

## Address and Name Matching

### Identifying an address and its components


In [1]:
import re
import pyparsing as pp

In [2]:
from pyparsing import *

# define number as a set of words
# NOTE: adjacent string literals are concatenated with no separator, so every
# continued fragment must end with its own space. Without it "Ten" and
# "Eleven" fuse into the single bogus word "TenEleven" and neither is matched.
units = oneOf("Zero One Two Three Four Five Six Seven Eight Nine Ten "
          "Eleven Twelve Thirteen Fourteen Fifteen Sixteen Seventeen Eighteen Nineteen",
          caseless=True)
# "Fourty" is listed next to "Forty" - presumably to tolerate the common
# misspelling; confirm before removing it.
tens = oneOf("Ten Twenty Thirty Forty Fourty Fifty Sixty Seventy Eighty Ninety",caseless=True)
hundred = CaselessLiteral("Hundred")
thousand = CaselessLiteral("Thousand")
# an optional hyphen is allowed between any two number words
OPT_DASH = Optional("-")
# A spelled-out number is either
#   units [thousand] [units hundred] [tens]
# or a bare tens word (`^` keeps whichever alternative matches more text),
# in both cases optionally followed by a trailing units word.
numberword = ((( units + OPT_DASH + Optional(thousand) + OPT_DASH +
                  Optional(units + OPT_DASH + hundred) + OPT_DASH +
                  Optional(tens)) ^ tens )
               + OPT_DASH + Optional(units) )

# House number: either a spelled-out number or digits in one of the forms
# 123, 21B, 222-A or 23 1/2.
# A single trailing letter (optionally after a dash) only counts when it is
# followed by whitespace.
_letter_suffix = OPT_DASH + oneOf(list(alphas)) + FollowedBy(White())
_digits_with_letter = Combine(Word(nums) + Optional(_letter_suffix))
_house_half = Optional(OPT_DASH + "1/2")
housenumber = originalTextFor(numberword | (_digits_with_letter + _house_half))

# Ordinal suffixes (1st, 2nd, 3rd, 4th) used by numbered streets.
numberSuffix = oneOf("st th nd rd").setName("numberSuffix")

# Numbered street: digits, an optional "1/2", an optional ordinal suffix.
_street_half = Optional(OPT_DASH + "1/2")
streetnumber = originalTextFor(Word(nums) + _street_half + Optional(numberSuffix))

# A plain alphabetic word (Maple, Main, ...) that is not an ordinal suffix.
name = ~numberSuffix + Word(alphas)

# types of streets - extend as desired
# NOTE: adjacent string literals are concatenated with no separator, so each
# continued fragment must end with a space. Without it "Sq" and "Loop" fuse
# into the bogus keyword "SqLoop" and neither is recognised as a street type.
_STREET_TYPE_WORDS = ("Street St Boulevard Blvd Lane Ln Road Rd Avenue Ave "
                      "Circle Cir Cove Cv Drive Dr Parkway Pkwy Court Ct Square Sq "
                      "Loop Lp Sector").split()
# Keyword (rather than Literal) makes each type match whole words only.
# A concrete list is passed instead of a lazy `map` object so MatchFirst
# receives a real sequence. A trailing "." is accepted but dropped.
type_ = Combine( MatchFirst([Keyword(word) for word in _STREET_TYPE_WORDS])
                 + Optional(".").suppress())

# street name
# nsew: a compass direction (full word or abbreviation), optionally followed by ".".
nsew = Combine(oneOf("N S E W North South East West NW NE SW SE") + Optional("."))
# streetName is the longest match (`^`) among three alternatives:
#   1. a numbered street: optional direction, a streetnumber, then an optional
#      "1/2" and an optional ordinal suffix;
#   2. one or more alphabetic words (each optionally followed by "."), where no
#      word is a street type and the first is not an ordinal suffix;
#   3. the literal "Avenue" followed by one alphabetic word.
# joinString=" " with adjacent=False lets the pieces be separated by whitespace
# in the input and joins them with single spaces in the result.
streetName = (Combine( Optional(nsew) + streetnumber + 
                        Optional("1/2") + 
                        Optional(numberSuffix), joinString=" ", adjacent=False )
                ^ Combine(~numberSuffix + OneOrMore(~type_ + Combine(Word(alphas) + Optional("."))), joinString=" ", adjacent=False) 
                ^ Combine("Avenue" + Word(alphas), joinString=" ", adjacent=False)).setName("streetName")

# PO Box handling
def acronym(s):
    """Build a Regex for the letters of ``s`` written as an acronym.

    Each letter may be followed by an optional "." and letters may be
    separated by optional whitespace, so "PO" also matches "P.O." or "P. O.".
    """
    between_letters = r"\.?\s*"
    return Regex(between_letters.join(s)+r"\.?")

# NOTE(review): "AFP" is kept exactly as in the original grammar; it may have
# been intended as "FPO" - confirm before changing.
_box_prefix = acronym("PO") | acronym("APO") | acronym("AFP")
# prefix, optional word BOX (any case), then the box identifier
poBoxRef = (_box_prefix + Optional(CaselessLiteral("BOX"))) + Word(alphanums)("boxnumber")

# basic street address
# streetReference: a street name with an optional street type, exposed under
# the results names "name" and "type".
streetReference = streetName.setResultsName("name") + Optional(type_).setResultsName("type")
# direct: a house number (results name "number") followed by a street reference.
direct = housenumber.setResultsName("number") + streetReference
# intersection: two street references joined by '@' or the keyword "and"
# (case-insensitive), exposed as "crossStreet" and "street".
intersection = ( streetReference.setResultsName("crossStreet") + 
                 ( '@' | Keyword("and",caseless=True)) +
                 streetReference.setResultsName("street") )
# streetAddress: a PO box, a direct address, a bare street reference, or an
# intersection. `^` tries every alternative and keeps the one that consumes
# the most input; the first three are exposed under the results name "street".
streetAddress = ( poBoxRef("street")
                  ^ direct.setResultsName("street")
                  ^ streetReference.setResultsName("street")
                  ^ intersection )

# Sample addresses, one per line. Entries keep their leading indentation and
# the final entry is whitespace-only; consumers are expected to strip each
# entry and skip the empty ones.
tests = """\
    3120 De la Cruz Boulevard
    100 South Street
    14 Main Road
    23 Sector
    One Union Square, Apt 22-C
    """.splitlines()

# Secondary unit designator (Apt, Suite, Room, ...): a keyword in any case,
# an optional ".", then an identifier made of letters, digits and dashes.
_suite_keyword = oneOf("Suite Ste Apt Apartment Room Rm #", caseless=True)
_suite_number = Word(alphanums+'-')("suitenumber")
suiteRef = _suite_keyword + Optional(".") + _suite_number

# Extend the address grammar: an optional ", <suite>" may follow the street
# part; the comma itself is dropped from the results.
_suite_clause = Suppress(',') + suiteRef("suite")
streetAddress = streetAddress + Optional(_suite_clause)

# Parse every sample address and report the pieces of each match.
for address_text in (raw.strip() for raw in tests):
    if not address_text:
        # whitespace-only entries carry no address
        continue
    print(address_text)
    # parseAll=True: the grammar must consume the whole string
    addr = streetAddress.parseString(address_text, parseAll=True)
    print("Number:", addr.street.number)
    print("Street:", addr.street.name)
    print("Type:", addr.street.type)
    if addr.street.boxnumber:
        print("Box:", addr.street.boxnumber)
    print(addr.dump())
    print()

3120 De la Cruz Boulevard
('Number:', '3120 ')
('Street:', 'De la Cruz')
('Type:', 'Boulevard')
['3120 ', 'De la Cruz', 'Boulevard']
- name: De la Cruz
- number: 3120 
- street: ['3120 ', 'De la Cruz', 'Boulevard']
  - name: De la Cruz
  - number: 3120 
  - type: Boulevard
- type: Boulevard
()
100 South Street
('Number:', '100 ')
('Street:', 'South')
('Type:', 'Street')
['100 ', 'South', 'Street']
- name: South
- number: 100 
- street: ['100 ', 'South', 'Street']
  - name: South
  - number: 100 
  - type: Street
- type: Street
()
14 Main Road
('Number:', '14 ')
('Street:', 'Main')
('Type:', 'Road')
['14 ', 'Main', 'Road']
- name: Main
- number: 14 
- street: ['14 ', 'Main', 'Road']
  - name: Main
  - number: 14 
  - type: Road
- type: Road
()
23 Sector
('Number:', '')
('Street:', '23 ')
('Type:', 'Sector')
['23 ', 'Sector']
- name: 23 
- street: ['23 ', 'Sector']
  - name: 23 
  - type: Sector
- type: Sector
()
One Union Square, Apt 22-C
('Number:', 'One ')
('Street:', 'Union')
('Type:

In [74]:
import nameparser

In [95]:
# Sample personal names; only name3 is parsed below.
name1, name2, name3 = "Sunit Prasad", "Dr. Bruce Banter", "Sumeet Bansal"

# Last expression of the cell, so the parsed HumanName is displayed.
nameparser.HumanName(name3)

<HumanName : [
	title: '' 
	first: 'Sumeet' 
	middle: '' 
	last: 'Bansal' 
	suffix: ''
	nickname: ''
]>

In [98]:
# Validate a company name that may start with an "M/s" / "Ms" prefix.
coname = "M/s Alabs Pvt Ltd"
# The original pattern wrote the prefix as the character class [Ms|M/S]*,
# which matches any run of the characters M, s, S, | and /, so junk such as
# "|/Acme" was accepted. The prefix is now an explicit optional group.
co_pattern = re.compile(
    r"^(?:M/?s\.?\s*)?"        # optional "M/s", "Ms", "M/s." ... prefix (any case)
    r"[a-zA-Z0-9]"             # the name proper must start with a letter or digit
    r"[a-zA-Z0-9.\-#&\s]*$",   # then letters, digits, '.', '-', '#', '&', whitespace
    re.IGNORECASE,
)

m = co_pattern.match(coname)
if m:
    print(m.group(0))

M/s Alabs Pvt Ltd
