# <font color=red>Libraries</font>

In [2]:
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
import warnings as wng
import re
from tqdm import tqdm
from datetime import datetime, date, timedelta
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from ast import literal_eval
from IPython.display import display, Markdown, clear_output

## <font color=red>Setting Environment</font>
# <font color=yellow>FLAG - remove comment #(s)</font>

In [3]:
# for ignoring warning
wng.filterwarnings('ignore')
# setting display size
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# <font color=red>Functions and Utilities</font>

## <font color=aqua>custom print function</font>

In [4]:
def newPrint(title, msg="", size=3, center=False):
    n = size
    if msg != "":
        title = title + ": "
    if center:
        return display(Markdown(n*'#' + ' <center> <font color=green>'+title+'</font>' +
                                ' <font color=red>'+'<b>'+'<em>'+str(msg).replace('\n','<br>')+'</em>'+'</b>'+'</font> </center>'))
    else:
        return display(Markdown(n*'#' + ' <font color=green>'+title+'</font>' +
                                ' <font color=red>'+'<b>'+'<em>'+str(msg).replace('\n','<br>')+'</em>'+'</b>'+'</font>'))

## <font color=aqua>Func for taking count of all occurences of specific word in all address database </font> 

In [5]:
def fetch_words(frame, col, all_words=True):
    out = []
    for s in frame[col]:
        s=str(s)
        for ss in s.split():
            if all_words:
                try:
                    out.append(ss.upper().strip())
                except:
                    continue

    word_corpus = Counter(out)
    return word_corpus

## <font color=aqua>Func to access all keys with given specifc value</font>

In [6]:
def get_key(val,my_dict):
    out=[] 
    for key, value in my_dict.items(): 
         if val == value: 
              out.append(key) 
    return out

## <font color=aqua>Func to insert space between alpha-numeral </font>

In [7]:
def insert_spaces(regex,str1):
    return ' '.join((re.sub(regex, ' \g<0> ', str1)).split())

##  <font color=aqua>Func to recognise dtype of word</font>

In [8]:
def ifnum(z):
    return "Numeric" if len(re.findall(r'\d',z))!= 0 else "NonNumeric"

## <font color=aqua>Func to fetch pincodes from address to a new column</font>

In [9]:
def extract_pin(mdf,col,dest,reg):# mdf-dataframe, col-address col, dest-pinode, reg- re.pattern for pincode
    mdf[dest] = mdf[col].str.extract(reg)
    mdf[col] = mdf[col].str.replace(reg,'',regex=True)

## <font color=aqua>Lists for removing and replacing char from addresses</font> 

In [10]:
rem_list = ["EXTN","EXT","EXTENSION","NR","NEAR","DEHLI","DELLI","DELHI","DILLI","DELHI","DELHGI","DELH I","DELH","DILHI","DIHLI"]
plot_list = ["PLOT NOS","NOF","FNO","HNO","H NO","HNOE","HNOM","FLAT"]
street_list = ['STREET','STREETNO','STREET NO','STNO','GLII','GLI',"GALI","GALI NO","GN","GNO","GALIN","GALINO","G-NO","GALO","GALO NO","NOM","GALLI","GALLI NO","GAALI","GAALI NO","BUND GALI","BAND GALI","BAND GALLI","NOG","NO-G"]
plot_dict = {plot: "HOUSE NO" for plot in plot_list}
street_dict = {street: "STREET" for street in street_list}
loc_list = ['NAGAR','COLONY']
FLOOR_list = ['FLOOR','FLR','FLOR','GF','FF','SF','TF','UGF','LGF','IF','IIF','IIIF','G F','F F','S F','T F','IV F','TH F','UG F','LG F','FL']
repl_dict0 = {
            "FEET":"FUTA","FOOTA":"FUTA","FEETA":"FUTA","FOOT":"FUTA","FOOTA ROAD":"FUTA","FOOTA RAOD":"FUTA","FOOT ROAD":"FUTA","FOOTA RODA":"FUTA"\
            ,"FOOTA ROD":"FUTA","FEET ROAD":"FUTA","FEET RAOD":"FUTA","FOOT RD":"FUTA","FEET RD":"FUTA","FOOTA RD":"FUTA","FEETA RD":"FUTA","FUT":"FUTA",\
            "FUT ROAD":"FUTA","FUT ROD":"FUTA","FUTA ROAD":"FUTA","FUTA RAOD":"FUTA","FIT ROAD":"FUTA","FITA ROAD":"FUTA","FITA RAOD":"FUTA","FITA RD":"FUTA"\
            ,"FUTA":"FUTA","PARAK":"PARK","PRAK":"PARK","PRK":"PARK","PARK":"PARK","GRDN":"GARDEN","GARDN":"GARDEN","GARDAN":"GARDEN","GRDEN":"GARDEN",\
            "GRDAN":"GARDEN","GERDEN":"GARDEN","GERDAN":"GARDEN","GDN":"GARDEN","GARDEN":"GARDEN","GADREN":"GARDEN","COLNY":"COLONY","COLINY":"COLONY",\
            "CLONY":"COLONY","CALONY":"COLONY","CLY":"COLONY","CLNY":"COLONY","CNY":"COLONY","COLONI":"COLONY","COLNIY":"COLONY","COLONIY":"COLONY",\
            "COLONIY":"COLONY","CALONEY":"COLONY","COLONE":"COLONY","COLONU":"COLONY","CLONEY":"COLONY","KOLONY":"COLONY","KALONI":"COLONY",\
            "COLONY":"COLONY","VIHR":"VIHAR","VHR":"VIHAR","VHAR":"VIHAR","VEHAR":"VIHAR","VIHAAR":"VIHAR","VAHAR":"VIHAR","VIHAR":"VIHAR",\
            "ENCLAV":"ENCLAVE","ENCALAVE":"ENCLAVE","ENCALAV ":"ENCLAVE","ENCALAV ":"ENCLAVE","INCLAVE":"ENCLAVE","ENCLAVE":"ENCLAVE",\
            "DILHAD":"DILSHAD","DELSHAD":"DILSHAD","DLSHAD":"DILSHAD","DILSHAD":"DILSHAD","BAG ":"BAGH","BAHG ":"BAGH","BAHGH":"BAGH",\
            "BHAG":"BAGH","BAAG ":"BAGH","BAAGH":"BAGH","BHAAG":"BAGH","BAGH":"BAGH","SHAHADARA":"SHAHDARA","SHADARA":"SHAHDARA","SADARA":"SHAHDARA",\
            "SHADRA":"SHAHDARA","SADRA":"SHAHDARA","SHAHDRA":"SHAHDARA"
            }
repl_dict = {
            'AP':'APARTMENTS','ADD':'ADDRESS','APT':'APARTMENTS','APPT':'APARTMENTS','ADMN':'ADMINISTRATION','APTT':'APARTMENTS','APART':'APARTMENTS'\
            ,'APARTMANT':'APARTMENTS','AHINSHAVATIKA':'AHINSA VATIKA','BL':'BLOCK','BLK':'BLOCK','BLOK':'BLOCK','BLOC':'BLOCK','BLCOK':'BLOCK','CLY':'COLONY',\
            'CGHS':'APARTMENTS','CHILL':'CHILLA','CICEK':'VIVEK','COMPLES':'COMPLEX','DASH':'RAVIDAS','DABAL':'DOUBLE','DALLUPRA':'DALLUPURA',\
            'DHARMSHAL':'DHARMSHALA','DHARAMSALA':'DHARMSHALA','DHARMASHALA':'DHARMSHALA','EXT':'EXTENSION','ENV':'ENCLAVE','ECV':'ENCLAVE','ENC':'ENCLAVE'\
            ,'EXTN':'EXTENSION','ENCV':'ENCLAVE','ENCL':'ENCLAVE','EXTEN':'EXTENSION','EXTENED':'EXTENDED','ENCALVE':'ENCLAVE','FL':'FLOOR','FR':'FIRST',\
            'FT':'FOOTA','FLR':'FLOOR','FLO':'FLOOR','FNO':'FLAT NO','FLT':'FLAT','FLOR':'FLOOR','FALT':'FLAT','FLOO':'FLOOR','FLOT':'FLAT','FLOAR':'FLOOR',\
            'FUTTA':'FOOTA','FLORE':'FLOOR','FLATE':'FLAT','FLATNO':'FLAT NO','GD':'GHAROLI DAIRY','GF':'GROUND FLOOR','GRD':'GROUND','GOP':'GOPAL',\
            'GOVT':'GOVERNMENT','GHRO':'GHAROLI','GORAK':'GORAKH','GALINO':'GALI NO','GHALORI':'GHAROLI','GITANJALI':'GEETANJALI','HNO':'HOUSE NO',\
            'HIMALYA':'HIMALAYA','HARGOBIND':'HARGOVIND','IP':'INDRAPRASTHA','JAMUNA':'YAMUNA','KH':'KHASRA','KS':'KHASRA','KN':'KHASRA','KHNO':'KHASRA','KONDLY':'KONDLI',\
            'LG':'LOWERGROUND','LL':'LOWERGROUND','LTD':'LIMITED','LIF':'LIG','LOOR':'FLOOR','MAYU':'MAYUR','MAND':'MANDIR','MANDER':'MANDIR','MAHABIR':'MAHAVIR'\
            ,'MANWABAN':'MANBHAWAN','MULTISTORE':'MULTI STOREY','MANSAROVER':'MANSAROVAR','NEWNO':'NEW NO','NUBMER':'NUMBER','NALAPAAR':'NALA PAAR',\
            'NAGARUJANA':'NAGARJUNA','OPP':'OPPOSITE','OTH':'OTHER','OLDNO':'OLD NO','OPPSITE':'OPPOSITE','PH':'PHASE','PKT':'POCKET','PNO':'PLOT NO','PCK':'POCKET','PVT':'PRIVATE','PROP':'PROPERTY','POKT':'POCKET','POCK':'POCKET','PLOY':'PLOT','POLT':'PLOT','PLOR':'PLOT','PRAK':'PARK','PKTA':'POCKET A','PLOAT':'PLOT','PCKET':'POCKET','PODHO':'PAUDHE','PLAOT':'PLOT','PLOTNO':'PLOT NO','PUADHE':'PAUDHE','PARVANA':'PARWANA','PANCHEEL':'PANCHSHEEL','PANSHEEL':'PANCHSHEEL','PRIYADARS':'PRIYADARSHINI','QTR':'QUARTERS','QRTS':'QUARTERS','QARTR':'QUARTERS','QUARTERS':'QUARTERS','RAJBLOC':'RAJ BLOCK','SEC':'SECTION','SAUD':'SAUDAGAR','SITA':'SITARAM','STNO':'STREET NO','SHADRA':'SHAHDARA','SRIRAM':'SHRIRAM','SHAHDRA':'SHAHDARA','SHEKHAR':'SHEKHER','SOCITIES':'SOCIETY','SADBHAWNA':'SADHBHAWNA','SADHBHAVNA':'SADHBHAWNA','SHIVMANDIR':'SHIV MANDIR','SHREEGANESH':'SHRIGANESH','SHALIMARPARK':'SHALIMAR PARK','TF':'THIRD FLOOR','TYP':'TYPE','TYPR':'TYPE','TRIVANI':'TRIVENI','TAKSHILA':'TAKSHSHILA','TECHNOLOG':'TECHNOLOGY','UG':'UPPERGROUND','UGF':'UPPERGROUND FLOOR','UPR':'UPPER','UPPAR':'UPPER','UCHEPAR':'UNCHEPAR','UTTRANCHA':'UTTRANCHAL','YUDHISTER':'YUDHISHTHIR','YUDISHTER':'YUDHISHTHIR','YUDHESTAR':'YUDHISHTHIR','YUDHISHTIR':'YUDHISHTHIR','YUDHISTHIR':'YUDHISHTHIR','C G H S':'APARTMENTS','GAUTAM SSHB GAMIL COM':' ','SHELLYKHATHURIA1994 GMAIL COM':' ','IGLDELHI GMAIL COM':' ','RATURI PRADEEP26 GMAIL COM':' ','I P':'INDRAPRASTHA','I P EXTENSION':'INDRAPRASTHA EXTENSION','PARK END':'PARKEND','EAST END':'EASTEND','VIGYAN LOK':'VIGYANLOK','ANAND LOK':'ANANDLOK','SANCHAR LOK':'SANCHARLOK','VIGYAPAN LOK':'VIGYAPANLOK','NAV JAGRITI':'NAVJAGRITI','NAV RACHNA':'NAVRACHNA','NAV BHARAT':'NAVBHARAT','SHANKAR PUR':'SHANKARPUR','TAHIR PUR':'TAHIRPUR','HASAN PUR':'HASANPUR','SEELAM PUR':'SEELAMPUR','SHIV PURI':'SHIVPURI','GURGA PURI':'DURGAPURI','TRILOK PURI':'TRILOKPURI','TIRLOK PURI':'TRILOKPURI','JAGAT PURI':'JAGATPURI','DWARIKA PURI':'DWARKAPURI','RIVER VIEW':'RIVERVIEW','LAKE VIEW':'LAKEVIEW','PARK VIEW':'PARKVIEW','SRI RAM':'SHRIRAM','SRI KRISHNA':'SHRIKRISHNA','SRI GURU':'SHRIGURU'
            }
repl_dict2 = {'APARTMENT':'APARTMENTS','BAZAR':'BAZAAR','RAMDAS':'RAMDASS','UNCHE':'UNCHEPAR','YOJANA':'YOJNA','SRESHTHA':'SHRESHTHA','DEPT':'DEPARTMENT','ENGINEERS':'ENGINEER','BARI':'BADI','RAVIDAS':'RAVIDASS','AGARSEN':'AGRASEN','MIX':'MIXED','NAVNITI':'NAVNEETI','GOURAV':'GAURAV','NOA':'NO','IGESI':'ESI','SIDHARTH':'SIDDHARTH','PARSHVA':'PARSHVANATH','MANAVASATHALI':'MANAVSTHALI','ASHISHWANG':'ASHISHWONG','KALLOL':'KALOL','TEACHER':'TEACHERS','VIKALPA':'VIKALP','TRILOK':'TRILOKYA','GRD':'GROUND','COMPUTER':'COMPUTERS','SISHU':'SHISHU','POLT':'PLOT','ASSOCIATED':'ASSOCIATE','SHEKHER':'SHEKHAR','BLK':'BLOCK','SHAHDRA':'SHAHDARA'}
repl_dict3 = {'F/F':'FIRST','S/F':'SECOND','T/F':'THIRD','IST':'FIRST','3 ND':'THIRD','II ND':'SECOND','U G':'UPPERGROUND','GRD':'GROUND','BLOCK NO':'BLOCK','8 TH':'EIGHTH','11 TH':'ELEVENTH','15 TH':'FIFTEENTH','5 TH':'FIFTH','VF':'FIFTH FLOOR','V F':'FIFTH FLOOR','1 ST':'FIRST','IF':'FIRST FLOOR','I F':'FIRST FLOOR','FF':'FIRST FLOOR','F F':'FIRST FLOOR','FL':'FLOOR','FLR':'FLOOR','FLOR':'FLOOR','48 TH':'FORTYEIGHTH','14 TH':'FOURTEENTH','4 TH':'FOURTH','IVF':'FOURTH FLOOR','IV F':'FOURTH FLOOR','GF':'GROUND FLOOR','G F':'GROUND FLOOR','FALT':'HOUSE NO','FLT':'HOUSE NO','FLOT':'HOUSE NO','COMPOUNDNO':'HOUSE NO','FNO':'HOUSE NO','FLATNO':'HOUSE NO','HNO':'HOUSE NO','ROOMNO':'HOUSE NO','FL NO':'HOUSE NO','FLAT NO':'HOUSE NO','H NO':'HOUSE NO','F NO':'HOUSE NO','ROOM NO':'HOUSE NO','FLAT':'HOUSE NO','HN':'HOUSE NO','HOUSENO':'HOUSE NO','PROPNO':'HOUSE NO','SHOPNO':'HOUSE NO','FTNO':'HOUSE NO','SNO':'HOUSE NO','PVTNO':'HOUSE NO','FLATE':'HOUSE NO','HO NOB':'HOUSE NO B','H NOC':'HOUSE NO C','KHASRA NO':'KHASRA','KH NO':'KHASRA','K NO':'KHASRA','KILLA NO':'KHASRA','LGF':'LOWERGROUND FLOOR','LG F':'LOWERGROUND FLOOR','L GF':'LOWERGROUND FLOOR','NEW NO':'NEW','19 TH':'NINETEENTH','9 TH':'NINTH','90 TH':'NINTIETH','OLDNO':'OLD','OLD NO':'OLD','PARK NO':'PARK','PLOTNO':'PLOT','PLOT NO':'PLOT','P NO':'PLOT','PNO':'PLOT','POLT':'PLOT','PN':'PLOT','PLOR':'PLOT','PLOY':'PLOT','PLOAT':'PLOT','PLAOT':'PLOT','PLT':'PLOT','2 ND':'SECOND','IIF':'SECOND FLOOR','II F':'SECOND FLOOR','SF':'SECOND FLOOR','S F':'SECOND FLOOR','7 TH':'SEVENTH','VIIF':'SEVENTH FLOOR','VII F':'SEVENTH FLOOR','16 TH':'SIXTEENTH','6 TH':'SIXTH','VIF':'SIXTH FLOOR','VI F':'SIXTH FLOOR','60 TH':'SIXTIETH','GALNO':'STREET','GALINO':'STREET','STNO':'STREET','GALI NO':'STREET','STREET NO':'STREET','LANE NO':'STREET','GALI':'STREET','GLII':'STREET','GLI':'STREET','GNO':'STREET','GN':'STREET','G NO':'STREET','STREETNO':'STREET','GATENO':'STREET','10 TH':'TENTH','3 RD':'THIRD','IIIF':'THIRD FLOOR','III F':'THIRD FLOOR','TF':'THIRD FLOOR','T F':'THIRD FLOOR','THF':'THIRD FLOOR','TH F':'THIRD FLOOR','RDFLOOR':'THIRD FLOOR','THFLOOR':'THIRD FLOOR','13 TH':'THIRTEENTH','30 TH':'THIRTIETH','38 TH':'THIRTYEIGHTH','TOWER NO':'TOWER','12 TH':'TWELFTH','UGF':'UPPERGROUND FLOOR','UG F':'UPPERGROUND FLOOR','U GF':'UPPERGROUND FLOOR','11 ST':'ELEVENTH','12 ND':'TWELFTH','13 RD':'THIRTEENTH','17 TH':'SEVENTEENTH','18 TH':'EIGHTEENTH','20 TH':'TWENTIETH','21 ST':'TWENTYFIRST','22 ND':'TWENTYSECOND','23 RD':'TWENTYTHIRD','24 TH':'TWENTYFOURTH','25 TH':'TWENTYFIFTH','26 TH':'TWENTYSIXTH','27 TH':'TWENTYSEVENTH','28 TH':'TWENTYEIGHTH','29 TH':'TWENTYNINTH','31 ST':'THIRTYFIRST','32 ND':'THIRTYSECOND','33 RD':'THIRTYTHIRD','34 TH':'THIRTYFOURTH','35 TH':'THIRTYFIFTH','36 TH':'THIRTYSIXTH','37 TH':'THIRTYSEVENTH','39 TH':'THIRTYNINTH','40 TH':'FORTIETH','41 ST':'FORTYFIRST','42 ND':'FORTYSECOND','43 RD':'FORTYTHIRD','44 TH':'FORTYFOURTH','45 TH':'FORTYFIFTH','46 TH':'FORTYSIXTH','47 TH':'FORTYSEVENTH','49 TH':'FORTYNINTH','50 TH':'FIFTIETH','51 ST':'FIFTYFIRST','52 ND':'FIFTYSECOND','53 RD':'FIFTYTHIRD','54 TH':'FIFTYFOURTH','55 TH':'FIFTYFIFTH','56 TH':'FIFTYSIXTH','57 TH':'FIFTYSEVENTH','58 TH':'FIFTYEIGHTH','59 TH':'FIFTYNINTH','61 ST':'SIXTYFIRST','62 ND':'SIXTYSECOND','63 RD':'SIXTYTHIRD','64 TH':'SIXTYFOURTH','65 TH':'SIXTYFIFTH','66 TH':'SIXTYSIXTH','67 TH':'SIXTYSEVENTH','68 TH':'SIXTYEIGHTH','69 TH':'SIXTYNINTH','70 TH':'SEVENTIETH','71 ST':'SEVENTYFIRST','72 ND':'SEVENTYSECOND','73 RD':'SEVENTYTHIRD','74 TH':'SEVENTYFOURTH','75 TH':'SEVENTYFIFTH','76 TH':'SEVENTYSIXTH','77 TH':'SEVENTYSEVENTH','78 TH':'SEVENTYEIGHTH','79 TH':'SEVENTYNINTH','80 TH':'EIGHTIETH','81 ST':'EIGHTYFIRST','82 ND':'EIGHTYSECOND','83 RD':'EIGHTYTHIRD','84 TH':'EIGHTYFOURTH','85 TH':'EIGHTYFIFTH','86 TH':'EIGHTYSIXTH','87 TH':'EIGHTYSEVENTH','88 TH':'EIGHTYEIGHTH','89 TH':'EIGHTYNINTH','90 TH':'NINETIETH','91 ST':'NINETYFIRST','92 ND':'NINETYSECOND','93 RD':'NINETYTHIRD','94 TH':'NINETYFOURTH','95 TH':'NINETYFIFTH','96 TH':'NINETYSIXTH','97 TH':'NINETYSEVENTH','98 TH':'NINETYEIGHTH','99 TH':'NINETYNINTH','100 TH':'HUNDREDTH','1ST':'FIRST','2ND':'SECOND','3RD':'THIRD','4TH':'FOURTH','5TH':'FIFTH','6TH':'SIXTH','7TH':'SEVENTH','8TH':'EIGHTH','9TH':'NINTH','10TH':'TENTH','11TH':'ELEVENTH','12TH':'TWELFTH','13TH':'THIRTEENTH','14TH':'FOURTEENTH','15TH':'FIFTEENTH','16TH':'SIXTEENTH','17TH':'SEVENTEENTH','18TH':'EIGHTEENTH','19TH':'NINETEENTH','20TH':'TWENTIETH','21ST':'TWENTYFIRST','22ND':'TWENTYSECOND','23RD':'TWENTYTHIRD','24TH':'TWENTYFOURTH','25TH':'TWENTYFIFTH','26TH':'TWENTYSIXTH','27TH':'TWENTYSEVENTH','28TH':'TWENTYEIGHTH','29TH':'TWENTYNINTH','30TH':'THIRTIETH','31ST':'THIRTYFIRST','32ND':'THIRTYSECOND','33RD':'THIRTYTHIRD','34TH':'THIRTYFOURTH','35TH':'THIRTYFIFTH','36TH':'THIRTYSIXTH','37TH':'THIRTYSEVENTH','38TH':'THIRTYEIGHTH','39TH':'THIRTYNINTH','40TH':'FORTIETH','41ST':'FORTYFIRST','42ND':'FORTYSECOND','43RD':'FORTYTHIRD','44TH':'FORTYFOURTH','45TH':'FORTYFIFTH','46TH':'FORTYSIXTH','47TH':'FORTYSEVENTH','48TH':'FORTYEIGHTH','49TH':'FORTYNINTH','50TH':'FIFTIETH','51ST':'FIFTYFIRST','52ND':'FIFTYSECOND','53RD':'FIFTYTHIRD','54TH':'FIFTYFOURTH','55TH':'FIFTYFIFTH','56TH':'FIFTYSIXTH','57TH':'FIFTYSEVENTH','58TH':'FIFTYEIGHTH','59TH':'FIFTYNINTH','60TH':'SIXTIETH','61TH':'SIXTYFIRST','62ND':'SIXTYSECOND','63RD':'SIXTYTHIRD','64TH':'SIXTYFOURTH','65TH':'SIXTYFIFTH','66TH':'SIXTYSIXTH','67TH':'SIXTYSEVENTH','68TH':'SIXTYEIGHTH','69TH':'SIXTYNINTH','70TH':'SEVENTIETH','71ST':'SEVENTYFIRST','72ND':'SEVENTYSECOND','73RD':'SEVENTYTHIRD','74TH':'SEVENTYFOURTH','75TH':'SEVENTYFIFTH','76TH':'SEVENTYSIXTH','77TH':'SEVENTYSEVENTH','78TH':'SEVENTYEIGHTH','79TH':'SEVENTYNINTH','80TH':'EIGHTIETH','81ST':'EIGHTYFIRST','82ND':'EIGHTYSECOND','83RD':'EIGHTYTHIRD','84TH':'EIGHTYFOURTH','85TH':'EIGHTYFIFTH','86TH':'EIGHTYSIXTH','87TH':'EIGHTYSEVENTH','88TH':'EIGHTYEIGHTH','89TH':'EIGHTYNINTH','90TH':'NINETIETH','91ST':'NINETYFIRST','92ND':'NINETYSECOND','93RD':'NINETYTHIRD','94TH':'NINETYFOURTH','95TH':'NINETYFIFTH','96TH':'NINETYSIXTH','97TH':'NINETYSEVENTH','98TH':'NINETYEIGHTH','99TH':'NINETYNINTH','100TH':'HUNDREDTH'}
repl_dict = {}
repl_dict.update(repl_dict0)
repl_dict.update(repl_dict1)
repl_dict.update(repl_dict2)
repl_dict.update(repl_dict3)
repl_dict.update(plot_dict)
repl_dict.update(street_dict)
len(repl_dict)

592

## <font color=aqua>Func to remove unwanted words from corpus</font>

In [11]:
def remov_list(frame, col):
    frame[col] = frame[col].astype(str)
    for word in tqdm(rem_list, desc="ADDRESS processed", colour='green'):
        frame[col] = frame[col].str.replace(f'{word}', "")

## <font color=aqua>Check for word ID</font>

In [12]:
def word_id(word):
    word = word.upper()
    if word in plot_list:
        return 'PLOT MARKER'
    elif word in street_list:
        return 'STREET MARKER'
    elif word in FLOOR_list:
        return 'FLOOR MARKER'

## <font color=aqua>To fetch words before and after a word in a string <font color=red >(sir's code)</font></font>

In [13]:
def fetch_word_bef_aft_string(kw,str1,mode=0):
    b = ''
    a = ''
    loc = ''
    try:

        ix = str1.find(kw)
        l = len(kw)
        ch_b = ''
        ch_a = ''

        if ix == -1:
            b = ''
            a = ''
            loc = 'N'
        elif l == len(str1):
            b = ''
            a = ''
            loc = 'S'
        elif ix == 0:
            b = ''
            ch_a = str1[ix+l]
            loc = 'S'
            if ch_a == ' ':
                a = kw + ' ' + str1[ix+l:].split()[0]
            else:
                if str1.find(' ') != -1:
                    a = kw + str1[ix+l:str1.find(' ')]
                else:
                    a = kw + str1[ix+l:]
        elif ix + l == len(str1):
            a = ''
            loc = 'E'
            ch_b = str1[ix-1]
            if ch_b == ' ':
                b = str1[:ix].split()[-1] + ' ' + kw
            else:
                if str1[:ix].rfind(' ') != -1:
                    b = str1[str1[:ix].rfind(' ')+1:ix] + kw
                else:
                    b = str1[:ix]+kw
        else:
            ch_b = str1[ix-1]
            ch_a = str1[ix+l]
            loc = 'M'
            if ch_a == ' ':
                if ch_b == ' ':
                    b = b = str1[:ix].split()[-1] + ' ' + kw
                    a = kw + ' ' + str1[ix+l:].split()[0]
                else:
                    b = str1[str1[:ix].rfind(' ')+1:ix] + kw
                    a = str1[str1[:ix].rfind(' ')+1:ix] + \
                        kw + ' ' + str1[ix+l:].split()[0]
            else:
                if ch_b == ' ':
                    b = str1[:ix].split()[-1] + ' ' + kw + \
                        str1[ix+l:str1[ix+l:].find(' ')+ix+l]
                    a = kw + str1[ix+l:str1[ix+l:].find(' ')+ix+l]
                else:
                    b = str1[str1[:ix].rfind(' ')+1:ix] + kw + \
                        str1[ix+l:str1[ix+l:].find(' ')+ix+l]
                    a = str1[str1[:ix].rfind(' ')+1:ix] + kw + \
                        str1[ix+l:str1[ix+l:].find(' ')+ix+l]
    except:
        pass
    
    if mode==1:
        return b, a, loc
    else:
        return b, a

## <font color=aqua>To fetch address markers from an address <font color=red>(sir's code variant + personal)</font></font>

In [14]:
############################ Fetch House number ############################
def fetch_PLOT(z):
    pattern = re.compile(r"(\s+([0-9]+\s+)+)", re.IGNORECASE)
    z = "temp "+z
    extracted_substring = re.search(pattern, z)
    if extracted_substring == None:
        return None
    else:
        extracted_substring = extracted_substring.group()
    return extracted_substring


############################ Fetch Floor ############################
def fetch_FLOOR(add):
    try:
        if fetch_word_bef_aft_string('FLOOR',add,mode=1)[2]!='S':
            out = fetch_word_bef_aft_string('FLOOR',add)[0]
            return out.split()[0], ' '.join([x for x in add.split() if x not in (out.split())])
        else:
            out = fetch_word_bef_aft_string('FLOOR',add)[1]
            return out.split()[-1], ' '.join([x for x in add.split() if x not in (out.split())])
    except:
        return None,' '.join([x for x in add.split() if x != 'FLOOR'])
    
############################ Fetch Street ############################
def fetch_STREET(add):
    try:
        out = fetch_word_bef_aft_string('STREET',add)[1]
        if (out.split()[-1]).isdigit()==True:
            return out.split()[-1], ' '.join([x for x in add.split() if x not in (out.split())])
        else:
            out2 = fetch_word_bef_aft_string('STREET',add)[0]
            return out2.split()[0], ' '.join([x for x in add.split() if x not in (out2.split())])
            
    except:
        return None,' '.join([x for x in add.split() if x != 'STREET'])
    
############################ Fetch Pocket ############################
def fetch_POCKET(add):
    try:
        out = fetch_word_bef_aft_string('POCKET',add)[1]
        return out.split()[-1], ' '.join([x for x in add.split() if x not in (out.split())])            
    except:
        return None,' '.join([x for x in add.split() if x != 'POCKET'])
############################ Fetch Block ############################
def fetch_BLOCK(add):
    try:
        out = fetch_word_bef_aft_string('BLOCK',add)[1]
        if len(out.split()[-1])<=2:
            return out.split()[-1], ' '.join([x for x in add.split() if x not in (out.split())])
        else:
            out2 = fetch_word_bef_aft_string('BLOCK',add)[0]
            return out2.split()[0], ' '.join([x for x in add.split() if x not in (out2.split())])
    except:
        return None,' '.join([x for x in add.split() if x != 'BLOCK'])

 .

 .

.

# <font size="10" color=red>Main Code</font>

## <font color = aqua>Reading Dataset and creating Dataframe</font>

In [18]:
df = pd.read_csv(r"..\DATA\NE_Circle_consumer_mas.txt",sep="\t",header = 0)
df["ADDRESS_ORG"]=df["ADDRESS"].apply(lambda z: str(z).upper())
df.drop(['ADDRESS'],axis=1,inplace=True)
df.head()

Unnamed: 0,SDO_CD,CA_NO,ADDRESS_ORG
0,1211,101363856.0,"#, EXTN,478 KALANDER KOLONY, DILSHAD GARDEN, SHAHDARA, NEAR RED CROSS HOSPITAL, 110095"
1,1211,101296041.0,"#, 1438-A-45 GALI-4, #, BALBIR NAGAR EXTN SHAHDARA, #, 110032"
2,1211,101358487.0,"#, 1/5484 GALI NO-17, #, BALBIR NAGAR EXTN, #, 110032"
3,1211,101358755.0,"#, 1/6221 GALI NO-4, #, EAST ROHTASH NAGAR SHAHDARA, #, 110032"
4,1211,101220086.0,"T-10, #, #, NAVEEN SHAHDARA, #, 110032"


.

## <font color = aqua> Data Cleaning</font>

In [19]:
############################ Manually removing visible garbage values ############################
df.drop(index=[112925],axis="index",inplace=True)

############################ Checking and handling null in the dataframe ############################
for c in df.select_dtypes(float):
    df[c] = df[c].fillna(0)
    
############################ Inserting space between numerals and characters ############################
df['ADDRESS'] = df['ADDRESS_ORG'].apply(lambda x: insert_spaces(r'[0-9]+',x))

df.head()

Unnamed: 0,SDO_CD,CA_NO,ADDRESS_ORG,ADDRESS
0,1211,101363856.0,"#, EXTN,478 KALANDER KOLONY, DILSHAD GARDEN, SHAHDARA, NEAR RED CROSS HOSPITAL, 110095","#, EXTN, 478 KALANDER KOLONY, DILSHAD GARDEN, SHAHDARA, NEAR RED CROSS HOSPITAL, 110095"
1,1211,101296041.0,"#, 1438-A-45 GALI-4, #, BALBIR NAGAR EXTN SHAHDARA, #, 110032","#, 1438 -A- 45 GALI- 4 , #, BALBIR NAGAR EXTN SHAHDARA, #, 110032"
2,1211,101358487.0,"#, 1/5484 GALI NO-17, #, BALBIR NAGAR EXTN, #, 110032","#, 1 / 5484 GALI NO- 17 , #, BALBIR NAGAR EXTN, #, 110032"
3,1211,101358755.0,"#, 1/6221 GALI NO-4, #, EAST ROHTASH NAGAR SHAHDARA, #, 110032","#, 1 / 6221 GALI NO- 4 , #, EAST ROHTASH NAGAR SHAHDARA, #, 110032"
4,1211,101220086.0,"T-10, #, #, NAVEEN SHAHDARA, #, 110032","T- 10 , #, #, NAVEEN SHAHDARA, #, 110032"


In [17]:
############################ Replacing all special characters with space ############################
df['ADDRESS_ORG'] = df['ADDRESS_ORG'].apply(lambda x: x.replace("\\","/"))
df["ADDRESS_ORG"] = df["ADDRESS_ORG"].apply(lambda z:re.sub('[^A-Za-z0-9-/]', ' ', str(z)))
df["ADDRESS"] = df["ADDRESS"].apply(lambda z:re.sub('[^A-Za-z0-9]', ' ', str(z)))
df["ADDRESS_ORG"] = df["ADDRESS_ORG"].apply(lambda z:re.sub(r'\s*([-/])\s*', r'\1', str(z)))

############################ Creating address part markers ############################
df["PLOT"] = df['FLOOR'] = df['BLOCK'] = df['STREET'] = df["POCKET"] = None

############################ Moving pincodes to another column ############################
reg=re.compile(r'(1100\d\d)')
extract_pin(df,"ADDRESS","PINCODE",reg)

############################ Validating Data samples ############################
df.head()

Unnamed: 0,SDO_CD,CA_NO,ADDRESS_ORG,ADDRESS,PLOT,FLOOR,BLOCK,STREET,POCKET,PINCODE
0,1211,101363856.0,EXTN 478 KALANDER COLONY CLONI KOLONY DILSHAD GARDEN SHAHDARA NEAR RED CROSS HOSPITAL 110095,EXTN 478 KALANDER COLONY CLONI KOLONY DILSHAD GARDEN SHAHDARA NEAR RED CROSS HOSPITAL,,,,,,110095
1,1211,101296041.0,1438-A-45 GALI-4 BALBIR NAGAR EXTN SHAHDARA 110032,1438 A 45 GALI 4 BALBIR NAGAR EXTN SHAHDARA,,,,,,110032
2,1211,101358487.0,1/5484 GALI NO-17 BALBIR NAGAR EXTN 110032,1 5484 GALI NO 17 BALBIR NAGAR EXTN,,,,,,110032
3,1211,101358755.0,1/6221 GALI NO-4 EAST ROHTASH NAGAR SHAHDARA 110032,1 6221 GALI NO 4 EAST ROHTASH NAGAR SHAHDARA,,,,,,110032
4,1211,101220086.0,T-10 NAVEEN SHAHDARA 110032,T 10 NAVEEN SHAHDARA,,,,,,110032


In [105]:
df.sample(5)

Unnamed: 0,SDO_CD,CA_NO,ADDRESS_ORG,ADDRESS,PLOT,FLOOR,STREET,POCKET,BLOCK
262979,1251,153919536.0,"C-174/B, OLD NOC-174/A SECOND FLOOR, GALINO1, CHAND BAGH, #, 110094",C 174 B OLD NOC 174 A GALINO 1 C BLOCK CHAND BAGH 110094,174.0,SECOND,,,C-BLOCK
384708,1260,150174937.0,"B-257/12, ,OLD-(PART-257/12), GALI NO-7, ASHOK NAGAR SHAHDARA, NEAR SUKER BAZAR CHOWK, 110093",B 257 12 OLD PART 257 12 GALI NO 7 ASHOK NAGAR SHAHDARA NEAR SUKER BAZAR CHOWK 110093,257.0,,,,
355455,1251,101263237.0,"N-202, #, G NO-10, SADAT PUR EXTN, #, 110094",N 202 G NO 10 SADAT PUR EXTN 110094,202.0,,,,
203891,1211,101190810.0,"#, #, #, VILL CHHAJJUPUR SHAHDARA, #, 110032",VILL CHHAJJUPUR SHAHDARA 110032,,,,,
74197,1260,101631474.0,"#, SH GULAB SINGH, KH NO 37/2,B-267,GALI NO 9,HARSH VIHAR, #, #, 110093",SH GULAB SINGH KH NO 37 2 B 267 GALI NO 9 HARSH VIHAR 110093,37.0,,,,


## <font color=aqua> Removing unnecessary info from the address </font>

In [None]:
remov_list(df,'ADDRESS')

ADDRESS processed: 100%|[32m██████████[0m| 15/15 [00:07<00:00,  2.03it/s]


## <font color=aqua> Replacing known words and abbreviation in address from repl_dict  </font>

In [None]:
for k in tqdm(repl_dict.keys(),desc="Replacing Words from repl_dict",colour='green'):
    df['ADDRESS'] = df['ADDRESS'].apply(lambda x:x.replace(' ' + k + ' ',' ' + repl_dict[k] + ' '))
df.reset_index(drop=True,inplace=True)

Replacing Words from repl_dict:   0%|[32m          [0m| 0/592 [00:00<?, ?it/s]

Replacing Words from repl_dict: 100%|[32m██████████[0m| 592/592 [09:49<00:00,  1.00it/s]


## <font color = aqua>Segregating address markers</font>

In [21]:
%%time
df["PLOT"]=df["ADDRESS"].apply(lambda z: fetch_PLOT(z))

df['FLOOR']= df['ADDRESS'].apply(lambda x: fetch_FLOOR(x)[0])
df['ADDRESS'] = df['ADDRESS'].apply(lambda x:fetch_FLOOR(x)[1])

df['STREET'] = df['ADDRESS'].apply(lambda x: fetch_STREET(x)[0])
df['ADDRESS'] = df['ADDRESS'].apply(lambda x:fetch_STREET(x)[1])

df['POCKET'] = df['ADDRESS'].apply(lambda x: fetch_POCKET(x)[0])
df['ADDRESS_ORG'] = df['ADDRESS_ORG'].apply(lambda x:fetch_POCKET(x)[1])

df['BLOCK'] = df['ADDRESS'].apply(lambda x: fetch_BLOCK(x)[0])
df['ADDRESS_ORG'] = df['ADDRESS_ORG'].apply(lambda x:fetch_BLOCK(x)[1])

CPU times: total: 50.2 s
Wall time: 54.8 s


In [None]:
df.isna().sum()

SDO_CD              0
CA_NO               0
ADDRESS_ORG         0
ADDRESS             0
PLOT            11947
FLOOR          497411
BLOCK          880683
STREET         385457
POCKET         910958
PINCODE          6650
dtype: int64

In [79]:
df['ADDRESS_ORG','PLOT',"FLOOR",'FLOOR','BLOCK'].iloc[83937]

KeyError: ('ADDRESS_ORG', 'PLOT', 'FLOOR', 'FLOOR', 'BLOCK')

In [87]:
df.sample(5)

Unnamed: 0,SDO_CD,CA_NO,ADDRESS_ORG,ADDRESS,PLOT,FLOOR,STREET,POCKET,BLOCK
55370,1251,153752574.0,"B-2415-A, GROUND FLOOR KH 305,310 & 316, MAIN 25 FUTTA ROAD, SONIA VIHAR KARAWAL NAGAR, #, 110094","B- 2415 -A, KH 305 , 310 & 316 , MAIN 25 FUTTA ROAD, B BLOCK, SONIA VIHAR KARAWAL NAGAR, #, 110094",2415,GROUND,,,B
791168,1250,101402401.0,"#, W/O MAHESH KUMAR, D-201 KHN 498/149 MAIN ROAD,MAUJPUR, #, #, 110053","#, W/O MAHESH KUMAR, D- 201 KHN 498 / 149 MAIN ROAD,MAUJPUR, #, #, 110053",201,,,,
195057,1211,101266477.0,"1/42-B, #, #, DILSHAD GARDEN SHAHDARA, #, 110095","1 / 42 -B, #, #, DILSHAD GARDEN SHAHDARA, #, 110095",1,,,,
270788,1251,101371680.0,"#, OLD B-463-NEW B-449 GALI NO-6, #, PREM VIHAR KARAWAL NAGAR, #, 110094","#, OLD B- 463 -NEW B- 449 GALI NO- 6 , #, PREM VIHAR KARAWAL NAGAR, #, 110094",463,,,,
768950,1260,151990009.0,"D-19, S/F,KH NO 473,, EAST JYOTI NAGAR, #, 110093","D- 19 , S/F,KH NO 473 ,, D-BLOCK, EAST JYOTI NAGAR, #, 110093",19,,,,"D-BLOCK,"


.

## <font color = aqua>Replacing all similar words with the most frequent one </font>

## creating a word corpus <font color = green> [df_corpus] </font> and making various columns for it

In [106]:
w = fetch_words(df,'ADDRESS', all_words=True)
df_corpus = pd.DataFrame(zip(w.keys(),w.values()),columns = ['WORD','WORD_COUNT'])
df_corpus['WORD_TYPE'] = df_corpus['WORD'].apply(lambda z:ifnum(z))
df_corpus['WORD_LENGTH'] = df_corpus['WORD'].apply(lambda z:len(z))
df_corpus["WORD"] = df_corpus["WORD"].apply(lambda z:re.sub(r'NULLNULL\b', ' ', str(z)))
newPrint(str(df_corpus.shape))
df_corpus.head()

### <font color=green>(53518, 4)</font> <font color=red><b><em></em></b></font>

Unnamed: 0,WORD,WORD_COUNT,WORD_TYPE,WORD_LENGTH
0,EXTN,52899,NonNumeric,4
1,478,645,Numeric,3
2,KALANDER,819,NonNumeric,8
3,KOLONY,2,NonNumeric,6
4,DILSHAD,36923,NonNumeric,7


## <font color = aqua>Making a word only Dataframe <font color = green> [wordonly_corpus] </font> </font>
### (from corpus that are NonNumeric and bigger/equal to 3)

In [108]:
thresh_list=word_size_list=word_list =  []
############################ Customizing wordonly dataframe ############################
# wordonly_corpus = df_corpus[(df_corpus['WORD_TYPE']=='NonNumeric')&(df_corpus['WORD_LENGTH']>=3)][["WORD","WORD_LENGTH",'WORD_COUNT']]
wordonly_corpus = df_corpus[(df_corpus['WORD_TYPE']=='NonNumeric')&(df_corpus['WORD_LENGTH']>=3)][["WORD","WORD_LENGTH",'WORD_COUNT']].sort_values(by='WORD_LENGTH',ascending=0)
wordonly_corpus["FZ_THRESH"] = df_corpus["WORD_LENGTH"].apply(lambda z: int(100-100/(z/2)))  #Adding the calculated fuzzy threshold values for each word to a column 
wordonly_corpus["NO. OF MATCHES"] = 0
wordonly_corpus["BEST_MATCHES"] = ""
############################ exporting column to a various different lists ############################
thresh_list = wordonly_corpus["FZ_THRESH"].tolist()
word_count_dict = wordonly_corpus["WORD_COUNT"].tolist()
word_list = wordonly_corpus["WORD"].tolist()
word_size_list =  wordonly_corpus["WORD_LENGTH"].tolist()
word_list = [str.strip(i) for i in word_list]
# word_count_dict = [list(item) for item in zip(word_list , word_count_dict)]
word_count_dict = dict(zip(word_list,word_count_dict))
############################ Resetting and losing weights from dataframe ############################
wordonly_corpus.drop(["WORD_LENGTH","FZ_THRESH",'WORD_COUNT'],axis=1,inplace=True)
wordonly_corpus.reset_index(drop=True, inplace=True)
############################ Removing NULL from end of words ############################
wordonly_corpus["WORD"] = wordonly_corpus["WORD"].apply(lambda z:re.sub(r'NULLNULL\b', ' ', str(z)))

wordonly_corpus.head()

Unnamed: 0,WORD,NO. OF MATCHES,BEST_MATCHES
0,OPPOSITESENIORSECOUNDARYSCHOOL,0,
1,AMBEYENCLAVECHAUHANPATTISABHA,0,
2,SCHOOLJAVASCRIPTDATESELECTED,0,
3,MUKANDVIHARCHOWKCYCLEFACTORY,0,
4,BHAGATSINGHMOHLLANEWUSMANPUR,0,


## <font color = aqua>Finding similar words in corpus through fuzzy and adding to the word only dataframe</font>

rowindex = 0

for i in tqdm(range(len(word_size_list)), desc="Processing words", colour="green"):
    best_matches = []
    no_of_matches = 0
    
    for j in range(len(word_size_list)):
        if word_list[j] == word_list[i] or (word_size_list[j] > word_size_list[i]+2):
            continue
        elif word_size_list[j] < (word_size_list[i]-2):
            break
        else:
            score = fuzz.ratio(word_list[i],word_list[j])
            if score >= max(83,thresh_list[i]):
                best_matches.append(word_list[j])
                # print(best_matches,word_list[i])
                no_of_matches += 1
            else:
                continue
       
    if len(best_matches) > 0:
        wordonly_corpus.loc[i,"BEST_MATCHES"] = " ".join(best_matches)
        wordonly_corpus.loc[i, "NO. OF MATCHES"] = no_of_matches
    else:
        continue
    
    rowindex += 1
    
wordonly_corpus.reset_index(drop=True, inplace=True)

In [None]:
wordonly_corpus.head(15)

Unnamed: 0,WORD,NO. OF MATCHES,BEST_MATCHES
0,OPPOSITESENIORSECOUNDARYSCHOOL,0,
1,AMBEYENCLAVECHAUHANPATTISABHA,0,
2,SCHOOLJAVASCRIPTDATESELECTED,0,
3,RAJDHANIPUBLICSCHOOLWALIGALI,0,
4,BHAGATSINGHMOHLLANEWUSMANPUR,0,
5,MUKANDVIHARCHOWKCYCLEFACTORY,0,
6,AMBEDKARBASTIGHONDA,0,
7,VIDHYALAYASEELAMPURSHAHDARA,0,
8,MASJIDWALIGALITIRPALFECTORY,0,
9,IMRATKIRANASTIRESHIVMANDIR,0,


## <font color = aqua>Exporting to CSVs</font>

wordonly_corpus.to_csv(r"D:\Desktop\BYPL\DATA\better_similar_matches.csv", sep=',', index=False)
#if wanted an indexed csv file
# wordonly_corpus.to_csv(r"D:\Desktop\BYPL\DATA\better_indexed_similar_matches.csv", sep=',', index=True) 

In [109]:
wordonly_corpus = pd.read_csv(r"D:\Desktop\BYPL\DATA\new_better_similar_matches.csv", sep=',', header=0 )
wordonly_corpus = wordonly_corpus[wordonly_corpus['NO. OF MATCHES']>0][["WORD","BEST_MATCHES"]]

wordonly_corpus.head()

Unnamed: 0,WORD,BEST_MATCHES
11,BHAJANPURASHAHDARANULLNULL,BHAJANPURISHAHDARANULLNULL BHAJANPURSHAHDARANULLNULL
12,BHAJANPURISHAHDARANULLNULL,BHAJANPURASHAHDARANULLNULL BHAJANPURSHAHDARANULLNULL
14,ZAFARABADSHAHDARANULLNULL,JAFFARABASHAHDARANULLNULL ZAFRABADSHAHDARANULLNULL ZAFRABADSHADHARANULLNULL ZAFRABASHAHDARANULLNULL
17,BRAHAMPURSHAHDARANULLNULL,BRAHMPURISHAHDARANULLNULL BRAHMPURSHAHDARANULLNULL
19,JAFFARABASHAHDARANULLNULL,ZAFARABADSHAHDARANULLNULL ZAFRABASHAHDARANULLNULL


## <font color = aqua>Reading data from similar words CSV</font>

In [None]:
wordonly_corpus = pd.read_csv(r"D:\Desktop\BYPL\DATA\new_better_similar_matches.csv", sep=',', header=0 )

############################ Removing NULL suffix(s) from the data set, if present ############################
wordonly_corpus["BEST_MATCHES"] = wordonly_corpus["BEST_MATCHES"].apply(lambda z:re.sub(r'NULLNULL\b', ' ', str(z)))
wordonly_corpus["WORD"] = wordonly_corpus["WORD"].apply(lambda z:re.sub(r'NULLNULL\b', ' ', str(z)))

############################ Keeping neccessary/useful word only ############################
wordonly_corpus = wordonly_corpus[wordonly_corpus['NO. OF MATCHES']>0][["WORD","BEST_MATCHES"]]
wordonly_corpus.reset_index(drop=True,inplace=True)

wordonly_corpus.head()

Unnamed: 0,WORD,BEST_MATCHES
0,BHAJANPURASHAHDARA,BHAJANPURISHAHDARA BHAJANPURSHAHDARA
1,BHAJANPURISHAHDARA,BHAJANPURASHAHDARA BHAJANPURSHAHDARA
2,ZAFARABADSHAHDARA,JAFFARABASHAHDARA ZAFRABADSHAHDARA ZAFRABADSHADHARA ZAFRABASHAHDARA
3,BRAHAMPURSHAHDARA,BRAHMPURISHAHDARA BRAHMPURSHAHDARA
4,JAFFARABASHAHDARA,ZAFARABADSHAHDARA ZAFRABASHAHDARA


In [None]:
#test
word_count_dict["WAZERA"]

1

In [None]:
wordonly_corpus.isna().sum()

WORD            0
BEST_MATCHES    0
dtype: int64

In [None]:
a = ['BHAJANPURASHAHDARAlkjhgfds', 'BHAJANPURISHAHDARA', 'BHAJANPURSHAHDARA']
b = {'BHAJANPURASHAHDARAlkjhgfds': 18, 'BHAJANPURISHAHDARA': 5, 'BHAJANPURSHAHDARA': 15}

max_word = max(a, key=lambda x: b[x])
print(max_word)

BHAJANPURASHAHDARAlkjhgfds


In [None]:
def word_repl(list,word,string):
    for z in list:
        if z in string:
            return string.replace(f" {z} ",f" {word} ")
        else:
            continue

In [None]:
%%time
processed_words = set()
for i in tqdm(range(len(wordonly_corpus)),desc="Replaced Words",colour='green'):
    if wordonly_corpus['WORD'].iloc[i] not in processed_words:
        temp_word_list = (wordonly_corpus['BEST_MATCHES'].iloc[i] + wordonly_corpus['WORD'].iloc[2]).strip().split()
        best = max(temp_word_list, key=lambda x: word_count_dict[x])
        processed_words.update(k)
        temp_word_list.remove(best)
        df['ADDRESS_ORG'] = df['ADDRESS_ORG'].apply(lambda z: word_repl(temp_word_list,best,z))
        
    else:  
        continue
    

Replaced Words:   0%|[32m          [0m| 0/33669 [00:00<?, ?it/s]

Replaced Words:   0%|[32m          [0m| 1/33669 [00:00<8:42:28,  1.07it/s]


TypeError: argument of type 'NoneType' is not iterable

In [121]:
# z = "ooooooooooo"
# zz = ['BHAJANPURASHAHDARAlkjhgfds', 'BHAJANPURISHAHDARA', 'BHAJANPURSHAHDARA']
# zzz = "BHAJANPURASHAHDARAlkjhgfds"
# str = "asdfghjkqwertyui dfghjkl tyuio dtrctytvhbjnk ydtrchjvhbjn BHAJANPURISHAHDARA r cytbvjygnhmukjl sytdghj kd 5fbygunki bvyjnbhmk BHAJANPURASHAHDARAlkjhgfds ftjgbhn d fuighjom, d ufi6 gunhjmok 6rfuygbhj BHAJANPURSHAHDARA o8yivg"

# str.replace(f" {zzz} ",f" {z} ")
# ans = word_repl(zz,z,str)
# print(ans)

df.sample(5)

Unnamed: 0,SDO_CD,CA_NO,ADDRESS_ORG,ADDRESS,PLOT,FLOOR,STREET,POCKET,BLOCK
505617,1251,101521816.0,"H NO-1081, GALI NO-13, KH NO-379,RAJEEV GANDHI NGR,, NEW MUSTAFA BAD,DELHI, NEAR MADNI MASJID, 110094",H NO 1081 GALI NO 13 KH NO 379 RAJEEV GANDHI NGR NEW MUSTAFA BAD DELHI NEAR MADNI MASJID 110094,1081,,,,
79206,1250,153809011.0,"A-80/9, G/FLOOR, MAIN PUSTA ROAD, BHAJANPURA, #, 110053",A 80 9 A BLOCK MAIN PUSTA ROAD BHAJANPURA 110053,80,"G/FLOOR,",,,A-BLOCK
783784,1250,101492481.0,"C-1/8, #, #, IMAM BARA NEW SILAMPUR SHAHDARA, #, 110093",C 1 8 IMAM BARA NEW SILAMPUR SHAHDARA 110093,1,,,,
915743,1251,101343643.0,"A-1/18, OLD HNO-A-1, KHNO-289 & 292, KANHAIYA VIHAR, JOHRI PUR, DELHI, NEAR PNB BANK, 110094",A 1 18 OLD HNO A 1 KHNO 289 292 KANHAIYA VIHAR JOHRI PUR DELHI NEAR PNB BANK 110094,1,,,,
675945,1260,101561866.0,"O-537, #, #, SUNDER NAGRI, #, 110093",O 537 SUNDER NAGRI 110093,537,,,,
