## Import statements

In [45]:
import numpy as np
import pandas as pd
import spacy
nlp = spacy.load('en_core_web_sm')
from spacy import displacy
from spacy.matcher import Matcher, PhraseMatcher

## Validating for Number of Rooms

### Test dataframe

In [46]:
# Load the 'Flagging_misclassifications' sheet of the misclassification workbook into `test`.
# The file name is resolved relative to the notebook's working directory.
test = pd.read_excel('FullListings_2018_Misclassifications (002).xlsx', sheet_name='Flagging_misclassifications')

### Number of rooms validating function

In [47]:
#three fold checker for rooms 
#check numbers, if there are multiple numbers, 
#check neighbors if there is an 'in'
# check single room renting 

# Lookup table from a title token to the room count it stands for.
# All keys are upper case.
number_validate_dict = {
    # spelled-out numbers
    "ONE": 1,
    "TWO": 2,
    "THREE": 3,
    "FOUR": 4,
    "FIVE": 5,
    "SIX": 6,
    "SEVEN": 7,
    "EIGHT": 8,
    "NINE": 9,
    # bedroom abbreviations
    "1BD": 1,
    "2BD": 2,
    "1/BD": 1,
    "2/BD": 2,
}

def _bedrooms_match(listed, expected):
    """Return True when the listed bedroom value equals `expected` as an integer.

    A listed value that int() cannot convert (NaN, None, free text, infinity)
    counts as a mismatch instead of raising.
    """
    try:
        return int(listed) == expected
    except (TypeError, ValueError, OverflowError):
        return False


#validates number of rooms in a house
def validate_num_rooms(dataframe):
    """Compare each row's `bedrooms` value with the number words found in its title.

    input = dataframe with a `title` and a `bedrooms` column

    output = three parallel lists, one entry per title in which at least one
    known number token was found:
        1) indices of the checked rows
        2) room strings in the format 'rooms_available/total_rooms' (smallest
           number found / the other number; a single number is used for both)
        3) "T"/"F" flags: whether `bedrooms` equals the smallest number found

    Only titles whose parse contains one or two 'nummod'/'nmod' tokens are
    considered, and only 'nummod' tokens present in `number_validate_dict`
    contribute a number. The lookup is case-insensitive. Rows whose title is
    not a string (e.g. NaN) are skipped; a `bedrooms` value that cannot be
    converted to an integer is reported as "F".
    """
    num_rooms_global = []
    idx_matches = []
    correct_rooms = []

    for row in dataframe.itertuples():
        raw_title = row.title
        if not isinstance(raw_title, str):
            # Missing titles arrive as NaN/None; nlp() cannot parse those.
            continue
        rooms_avail = row.bedrooms
        doc = nlp(raw_title)

        # Only check titles that have at most 2 (and at least 1) number-like tokens.
        dep_labels = [token.dep_ for token in doc]
        num_count = dep_labels.count('nummod') + dep_labels.count('nmod')
        if not 0 < num_count <= 2:
            continue

        # Translate every 'nummod' token the lookup table knows into a number.
        num_rooms = []
        for token, dep in zip(doc, dep_labels):
            if dep.lower() != 'nummod':
                continue
            # The table's keys are upper case, so normalise the token text first.
            value = number_validate_dict.get(str(token).upper())
            if value is not None:
                num_rooms.append(value)

        if not num_rooms:
            continue

        # Smallest number = rooms available; the other one (if any) = total rooms.
        num_rooms.sort()
        available = num_rooms[0]
        total = num_rooms[1] if len(num_rooms) > 1 else available

        idx_matches.append(row.Index)
        num_rooms_global.append(str(available) + "/" + str(total))
        correct_rooms.append("T" if _bedrooms_match(rooms_avail, available) else "F")

    return idx_matches, num_rooms_global, correct_rooms

### Results for validator:

In [48]:
# Run the validator on the full test dataframe. The three returned lists are parallel:
# row index, room string, and "T"/"F" flag for each checked title.
indeces, rooms, bool_list = validate_num_rooms(test)

In [49]:
# Coverage summary: rows in the dataframe, rows the validator returned a result for,
# and how many of those results are flagged "T" versus "F".
print("Total number of titles: ", test.shape[0])
print("Total number of titles checked by function: ", len(indeces))
print("Total number of titles considered true by function in dataframe: ", bool_list.count("T"))
print("Total number of titles considered false by function in dataframe: ", bool_list.count("F"))

Total number of titles:  317
Total number of titles checked by function:  154
Total number of titles considered true by function in dataframe:  67
Total number of titles considered false by function in dataframe:  87


## Flagger for Misclassifications

### Compiling keywords from multiple sources

In [50]:
# Keyword source 1: the 'Auto-Skip Phrases' sheet of the flagged Cambridge listings workbook.
words = pd.read_excel('Cambridge_listings_2018_flagged.xlsx', sheet_name='Auto-Skip Phrases')
# `words` is rebound here from the dataframe to a plain list of its 'ROOM AVAILABLE IN' column.
words = words['ROOM AVAILABLE IN'].tolist()
# Keyword source 2: the 'keywords' sheet of the misclassification workbook (kept as a dataframe).
words_2 = pd.read_excel('FullListings_2018_Misclassifications (002).xlsx', sheet_name='keywords')

In [51]:
# Extract the previously used keywords from the 'keywords' sheet, one column at a time.
words_c1 = words_2['roomrent'].tolist()        # whole column
words_c2 = words_2['sublet'][:3].tolist()      # first 3 entries
words_c3 = words_2['shortterm'][:1].tolist()   # first entry
words_c4 = words_2['shared'][:3].tolist()      # first 3 entries

all_words = [words_c1, words_c2, words_c3, words_c4]

# Flatten the four groups, in order, into a single list.
words_mis_camb = []
for w in all_words:
    words_mis_camb.extend(w)

In [52]:
#Cleaning keywords
# Overwrite selected entries by list position. The positions are hard-coded against the
# order in which words_mis_camb was concatenated (roomrent, sublet[:3], shortterm[:1],
# shared[:3]), so they must be revisited if that cell or the 'keywords' sheet changes.
words_mis_camb[3] = 'ONE ROOM IN'
words_mis_camb[4] = 'ONE BEDROOM IN'
words_mis_camb[5] = 'ONE BEDROOM AVAILABLE IN'
words_mis_camb[12] = 'ROOMS AVAILABLE IN'
words_mis_camb[15] = 'ROOM IN'
words_mis_camb[21] = 'BEDROOMS AVAILABLE IN'
words_mis_camb[23] = 'BEDROOMS OPEN IN'

In [53]:
# Wrap the cleaned keywords in a one-column dataframe so isin() can be applied to them.
words2 = pd.DataFrame(words_mis_camb, columns=['words'])
# Boolean mask from isin(), negated: keep only the keywords that are NOT already in `words`.
isnotindf = words2[~words2["words"].isin(words)]
isnotindf = isnotindf['words'].tolist()
# Final keyword list: the additional keywords first, then every entry of `words`.
# Nothing here removes repeated phrases, so duplicates within either source are kept.
phrase_list = isnotindf + words

#### Complete compiled list of keywords

In [54]:
# Current list of keywords, displayed for inspection.
# filter_listing (defined below) builds its patterns from this list, so any additional
# phrases must be added to phrase_list before that function is called.
phrase_list 

['ROOM RENT',
 'ROOMMATE',
 'ROOMIE',
 'ONE ROOM IN',
 'ONE BEDROOM IN',
 'ROOM AVAILABLE IN',
 'ONE BEDROOM IN',
 'ONE ROOM IN',
 'ROOMMATES NEEDED',
 'ROOM IN',
 'ROOM IN',
 'SUBLET',
 'SHORT TERM',
 'SHARED',
 'SHARE',
 'ONE BEDROOM AVAILABLE IN',
 'PRIVATE ROOM',
 'ONE ROOM AVAILABLE',
 'FURNISHED BEDROOM',
 'APARTMENT SHARING',
 'ROOMS AVAILABLE IN',
 'ROOMMATES',
 'PRIVATE BEDROOM',
 'RENTING ROOM',
 'MASTER BEDROOM IN',
 'BEDROOMS AVAILABLE IN',
 'ONE BEDROOM OPEN IN',
 'BEDROOMS OPEN IN',
 'SUBLEASING',
 'SUBLEASE']

### Function to flag partial units

In [55]:
def filter_listing(dataframe, keywords=None):
    
    '''
    Flag the listings whose title contains any of the given keyword phrases.

    Input Parameters: 1) dataframe - pandas dataframe from which the index
                         and title will be extracted
                      2) keywords - optional list of phrases to search for;
                         defaults to the notebook-level `phrase_list`

    Output: Returns two parallel lists with one entry per keyword match
            (a listing that matches several phrases appears several times):
                    1) index of flagged listing
                    2) keyword found in listing

    Rows whose title is not a string (e.g. NaN) are skipped.
    '''
    if keywords is None:
        keywords = phrase_list

    #Spacy Phraser Object
    matcher = PhraseMatcher(nlp.vocab)

    # Convert each phrase to a Doc object. PhraseMatcher compares token text by
    # default, so tokenisation (make_doc) is enough; the full pipeline is not needed.
    phrase_patterns = [nlp.make_doc(text) for text in keywords]

    # Pass each Doc object into matcher (note the use of the asterisk!).
    # NOTE(review): this is the spaCy v2 call form (second argument is the on_match
    # callback); spaCy v3 expects matcher.add(key, [patterns]) - update on upgrade.
    matcher.add('Cambridge_keywords', None, *phrase_patterns)

    all_matches = []
    idx_matches = []

    # Looping through dataset and instantiating a spaCy document per title
    for row in dataframe.itertuples():
        raw_title = row.title
        if not isinstance(raw_title, str):
            # Missing titles arrive as NaN/None and cannot be tokenised.
            continue
        doc = nlp.make_doc(raw_title)

        # One output entry per matched phrase occurrence in this title
        for _match_id, start, end in matcher(doc):
            #Index that should be dropped - flagged
            idx_matches.append(row.Index)
            #Keyword that was flagged for that listing
            all_matches.append(doc[start:end].text)

    return idx_matches, all_matches

### Testing Flagger function

In [56]:
# First 25 rows of `test`, shuffled. random_state pins the shuffle so that
# Restart Kernel -> Run All reproduces the same row order.
# NOTE(review): the cells visible here call filter_listing on `test`, not `test2` -
# confirm whether this sample is still needed.
test2 = test.head(25)
test2 = test2.sample(frac=1, random_state=0)

In [57]:
# Run the flagger on the full test dataframe.
# Note: this rebinds `indeces`, which earlier held the row indices returned by validate_num_rooms.
indeces, matches = filter_listing(test)

In [59]:
# len(indeces) counts keyword matches, not distinct listings: filter_listing appends the
# row index once per match, so a title that matches several phrases is counted several times.
print("Total lines in dataframe: ", test.shape[0])
print("Total number of flagged entries by function: ", len(indeces))

Total lines in dataframe:  317
Total number of flagged entries by function:  131
