In [1]:
import re

class PostingsWrapper:
    """
    Links one term of the index dictionary to its slot in the shared postings list.

    Attributes:
        frequency: number of distinct postings recorded for the term through this wrapper.
        postings_index: position in the postings list where the term's postings are stored.
    """

    def __init__(self, postings_list, posting, postings_index):
        """
        Registers a new term by appending a fresh one-element postings entry.

        :param postings_list: the shared list of per-term postings; a new entry is appended to it.
        :param posting: the first posting recorded for the term.
        :param postings_index: position of the new entry in postings_list, as supplied by the caller.
        """
        postings_list.append([posting])
        self.postings_index = postings_index
        self.frequency = 1

    def add_posting(self, postings_list, posting):
        """
        Records another posting for a term that already has a postings entry.

        A posting that is already present is ignored, so the frequency only counts
        distinct postings.

        :param postings_list: the shared list of per-term postings.
        :param posting: the posting to record for this term.
        :return: returns nothing
        """
        term_postings = postings_list[self.postings_index]
        if posting in term_postings:
            return
        term_postings.append(posting)
        self.frequency += 1


class index:
    """
    Inverted index over a tab-separated tweets file (tweets.csv or any file with the same
    structure).

    Attributes:
        data: list of [tweet_id, normalized_text] pairs produced by preprocess().
        index: dictionary mapping each term to its PostingsWrapper instance.
        postings_list: list of lists; entry i holds the sorted tweet ids of the term whose
            PostingsWrapper has postings_index == i.
    """

    # Matches URLs as well as every character that is neither a lowercase letter nor
    # whitespace. Compiled once because it is applied to every tweet.
    _NOISE = re.compile(r'https?://\S*|[^a-z\s]')

    def __init__(self, file):
        """
        :param file: path to tweets.csv file.
        """
        self.data = self.preprocess(file)
        self.index, self.postings_list = self.create_index()

    def preprocess(self, file, max_tweets=5000):
        """
        Reads the raw file and reduces it to tweet ids with normalized tweet texts.

        Each line is split on tabs; the second column is used as the tweet id and the fifth
        column as the tweet text. Lines with fewer than five columns (for example blank
        lines) are skipped. The text is lowercased, then URLs and every character that is
        not a lowercase letter or whitespace are removed.

        :param file: path to tweets.csv file.
        :param max_tweets: maximum number of tweets to keep (None keeps all of them).
        :return: data, a list of [tweet_id, normalized_text] lists.
        """
        # The context manager guarantees the file handle is closed again.
        with open(file) as handle:
            raw_text = handle.read()

        rows = (line.split('\t') for line in raw_text.split('\n'))
        # Filter instead of removing items while iterating, which would skip elements.
        data = [[row[1], row[4].lower()] for row in rows if len(row) > 4]
        data = data[:max_tweets]

        for entry in data:
            entry[1] = self._NOISE.sub('', entry[1])

        return data

    def create_index(self, limit=100):
        """
        Creates the index and postings list.

        :param limit: number of leading tweets from self.data to index (None indexes all).
        :return: index, a dictionary having a unique term as key and a PostingsWrapper instance
        as value, and postings_list, a large list of lists containing all postings for each unique
        term.
        """
        # Sorting the (token, tweet id) pairs groups equal terms together and leaves every
        # term's postings in ascending order, which the merge in query_and relies on.
        # str.split() never yields empty tokens, so no extra length check is needed.
        tokens_and_ids = sorted(
            [token, tweet_id]
            for tweet_id, text in self.data[:limit]
            for token in text.split()
        )

        term_index = {}
        postings_list = []
        for token, tweet_id in tokens_and_ids:
            wrapper = term_index.get(token)
            if wrapper is None:
                # A new wrapper appends exactly one entry to postings_list, so the current
                # length is the position that entry will occupy.
                term_index[token] = PostingsWrapper(
                    postings_list, tweet_id, len(postings_list)
                )
            else:
                wrapper.add_posting(postings_list, tweet_id)

        return term_index, postings_list

    def query_one(self, term):
        """
        Queries for a term.

        :param term: query term
        :return: a copy of the postings list for the term; if the term is not in the index,
        a message is printed and None is returned.
        """
        try:
            wrapper = self.index[term]
        except KeyError:
            print('No results for query.')
            return None
        return list(self.postings_list[wrapper.postings_index])

    def query_and(self, term1, term2):
        """
        Queries for the intersection of two terms.

        :param term1: first term
        :param term2: second term
        :return: the postings present in the postings lists of both terms, in postings
        order; an empty list if either term is not in the index.
        """
        wrapper1 = self.index.get(term1)
        wrapper2 = self.index.get(term2)
        if wrapper1 is None or wrapper2 is None:
            return []

        postings1 = self.postings_list[wrapper1.postings_index]
        postings2 = self.postings_list[wrapper2.postings_index]

        # Linear merge of two sorted lists: always advance the side holding the smaller
        # posting, and advance both sides on a match.
        intersection = []
        pos1 = pos2 = 0
        while pos1 < len(postings1) and pos2 < len(postings2):
            current1 = postings1[pos1]
            current2 = postings2[pos2]
            if current1 == current2:
                intersection.append(current1)
                pos1 += 1
                pos2 += 1
            elif current1 < current2:
                pos1 += 1
            else:
                pos2 += 1

        return intersection
    
# Build the inverted index from tweets.csv.
# NOTE: this rebinds the name `index` from the class to the instance, so the class
# itself is no longer reachable by that name once this line has run.
index = index('tweets.csv')

In [7]:
# Smoke test: look up two single terms, then the intersection of their postings.
print(index.query_one('pcr'),'\n')
print(index.query_one('centers'),'\n')
print(index.query_and('pcr', 'centers'),'\n')


['965672579133566980'] 

['965672579133566980'] 

['965672579133566980'] 



In [None]:
# What do I want to remove?
# numbers
# punctuation
# web addresses
# emoticons
# @ signs
# all of this should be outside of names!!



#regex = '(?<!([^\s\.]))[0-9]*|http:\/\/[^\s]*|[!"#\$%&\(\)\*\+,-\.\/:;<=>\?\[\\\]\^`{\|}~]+'
# regex = '((?<!([^\s\.]))[0-9]*)*(http:\/\/[^\s]*)*([!"#\$%&\(\)\*\+,-\.\/:;<=>\?\[\\\]\^`{\|}~]+)*'
# names = '(?<!([^\s\.]))[0-9]*'
# websites = 'https?:\/\/[^\s]*'
# punctuation = '[!"#$%&\'()*+,-.\/:;<=>?@\[\\\]^_`{|}~'
# newline = '\[newline\]'
# emoji = '[\U00010000-\U0010ffff]'
# everything = '[\U00010000-\U0010ffff]|\[newline\]|[!"#$%&\'()*+,-.\/:;<=>?@\[\\\]^_`{|}~|https?:\/\/[^\s]*|(?<!([^\s\.]))[0-9]*'
# everything_not = '[^@[^\s]*[a-z]*]'