In [None]:
import re
import string
from nltk.corpus import stopword

In [125]:
class Node:
    """
    A single element of a linked list.

    Holds one value and a reference to the node that follows it
    (``None`` when it is the last node).
    """

    def __init__(self, value=None):

        self.value = value
        self.next = None


class LinkedList:
    """
    A singly linked list used to chain the postings of a term together.

    The list remembers its last node so that appending does not have to walk
    the whole chain each time. Nodes may still be appended by hand through
    ``node.next``; ``at_end`` re-validates the cached tail before using it.
    Unlinking nodes by hand (other than resetting ``head`` to ``None``) is not
    tracked by the cached tail.
    """

    def __init__(self, value):

        self.head = Node(value)
        # Last node of the chain; kept so at_end() is O(1) instead of O(n).
        self._tail = self.head

    def at_end(self, new_value):
        """
        Adds a new node at end of list.
        :param new_value: the value to store in the new last node.
        """
        new_node = Node(new_value)

        # An emptied list (head reset to None) starts over with the new node.
        if self.head is None:
            self.head = new_node
            self._tail = new_node
            return

        # Normally the cached tail already is the last node; the loop only runs
        # if nodes were linked on externally since the last append.
        tail = self._tail
        while tail.next is not None:
            tail = tail.next

        tail.next = new_node
        self._tail = new_node

    def __iter__(self):
        """
        Iterate over the stored values from head to tail.
        """
        node = self.head

        while node is not None:
            yield node.value
            node = node.next

    def to_list(self):
        """
        Get the LinkedList as a normal list.
        :return as_list: values of the LinkedList, head first, as a normal list.
        """
        return list(self)

In [132]:
class Index:
    """
    An inverted index built from a tab-separated file of tweets.

    Attributes set by the constructor:
    index: dict mapping each cleaned term to ``[frequency, postings]``, where
        ``frequency`` is the number of distinct tweet ids the term occurs in and
        ``postings`` is a LinkedList of those ids.
    postings_list: list returned by create_index (currently always empty).
    tweets_by_id: dict mapping tweet id to tweet text.
    """

    def __init__(self, file, max_lines=1000):
        """
        Build the index from a file.
        :param file: path to the tab-separated tweets file.
        :param max_lines: maximum number of lines to read from the file
        (``None`` reads all of them).
        """
        self.index, self.postings_list, self.tweets_by_id = self.create_index(file, max_lines)

    def clean(self, token):
        """
        Cleans each token before adding it to index.
        :param token: the token to be cleaned.
        :return: the cleaned token, lower-cased.
        :raises ValueError: if nothing but whitespace is left after cleaning.
        """

        # Here we expand sequences of contractions. The pairs are applied in order, so
        # "won't" has to come before the generic "n't" suffix; otherwise "n't" rewrites it
        # to "wo not" first and the dedicated "will not" expansion can never apply.
        contractions = (
            ("it's", "it is"), ("he's", "he is"), ("she's", "she is"),
            ("that's", "that is"), ("what's", "what is"), ("there's", "there is"),
            ("[newline]", " "), ("'m", " am"), ("'ve", " have"),
            ("won't", "will not"), ("n't", " not"), ("'ll", " will"),
            ("'re", " are"), ("'d", " would"), ("'s", ""),
        )

        for contraction, expansion in contractions:
            token = token.replace(contraction, expansion)

        # Replace every character that is neither a word character nor whitespace by a space.
        token = re.sub(r'[^\w\s]', ' ' , token)
        # Replace a digit and everything after it, up to a following whitespace character.
        token = re.sub(r'[0-9].*\s', ' ' , token)
        # Replace "http"/"https" and what follows it, up to a following whitespace character.
        token = re.sub(r'https?.+\s', ' ' , token)
        # Final removal pass; the pattern is kept exactly as originally written. Its first
        # alternative deletes from a whitespace character through the last word character on
        # the line, so an expanded contraction such as "it is" ends up as "it".
        # NOTE(review): this is a raw string, so the backslash, the line break and the
        # indentation that follow are part of the pattern, not a line continuation. The third
        # alternative therefore only matches in front of that literal newline and run of
        # spaces. Confirm the intent before reformatting this literal.
        token = re.sub(r'[\W].+[^\W\s]+|[^ ]+\.[^ ]+ |[^a-zA-Zäöüß\s]+ \
                         | \d+|[^\w\s]+.[^\W\s]+| https?','', token)

        # Here we perform a check to make sure that at least one non space character has
        # survived the cleaning process. ValueError is still caught by ``except Exception``.
        if not token or token.isspace():
            raise ValueError('empty token')

        # Finally, we return the lowercase'd token.
        return token.lower()

    def create_index(self, file, max_lines=1000):
        """
        Method to create index, postings list and list of tweets with associated IDs.

        Each line of the file is split on tabs; the second field is used as the tweet id
        and the last field as the tweet text. Lines with fewer than two fields (for
        example a trailing blank line) are skipped. Tokens that clean to nothing are
        skipped individually, so the remaining tokens of the tweet are still indexed.

        :param file: path to tweets.csv file.
        :param max_lines: maximum number of lines to read (``None`` reads all of them).
        :return index: a dictionary of terms, inserted in sorted order, of which the
        values are a list containing the number of distinct tweet ids for the term and a
        LinkedList of those ids in sorted order.
        :return postings_list: always an empty list; the postings are stored in the
        LinkedList held by each index entry. Returned to keep the three-value result.
        :return tweets_by_id: a dictionary of tweet ids with accompanying tweets.
        """
        with open(file, 'r') as f:
            lines = f.read().split('\n')[:max_lines]

        tweet_rows = []
        # A set removes repeated (term, id) pairs up front, so no membership scan over
        # the postings is needed while building the index.
        term_id_pairs = set()

        for line in lines:
            fields = line.split('\t')

            if len(fields) < 2:
                continue

            tweet_id, tweet = fields[1], fields[-1]
            tweet_rows.append((tweet_id, tweet))

            for token in tweet.split():
                try:
                    term = self.clean(token)
                except ValueError:
                    # Nothing indexable left in this token; move on to the next one.
                    continue
                term_id_pairs.add((term, tweet_id))

        # For a repeated id the row that sorts last wins.
        tweets_by_id = dict(sorted(tweet_rows))

        index = {}

        # Sorted order yields terms alphabetically and, per term, ids in ascending order.
        for term, tweet_id in sorted(term_id_pairs):
            entry = index.get(term)

            if entry is None:
                index[term] = [1, LinkedList(tweet_id)]
            else:
                entry[0] += 1
                entry[1].at_end(tweet_id)

        postings_list = []

        return index, postings_list, tweets_by_id

    def get_frequency(self, term):
        """
        Get the frequency stored in the index for a certain term.
        :param term: term for frequency query.
        :return int: frequency as int, or None (after printing a message) if the term
        is not in the index.
        """
        try:
            # Read this instance's own index rather than a module-level variable.
            return self.index[term][0]
        except KeyError:
            print('Term not found.')
            return None

    def get_all_frequencies(self):
        '''
        Get frequencies of all terms in index.
        :return frequencies: list of tuples containing frequency, term, sorted in descending order.
        '''
        frequencies = [(entry[0], term) for term, entry in self.index.items()]

        return sorted(frequencies, reverse=True)
            
                
            
            
                
            

In [133]:
# Build the inverted index from 'tweets.csv' (path relative to the working directory);
# the Index constructor reads and indexes the file immediately.
index = Index('tweets.csv')