In [99]:
class PostingsWrapper():
    """
    This postings wrapper creates a link between the index dictionary and the postings list.
    """
    def __init__(self, postings_list, posting, postings_index):
        self.frequency = 1
        self.postings_index = postings_index
        postings_list.append([posting])
        

    def add_posting(self, postings_list, posting):
        """
        
        Adds a posting to the postings list, at correct index according to the term
        Only called if the term has yet not corresponding postings.
        
        :param postings_list: postings list, an attribute of the index.
        :param posting: the posting to be added, extracted from a list of tokens and docids.
        :return: returns nothing
        """
        if posting not in postings_list[self.postings_index]:
            postings_list[self.postings_index].append(posting)
            self.frequency += 1

de_dic = []
with open('german.dic', 'r', encoding='latin-1') as f:
    for row in f:
        if len(row) >1:
            de_dic.append(row.strip().lower())
en_dic = []
with open('english.dic', 'r',) as f:
    for row in f:
        en_dic.append(row.strip().lower())

from nltk.stem import WordNetLemmatizer as wnl

class index:
    """
    Processes the tweets.csv file or any file containing the same structure, creates
    an inverted index. This is a dictionary terms as keys and an instance of the PostingsWrapper 
    class as value. Also creates a seperate postings list, also as an attribute, which contains
    all tweet ids where each term occured.
    """
    def __init__(self, file):
        """
        :param file: path to tweets.csv file.
        """
        self.data, self.data_index, self.data_dic = self.preprocess(file)
        self.index, self.postings_list = self.create_index()
        self.top_de, self.top_en = self.top_freq() 

    def preprocess(self, file):
        """
        Opens raw text, spits it into lines comprised of six columns, stores in intermediary
        tab_seperated variable.
        Then proceeds to normalize this while transfering it to data variable. Everything is lowered
        and compared to a regex which desires to only extract usernames and tokens containing 
        only letters. All irrelevant columns are disgarded.
        
        :param file: path to tweets.csv file.
        :return: data, containing tweet IDs with corresponding tweets 
        and a dictionary of all terms and original text
        """
        import re
        import string
        
        raw_text = open(file).read()
        tab_seperated = [item.split('\t') for item in raw_text.split('\n')]

        for line in tab_seperated:
            if len(line) == 1:
                tab_seperated.remove(line)

        data = []
        data_index = {}
        for i in range(len(tab_seperated)):
            data.append([tab_seperated[i][1], tab_seperated[i][4].lower()])
            data_index[tab_seperated[i][1]] = tab_seperated[i][4]
            
        data = data
        contractions = ["it's", "he's","she's","that's", "what's", "there's",\
                        "[newline]", "'m", "'ve","n't", "'ll","'re", "won't", "'d", "'s"]
        fixes = ["it is", "he is","she is","that is", "what is", "there is",\
                 " ", " am", " have", " not", " will", " are", "will not", " would", ""]
        for line in data:
            for i in range(len(contractions)):
                if contractions[i] in line[1]:
                    line[1]= line[1].replace(contractions[i], fixes[i])
            line[1] = re.sub(r'[^\w\s]', ' ' , line[1])
            line[1] = re.sub(r'[0-9].*\s', ' ' , line[1])
            line[1] = re.sub(r'https?.+\s', ' ' , line[1])
            line[1] = re.sub(r'[\W].+[^\W\s]+|[^ ]+\.[^ ]+ |[^a-zA-Zäöüß\s]+ \
                             | \d+|[^\w\s]+.[^\W\s]+| https?','', line[1])
            #line[1] = re.sub('https?:\/\/[^\s]*|[^a-z\s]', '', line[1])
        data_dic = {}
        for row in data:
            data_dic[row[0]] = row[1]

        return data, data_index, data_dic

    def create_index(self):
        """
        Creates the index and postings list.
        :return: index, a dictionary having a unique term as key and a PostingsWrapper instance
        as value, and postings_list, a large list of lists containing all postings for each unique
        term.
        """

        # We initialize the index, the postings list, and an intermediary tokens_and_ids variable.
        index = {}
        postings_list = []
        tokens_and_ids = []

        # For each line in data, we split each tweet by whitespace into tokens.
        # As a simple preprocessing step we check to make sure that the length of each token is
        # > 0 before appending the token and its tweet ID to the tokens_and_ids list.
        from nltk.corpus import stopwords
        stop_en = set(stopwords.words('english'))
        stop_germ = set(stopwords.words('german'))
        
        for line in self.data:
            for token in [x for x in line[1].split()  if (not x in stop_en) and (not x in stop_germ)]:
                if len(token) > 1:
                    tokens_and_ids.append([token, line[0]])

        # We sort our list of all tokens.
        
        tokens_and_ids.sort()

        # The postings_index variable we initialize here will be used as we instantiate
        # PostingsWrapper objects. This integer will enable us to keep track of the index
        # of the postings list where all of a given term's postings are contained.
        
        postings_index = 0
        
        # For each line in tokens_and_ids, we check to make sure it is not already in our index.
        # If it is not we add it, create a corresponding PostingsWrapper Object that will
        # add to the postings list as it is initialized. The PostingsWrapper will also keep track
        # of frequency for us.
        # Having done this we then increment the postings_index variable by 1.
        # If it is found that the term is already present in our index, we simply add the new 
        # posting to its postings list using the PostingsWrapper.add_posting method.
        for line in tokens_and_ids:
            if line[0] not in index.keys():
                index[line[0]] = PostingsWrapper(postings_list, line[1], postings_index)
                postings_index += 1
            else:
                 index[line[0]].add_posting(postings_list, line[1])

        return index, postings_list
            
    def get_frequency(self, term):
        """
        Pulls frequency from wrapper of term
        """
        try:
            return index.index[term].frequency
        except:
            print('Term not found.')
    
    def All_frequencies(self):
        '''
        return the term and frequencies in descending order
        '''
        frequencies = []
        for term in self.index.keys():
            frequencies.append((self.index[term].frequency, term))
        return sorted(frequencies)[::-1]
    def top_freq(self):
        freq_de = []
        freq_en = []
        frequencies = self.All_frequencies()[:200]
        for i,j in frequencies:
            if self.is_language(j) == 'german':
                freq_de.append(j)
            else:
                freq_en.append(j)
        return freq_de, freq_en
    
    def is_language(self,term):
        from nltk.corpus import stopwords
        import re
        stop_en = set(stopwords.words('english'))
        stop_germ = set(stopwords.words('german'))
        
        g_score = 0
        en_score = 0
        for post in self.postings_list[self.index[term].postings_index]:
            for i in self.data_index[post].lower().strip().split():
                if i in stop_germ:
                    g_score += 1
                if i in stop_en:
                    en_score += 1
#         print('DE:', g_score,'\t','EN:',en_score)
        if g_score>en_score:
            return 'german'
#             print('probability:', (g_score/(g_score+en_score)))
#             print("german")
        elif g_score<en_score:
            return 'english'
#             print('probability:', (en_score/(g_score+en_score)))
#             print('english')
        else:
            return 'unsure'
    
    def language(self,post):
        from nltk.corpus import stopwords
        import re
        contractions = ["it's", "he's","she's","that's", "what's", "there's",\
                        "[newline]", "'m", "'ve","n't", "'ll","'re", "won't", "'d", "'s"]
        fixes = ["it is", "he is","she is","that is", "what is", "there is",\
                 " ", " am", " have", " not", " will", " are", "will not", " would", ""]
        stop_en = set(stopwords.words('english'))
        stop_germ = set(stopwords.words('german'))
        de_char = ['ä','ö','ü','ß']
        g_score = 0
        en_score = 0
        
        for char in self.data_index[post].lower():
            if char in de_char:
                g_score +=1

        text = self.data_index[post].lower()
        for x in range(len(contractions)):
            text = text.replace(contractions[x], fixes[x])
        text = re.sub(r'[^\w\s]','', text)
        for i in text.strip().split():    
            if i in stop_germ or i in self.top_de:
                g_score += 1
            if i in stop_en or i in self.top_en:
                en_score += 1
        #print(post, 'DE:', g_score,'\t','EN:',en_score)
        if g_score>en_score:
            return 'german'
#             print('probability:', (g_score/(g_score+en_score)))
#             print("german")
        elif en_score>g_score:
            return 'english'
#             print('probability:', (en_score/(g_score+en_score)))
#             print('english')
        else:
            return None
    
    def get_misspells(self):
        #from nltk.stem import GermanWortschatzLemmatizer as gwl
        terms = sorted(self.index.keys())
        de = []
        en = []
        for x in terms[:200]:
            term = ''
            for i in range(len(x)):
                try:
                    if x[i] != [i+2]:
                        term += x[i]
                except:
                    term += x[i]
            g_count = 0
            e_count = 0
            e_posts = []
            g_posts = []
            tags = ['n','v','a','s','r']
            en_lemma = []
            for tag in tags:
                en_lemma.append(wnl.lemmatize(wnl,word=term, pos=tag))
            en_lemma = set(en_lemma)
            de_lemma = term
            for post in self.postings_list[self.index[term].postings_index]:
                if self.language(post) == 'german':
                    ## Apply Lematizer to word here
                    ## get german misspelling count
                    if de_lemma not in de_dic:
#                         for i in en_lemma:
#                             if i in en_dic:
#                                 continue    
                        g_count += 1
                        g_posts.append(post)
                elif self.language(post) == 'english':
                    ## Apply Lematizer to word here
                    ## get german misspelling count
                    if len([i for i in en_lemma if i in en_dic]) == 0:
#                     if de_lemma not in de_dic:
                        e_count += 1
                        e_posts.append(post)
                else:
                    lang = self.is_language(term)
                    if lang == None:
                        continue
                    if lang == 'german':
                        ## Apply Lematizer to word here
                        ## get german misspelling count    
                        if de_lemma not in de_dic:
#                             for i in en_lemma:
#                                 if i in en_dic:
#                                     continue
                            g_count += 1
                            g_posts.append(post)
                    else:
                        ## Apply Lematizer to word here
                        ## get german misspelling count
                        if len([i for i in en_lemma if i in en_dic]) == 0:
#                             if de_lemma not in de_dic:
                            e_count += 1
                            e_posts.append(post)
                            
            if g_count > 0:
                de.append((g_count, term, g_posts))
            if e_count > 0:
                en.append((e_count, term, e_posts))
        return sorted(de)[::-1], sorted(en)[::-1]
    
    def en_damerau(self, term):
        alphabet = "abcdefghijklmnopqrstuvwxyz"
        term = term.lower()
        splits     = [(term[:i], term[i:])    for i in range(len(term) + 1)]
        deletes    = [L + R[1:]               for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
        replaces   = [L + c + R[1:]           for L, R in splits if R for c in alphabet]
        inserts    = [L + c + R               for L, R in splits for c in alphabet]
        return set(deletes + transposes + replaces + inserts)
    
    def de_damerau(self, term):
        alphabet = "abcdefghijklmnopqrstuvwxyzäöüß"
        term = term.lower()
        splits     = [(term[:i], term[i:])    for i in range(len(term) + 1)]
        deletes    = [L + R[1:]               for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
        replaces   = [L + c + R[1:]           for L, R in splits if R for c in alphabet]
        inserts    = [L + c + R               for L, R in splits for c in alphabet]
        return set(deletes + transposes + replaces + inserts)    

    def en_suggested(self,term):
        suggestions = self.en_damerau(term)
        suggested = []
        for i in suggestions:
            tags = ['n','v','a','s','r']
            lemmas = []
            for tag in tags:
                lemmas.append(wnl.lemmatize(wnl,word=i, pos=tag))
            lemmas = set(lemmas)
            for j in lemmas:
                if j in en_dic:
                    suggested.append(i)
        best = []
        count = 0
        for k in set(suggested):
            counter = 0
            for h in k:
                if h in k:
                    counter +=1
            if counter >= count and counter < len(i):
                count = counter
                best.append(k)
        return best
#             if i in en_dic:
#                 suggested.append(i)
#         return suggested
    
    def de_suggested(self,term):
        suggestions = self.de_damerau(term)
        suggested = []
        for i in suggestions:
            if i in de_dic:
                suggested.append(i)
        return suggested
        
    def spell_check(self, lst):
        de_dic = []
        with open('german.dic', 'r', encoding='latin-1') as f:
            for row in f:
                if len(row) >1:
                    de_dic.append(row.strip().lower())
        en_dic = []
        with open('english.dic', 'r',) as f:
            for row in f:
                en_dic.append(row.strip().lower())
        for post in lst:
            if self.language(post) == 'german':
                print('German')
                misspells = [i for i in self.data_dic[post].split() if i not in de_dic] 
            else:
                print('English')
                misspells = [i for i in self.data_dic[post].split() if i not in en_dic]
            print("Number of misspellings:", len(misspells))
            print(misspells)
        return
    
    def query_one(self, term):
        """
        Queries for a term.
        :param term: query term
        :return: postings list corresponding to query term, or error message if no results.
        """
        try:
            return self.postings_list[index.index[term].postings_index]
        except:
            print('No results for query.')
#         try:
#             for posting in self.postings_list[index.index[term].postings_index]:
#                 print(posting, self.data_index[posting], '\n')
#         except:
#             print('No results for query.')


    def query_and(self, term1, term2):
        """
        Queries for the intersection of two terms.
        :param term1: first term
        :param term2: second term
        :return: returns intersection of postings lists of both terms.
        """
        
        # Here we compare the two lists and create iterators to help us compare the two postings lists
        def And(post1,post2):
            if len(post1) < len(post2):    
                iterpost1 = iter(post1)
                iterpost2 = iter(post2)
            else:
                iterpost1 = iter(post2)
                iterpost2 = iter(post1)
                
        # Here we initialize an empty intersection variable which will (hopefully) be filled.
            intersection = []
            
            current1 = next(iterpost1)
            current2 = next(iterpost2)
        # This is the loop that iterates over the members of each postings list, comparing them.
        # If there is a match it will be added to the intersection.
            while True:
                if current1 == current2:
                    intersection.append(current1)
                    try:
                        current1 = next(iterpost1)
                        current2 = next(iterpost2)
                    except:
                        break
                elif current1 < current2:
                    try:
                        current1 = next(iterpost1)
                    except:
                        break
                else:
                    try:
                        current2 = next(iterpost2)
                    except:
                        break
            # Here we print each text and id number found in intersection
#             if len(intersection) != 0:
#                 for i in intersection:
#                     print( i, self.data_index[i], '\n')
                    
#             else:
#                 print('No results for query.')
            # Here we access the postings list for each term, assign them to variables.
#         if type(term1) == list:
#             try:
#                 postings2 = self.postings_list[index.index[term2].postings_index]
#                 return And(term1, postings2)
#             except:
#                 return None
        try:
            postings1 = self.postings_list[index.index[term1].postings_index]
            postings2 = self.postings_list[index.index[term2].postings_index]
            return And(postings1, postings2)
        except:
            print('Error: 1 or more terms not found.')
#     def query_three(self, term1, term2, term3):
#         self.query_and(self.query_and(term1,term2),term3)
    

In [78]:
renou_index = index('tweets.csv')

In [79]:
import pickle
with open('renou_index5.pkl', 'wb') as f:
    pickle.dump(renou_index, f)

In [101]:
import pickle
with open('renou_index5.pkl', 'rb') as f:
    index = pickle.load(f)

In [102]:
a = 'frog'
new_a = ''
for i in range(len(a)):
    try:
        if a[i] != a[i+2]:
            new_a += a[i]
    except:
        new_a += a[i]
print(new_a)

frog


In [103]:
de_miss, en_miss = index.get_misspells()

In [71]:
#de_miss

In [46]:
index.language('1001528572438249472')

'german'

In [50]:
index.data_index['980858193700995072']

'Acun "Alp atamıyor" diyemez ki...Biliyor çünkü Alp onun sesini ayırt edeceğini. Kıyamaz ki #survivor2018 #hilmur https://t.co/FOLgUgSGR5'

In [104]:
en_miss

[(12,
  'adhd',
  ['1022985712432566272',
   '959830428550074368',
   '963160559159046145',
   '966066703577726976',
   '966380202002604032',
   '966449273633112070',
   '973254925269553152',
   '979830050273902592',
   '983421236444368896',
   '983836480123486209',
   '985643221874339840',
   '987415731779375104']),
 (10,
  'ahhh',
  ['1002329630118903808',
   '1005361121199607809',
   '1007170534033981440',
   '990008920566648832',
   '991373407487975424',
   '995451547873173504',
   '995758846219051010',
   '997920017340649474',
   '997926777036201984',
   '997964501587722240']),
 (6,
  'ahh',
  ['1005131932416598016',
   '1017539553207472128',
   '994303222549549057',
   '996394375998558208',
   '998397003053838342',
   '999776002355224577']),
 (4,
  'abschlussplakat',
  ['1003297214318465024',
   '1003373516488626176',
   '1008591002729435138',
   '1009414480076435456']),
 (3,
  'ahs',
  ['976916199249534977', '976916207814217730', '976934206478708736']),
 (3,
  'aafa',
  ['984473

In [233]:
index.language('1002324883651678208')

'german'

In [234]:
index.data_index['1002324883651678208']

'Actually kann ich jeden verstehen der mit League aufgehört habe..[NEWLINE]Wenn in der CS gesagt wird, dass deine Mutter hoffentlich einen Tumor hat dann weiß ich wirklich nicht was los ist.[NEWLINE]@RiotSupport @riotgames Wieso ist das Reportsystem einfach aus 2001?'

In [84]:
index.en_suggested('aah')

['dah',
 'wah',
 'aal',
 'auh',
 'bah',
 'mah',
 'aas',
 'aam',
 'ash',
 'pah',
 'sah',
 'ach',
 'rah',
 'aha',
 'hah',
 'yah']

In [42]:
'activin' in en_dic

True

In [29]:
from nltk.stem import WordNetLemmatizer as wnl

In [30]:
tags = ['n','v','r','a','s']
for tag in tags:
    print(wnl.lemmatize(wnl,word='accelerates', pos=tag))

accelerates
accelerate
accelerates
accelerates
accelerates


In [31]:
test = index.All_frequencies()[:200]

In [32]:
freq_de = []
freq_en = []
for i,j in test:
    if index.is_language(j) == 'german':
        freq_de.append(j)
    else:
        freq_en.append(j)
    

In [16]:
len(index.index)

10003

In [52]:
index.de_suggested('agenturleute')

[]