In [2]:
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import nltk
from collections import Counter
from typing import Tuple
import gensim
from pathlib import Path
import pickle
import re
import import_ipynb
from src.stopwords import *
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.decomposition import TruncatedSVD

In [10]:
def _preprocessing_cache_paths(prefix: str, dataset_type: str) -> dict:
    """Return the four pickle paths that cache one preprocessed dataset split."""
    return {
        "text": Path("data/" + prefix + "_processed_" + dataset_type + "_text.pickle"),
        "label": Path("data/" + prefix + "_processed_" + dataset_type + "_label.pickle"),
        "vocabulary": Path("data/" + prefix + "_" + dataset_type + "_vocabulary.pickle"),
        "raw_tokens": Path("data/" + prefix + "_" + dataset_type + "_raw_tokens.pickle"),
    }


def do_preprocessing(text: list, labels: list, dataset: str, dataset_type: str) \
        -> Tuple[list, list, list, list]:
    """Tokenize, clean and lemmatize a corpus, caching the result as pickles.

    For ``dataset_type`` "train" or "test" the results are cached in four
    pickle files under ``data/``, named after ``dataset.lower()`` and the
    split. When all four files exist they are loaded and ``text``/``labels``
    are ignored; otherwise the corpus is processed and the files are
    (re)written. Any other ``dataset_type`` is processed without caching.

    Args:
        text: Documents to process, one string per document.
        labels: Labels indexed in parallel with ``text``.
        dataset: Dataset name. Its lower-cased form prefixes the cache files,
            and the exact value "MUSE" enables noun-only filtering.
        dataset_type: Split name; "train" and "test" enable caching.

    Returns:
        A tuple ``(processed_text, processed_label, vocabulary, raw_tokens)``:
        the cleaned token list of every kept document, the matching labels,
        the sorted unique cleaned tokens, and for every kept document the
        lower-cased tokens that are alphabetic and longer than two characters.

    Raises:
        AssertionError: If the number of documents and labels differ.
    """
    prefix = dataset.lower()

    # Only these two splits were ever cached; other values always recompute.
    cache_paths = None
    if dataset_type in ("train", "test"):
        cache_paths = _preprocessing_cache_paths(prefix, dataset_type)

    # Require every cache file: checking only the text pickle made a partially
    # written cache crash with FileNotFoundError instead of being rebuilt.
    if cache_paths is not None and all(p.is_file() for p in cache_paths.values()):
        # NOTE(security): pickle.load can execute arbitrary code; only use
        # cache files this project wrote itself.
        loaded = {}
        for key, path in cache_paths.items():
            with open(path, "rb") as cache_file:
                loaded[key] = pickle.load(cache_file)

        processed_text = loaded["text"]
        processed_label = loaded["label"]
        vocabulary = loaded["vocabulary"]
        raw_tokens = loaded["raw_tokens"]

    else:
        # NOTE(review): word_tokenize also relies on NLTK tokenizer data that
        # is not downloaded here - confirm it is installed in the environment.
        nltk.download('averaged_perceptron_tagger')
        nltk.download('wordnet')

        stop_words = get_stopwords()

        # Low-frequency filtering is switched off for every dataset; the
        # threshold is defined unconditionally so enabling it cannot hit an
        # undefined name.
        remove_low_freq = False
        count_threshold = 30
        do_just_nouns = dataset == "MUSE"

        # Built once instead of once per token / per document.
        lemmatizer = WordNetLemmatizer()
        email_pattern = re.compile(r'\S*@\S*\s?')
        whitespace_pattern = re.compile(r'\s+')
        # NN: noun, singular or mass; NNS: noun, plural;
        # NNP: proper noun, singular; NNPS: proper noun, plural.
        noun_tags = ('NN', 'NNP', 'NNS', 'NNPS', 'NOUN')

        vocabulary = []
        processed_text = []
        processed_label = []
        raw_tokens = []

        for i, doc in enumerate(text):

            tokens = word_tokenize(doc.lower())

            cleaned_tokens = [w for w in tokens if w not in stop_words]
            # Strip e-mail-like fragments.
            cleaned_tokens = [email_pattern.sub('', w) for w in cleaned_tokens]
            # Collapse whitespace runs (including new lines) to one space.
            cleaned_tokens = [whitespace_pattern.sub(' ', w) for w in cleaned_tokens]
            # Remove distracting single quotes.
            cleaned_tokens = [w.replace("'", "") for w in cleaned_tokens]

            # NOTE(review): the alphabetic / minimum-length filters apply to
            # `tokens` (stored as raw_tokens), not to `cleaned_tokens`, so the
            # cleaned output can still hold digits, punctuation and empty
            # strings. Kept as-is to stay compatible with existing caches -
            # confirm whether that is intended.
            tokens = [w for w in tokens if w.isalpha() and len(w) > 2]

            if do_just_nouns:
                cleaned_tokens = [w for (w, pos) in nltk.pos_tag(cleaned_tokens)
                                  if pos in noun_tags]

            cleaned_tokens = [lemmatizer.lemmatize(w) for w in cleaned_tokens]

            # Documents with nothing left are dropped together with their label.
            if len(cleaned_tokens) == 0:
                continue

            processed_text.append(cleaned_tokens)
            processed_label.append(labels[i])

            vocabulary.extend(cleaned_tokens)
            raw_tokens.append(tokens)

        print(len(processed_text))
        print(len(processed_label))

        if remove_low_freq:
            # Count every term once over the whole corpus.
            counter = Counter(w for d in processed_text for w in d)

            docs_threshold = []
            labels_threshold = []
            vocab_threshold = []
            raw_tokens_threshold = []

            for i, d in enumerate(processed_text):

                d_threshold = [w for w in d if counter[w] > count_threshold]
                if len(d_threshold) > 0:

                    labels_threshold.append(processed_label[i])
                    docs_threshold.append(d_threshold)
                    vocab_threshold.extend(d_threshold)
                    # Keep raw_tokens parallel to the surviving documents.
                    raw_tokens_threshold.append(raw_tokens[i])

            print("vocab with out threshold len: " + str(len(vocabulary)))
            print("vocab threshold len: " + str(len(vocab_threshold)))
            processed_text = docs_threshold
            vocabulary = vocab_threshold
            processed_label = labels_threshold
            raw_tokens = raw_tokens_threshold

        vocabulary = sorted(set(vocabulary))

        if cache_paths is not None:
            results = {
                "text": processed_text,
                "label": processed_label,
                "vocabulary": vocabulary,
                "raw_tokens": raw_tokens,
            }
            for key, path in cache_paths.items():
                # Create data/ if needed so finished work is not lost to a
                # missing directory.
                path.parent.mkdir(parents=True, exist_ok=True)
                with open(path, "wb") as cache_file:
                    pickle.dump(results[key], cache_file)

    print(len(processed_text))
    print(len(processed_label))
    assert len(processed_text) == len(processed_label)
    return processed_text, processed_label, vocabulary, raw_tokens


In [None]:
def make_bigrams(texts):
    """Merge frequently co-occurring word pairs in every document.

    A gensim ``Phrases`` model is fitted on ``texts`` and its frozen
    ``Phraser`` form is applied to each document.

    Args:
        texts: Iterable of tokenized documents (each a list of tokens).

    Returns:
        A list with one transformed token sequence per input document.
    """
    # Materialize once: the corpus is read for fitting and again for the
    # transform, which would exhaust a one-shot iterable.
    texts = list(texts)
    bigram = gensim.models.Phrases(texts, min_count=2, threshold=100)  # higher threshold fewer phrases.
    # The frozen model was previously built but never used.
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return [bigram_mod[doc] for doc in texts]

In [None]:
def make_trigrams(texts):
    """Merge frequent word pairs, then frequent pair+word triples, per document.

    The bigram model is fitted inside this function. The previous body read
    ``bigram`` and ``bigram_mod`` as globals, but the visible code only ever
    creates them as locals of ``make_bigrams``, so the call raised NameError.

    Args:
        texts: Iterable of tokenized documents (each a list of tokens).

    Returns:
        A list with one transformed token sequence per input document.
    """
    # Materialize once; the corpus is traversed several times below.
    texts = list(texts)

    # Same bigram settings as make_bigrams.
    bigram = gensim.models.Phrases(texts, min_count=2, threshold=100)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    bigram_docs = [bigram_mod[doc] for doc in texts]

    # The trigram model is fitted on the bigram-merged corpus.
    trigram = gensim.models.Phrases(bigram_docs, threshold=100)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    return [trigram_mod[doc] for doc in bigram_docs]

In [None]:
def get_tfidf(texts):
    """Fit a TF-IDF vectorizer on tokenized documents.

    Each document's tokens are joined with single spaces before being passed
    to a ``TfidfVectorizer`` configured with English stop words and at most
    100 features.

    Args:
        texts: Iterable of tokenized documents (each an iterable of strings).

    Returns:
        A tuple ``(vect, vect_text)``: the fitted vectorizer and the matrix
        returned by ``fit_transform`` for the joined documents.
    """
    train_sentences = [' '.join(text) for text in texts]
    vect = TfidfVectorizer(stop_words="english", max_features=100)
    vect_text = vect.fit_transform(train_sentences)

    # The former idf/feature-name ranking was computed and then discarded,
    # and its get_feature_names() call fails on scikit-learn releases that
    # removed that method, so it has been dropped.
    return vect, vect_text