In [1]:
import pandas as pd
import re, csv, collections, itertools
import numpy as np
import stanfordnlp
from polyglot.text import Text, Word
import torch
from sklearn.metrics import f1_score

config = {
    'use_gpu' : True,
    'processors': 'tokenize,pos,lemma',  # Comma-separated list of processors to use
    'lang': 'en',  # Language code for the language to build the Pipeline in
    'tokenize_model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt',
    # Processor-specific arguments are set with keys "{processor_name}_{argument_name}"
    'pos_model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt',
    'pos_pretrain_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt',
    'lemma_model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt'
}

In [23]:
import stanfordnlp
stanfordnlp.download('en')

Using the default treebank "en_ewt" for language "en".
Would you like to download the models for: en_ewt now? (Y/n)


 n


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'0.0M'

In [4]:
torch.cuda.empty_cache()
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'0.0M'

In [6]:
def read_data(inputf, wiegand_dict):
    t_l = []
    p_l = []
    offense = pd.read_csv(inputf, delimiter="\t", header=0, usecols=['id', 'tweet', 'subtask_a'])
    clean_na = offense.dropna()

    msg_tokenized_stemmed = tokenize_and_stem_msg(clean_na)
    wiegand_en = stem_wiegand(wiegand_dict)

 
    pred_v5, pred_tok = prediction_v5(msg_tokenized_stemmed,wiegand_en)
    
    
    with open("/root/standfordnlp/offenseval-training-v2.tsv") as f:
        for line in f:
            values = line.split("\t")
            if not str(values[2]).startswith('subtask_a'):
                  True_Label.append(values[2])
    
    
    t_l = np.array([[x] for x in True_Label])
    values_5 = list(pred_v5.values())
    #Predicted_Label = value_5
    p_l = np.array([[x] for x in values_5]) 
    print(p_l.shape)
    f = f1_score(t_l, p_l, average='macro')
    print("F1 Score: ")
    print(f)
    #values_tok_6 = list(pred_tok.values())
    #values_tok_l = list(pred_l.values())
    pred_df_5 = pd.DataFrame({'pred_labels': values_5})
    pred_df = pd.DataFrame(pred_tok)
    final_format_v5 = offense.join(pred_df_5, how='outer')

    outfile5 = "hurt_key_extra_en_8.tsv"
    output = "tokenized_en_8.tsv"
    pred_df.to_csv(output, sep = "\t", index = False)
    final_format_v5.to_csv(outfile5, sep="\t", index=False)

In [7]:
def tokenize_and_stem_msg(offense_df):

    msg = offense_df['tweet']
    pre_proc_data = collections.defaultdict(list)
    nlp = stanfordnlp.Pipeline(**config)
    for idx, elem in enumerate(msg):
        doc = nlp(elem)  # Run the pipeline on input text
        token_text = [word.text for sent in doc.sentences for word in sent.words]
        token_lemma = [word.lemma for sent in doc.sentences for word in sent.words]
        token_pos = [word.upos for sent in doc.sentences for word in sent.words]
        final = list(zip(token_lemma, token_pos, token_text))
        

        pre_proc_data[idx] = final

        
   
    print("Done lemmatization...")
    return pre_proc_data

In [8]:
def stem_wiegand(keyf):

    stemmed_swear_en = []

    with open(keyf) as f:
        for line in f:
            line_splitted = line.strip().split("\t")

            if float(line_splitted[1]) >= 0.75:
                target_tok = line_splitted[0].split("_")[0]
                if target_tok not in stemmed_swear_en:
                    stemmed_swear_en.append(target_tok)

    return stemmed_swear_en

In [9]:
def prediction_v5(tweet_tok_pos_stem, wiegand_en):
    
    
    seen = set()
    predicted_dict = {}
    predicted_tok = []
    #predicted_tok_l = {}
    for k, v in tweet_tok_pos_stem.items():
        for elem in v:
            #print("this is elem   ")
            #print(elem)
            tok, pos, tok_word = elem
            #print("this is tok, pos , tok_l" +tok + " "+ pos +" "+ tok_word)
            if tok in wiegand_en:
                predicted_dict[k] = "OFF"
                predicted_tok.append(str(tok_word)+ "\t"+ "OFF")
                #predicted_tok[k] = tok_word
                #print("this is offense " + tok+ " " + tok_word)
            #else:
             #   predicted_tok[k] = tok_word
              #  predicted_tok_l[k] = "NOT"
                #print(tok)
               # break
                
    for k, v in tweet_tok_pos_stem.items():
            if k not in predicted_dict:
                predicted_dict[k] = "NOT"
                #predicted_tok_l[k] = "NOT"
                #predicted_tok[k] = 
            #if k not in predicted_tok_l:
             #   predicted_tok_l[k] = "NOT"
                
    for k, v in tweet_tok_pos_stem.items():
            for elem in v:
                tok, pos, tok_word = elem
                if tok_word not in predicted_tok:
                    if tok_word not in seen:
                        seen.add(tok_word)
                        predicted_tok.append(str(tok_word)+ "\t"+ "NOT")
                
    
    
    return predicted_dict, predicted_tok

In [10]:

if __name__ == '__main__':

    """
    activate miniconda: source./miniconda/bin/activate ; conda init
    activate venv : conda activate offenseval_stanford
    run stanford nlp for EN
    """
    True_Label = []
    Predicted_Label = []
    training = "/root/standfordnlp/offenseval-training-v2.tsv"   #available_sudha
    wiegand_dict = "/root/standfordnlp/wiegand_expanded"
    read_data(training, wiegand_dict)
    
    

Use device: gpu
---
Loading: tokenize
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
Done loading processors!
---
Done lemmatization...
(13240, 1)
F1 Score: 
0.49179346905290283
