In [1]:
import pandas as pd
import emoji
import unidecode
import contractions
import tqdm
import re
import nltk

from autocorrect import Speller 
from emot.emo_unicode import UNICODE_EMOJI,EMOTICONS_EMO
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer 

from sklearn.model_selection import train_test_split

import pandas as pd

from flair.data import Corpus
from flair.datasets import CSVClassificationCorpus
from flair.datasets import TREC_6
from flair.embeddings import TransformerDocumentEmbeddings, DocumentPoolEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from flair.data import Sentence
from flair.models import TARSClassifier
from flair.embeddings import FlairEmbeddings, PooledFlairEmbeddings
from flair.data import Dictionary
from flair.embeddings import FlairEmbeddings
from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorpus
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay, f1_score, precision_score, recall_score

import matplotlib.pyplot as plt


In [5]:
# Converting emojis to words
def convert_emojis(text, emoji_map=None):
    """Replace every known emoji in ``text`` with an underscore-joined name.

    arguments:
        text: "text" of type "String".
        emoji_map: optional mapping of emoji -> textual description.
                   Defaults to ``UNICODE_EMOJI`` when omitted.

    return:
        value: "text" in which each emoji found in the mapping is replaced
               by its description with commas and colons stripped and
               whitespace turned into underscores.
    """
    if emoji_map is None:
        emoji_map = UNICODE_EMOJI
    # Longest keys first, so a multi-code-point emoji is replaced as a whole
    # before any shorter emoji that happens to be its prefix.
    for emot in sorted(emoji_map, key=len, reverse=True):
        name = "_".join(emoji_map[emot].replace(",", "").replace(":", "").split())
        text = text.replace(emot, name)
    # The return must sit outside the loop: returning from inside it stopped
    # after the very first dictionary entry.
    return text
# Converting emoticons to words
def convert_emoticons(text, emoticon_map=None):
    """Replace every known emoticon in ``text`` with an underscore-joined name.

    arguments:
        text: "text" of type "String".
        emoticon_map: optional mapping of emoticon -> textual description.
                      Defaults to ``EMOTICONS_EMO`` when omitted.
                      NOTE(review): keys are treated as literal text, not as
                      regular expressions - confirm this matches the
                      installed ``emot`` version.

    return:
        value: "text" in which each emoticon found in the mapping is replaced
               by its description with commas stripped and whitespace turned
               into underscores.
    """
    if emoticon_map is None:
        emoticon_map = EMOTICONS_EMO
    if not emoticon_map:
        # An empty alternation would match the empty string everywhere.
        return text

    # One alternation, longest emoticon first, applied in a single pass:
    #  * re.escape keeps characters such as ')' or '|' literal,
    #  * a single pass means already-substituted text is never rescanned.
    ordered = sorted(emoticon_map, key=len, reverse=True)
    pattern = re.compile("|".join(re.escape(emot) for emot in ordered))

    def _describe(match):
        return "_".join(emoticon_map[match.group(0)].replace(",", "").split())

    return pattern.sub(_describe, text)

#for now replaced emojis and emoticons instead of removing them
def remove_emoticons(text, emoticons=None):
    """Delete every known emoticon from ``text``.

    arguments:
        text: "text" of type "String".
        emoticons: optional iterable (or mapping) of emoticon strings to
                   strip. Defaults to the keys of ``EMOTICONS_EMO``; the
                   previously referenced ``EMOTICONS`` name is not imported
                   in this notebook and raised NameError.

    return:
        value: "text" with all matching emoticons removed (surrounding
               whitespace is left untouched).
    """
    if emoticons is None:
        emoticons = EMOTICONS_EMO
    # Longest first so ':-))' is removed as a whole rather than as ':-)' + ')'.
    ordered = sorted(emoticons, key=len, reverse=True)
    if not ordered:
        return text
    # re.escape: emoticons are full of regex metacharacters such as ( ) | [ ].
    emoticon_pattern = re.compile(u'(' + u'|'.join(re.escape(k) for k in ordered) + u')')
    return emoticon_pattern.sub(r'', text)

def load_dict_smileys():
    """Build the emoticon -> sentiment-word lookup table.

    Several source emoticons are written with U+2011 (NON-BREAKING HYPHEN)
    instead of the ASCII '-', so plain-keyboard forms such as ':-)' or ':-('
    were missing. Every hyphenated emoticon is therefore registered in both
    spellings; all original keys are still present with unchanged values.

    return:
        value: dict mapping each emoticon string to one of
               "smiley", "sad", "playful" or "love".
    """
    groups = (
        ("smiley", (
            ":\u2011)", ":-]", ":-3", ":->", "8-)", ":-}",
            ":)", ":]", ":3", ":>", "8)", ":}",
            ":o)", ":c)", ":^)", "=]", "=)", ":-))",
            ":\u2011D", "8\u2011D", "x\u2011D", "X\u2011D",
            ":D", "8D", "xD", "XD",
        )),
        ("sad", (
            ":\u2011(", ":\u2011c", ":\u2011<", ":\u2011[",
            ":(", ":c", ":<", ":[",
            ":-||", ">:[", ":{", ":@", ">:(",
            ":'\u2011(", ":'(",
        )),
        ("playful", (
            ":\u2011P", "X\u2011P", "x\u2011p", ":\u2011p",
            ":\u2011Þ", ":\u2011þ", ":\u2011b",
            ":P", "XP", "xp", ":p", ":Þ", ":þ", ":b",
        )),
        ("love", ("<3",)),
    )

    smileys = {}
    for label, emoticons in groups:
        for emoticon in emoticons:
            smileys[emoticon] = label

    # Register the alternative hyphen spelling of every key; setdefault
    # guarantees an explicitly listed emoticon is never overwritten.
    for emoticon, label in list(smileys.items()):
        smileys.setdefault(emoticon.replace("\u2011", "-"), label)
        smileys.setdefault(emoticon.replace("-", "\u2011"), label)
    return smileys

def accented_characters_removal(text):
    """
    Transliterate accented characters in the text to their closest
    ASCII representation.

    arguments:
        text: "text" of type "String".

    return:
        value: whatever ``unidecode.unidecode`` produces for "text",
               i.e. the text with accented characters replaced.

    Example:
    Input : Málaga, àéêöhello
    Output : Malaga, aeeohello

    """
    # unidecode takes unicode data and tries to represent it in ASCII.
    return unidecode.unidecode(text)

# The code for spelling corrections
# Spellers are cached per language: constructing one on every call repeats
# the same set-up work for every single tweet.
_SPELLER_CACHE = {}


def spelling_correction(text):
    '''
    This function will correct spellings.

    arguments:
         text: "text" of type "String".

    return:
        value: Text as returned by the English ``autocorrect.Speller``.

    Example (illustrative only - the actual corrections depend on the
    dictionary shipped with the installed ``autocorrect`` version):
    Input : who came heree to studdy.
    Output : who came here to study.

    '''
    # Check for spellings in English language; build the speller only once.
    spell = _SPELLER_CACHE.get('en')
    if spell is None:
        spell = Speller(lang='en')
        _SPELLER_CACHE['en'] = spell
    return spell(text)

def clean_ascii(text):
    """Return ``text`` without any character whose code point is 128 or above."""
    ascii_chars = [char for char in text if ord(char) < 128]
    return ''.join(ascii_chars)

# function to remove numbers
def remove_numbers(text):
    """Strip digits and every other character that is not explicitly kept.

    Kept characters: ASCII letters, whitespace and the punctuation
    ``. , ! ? / : ; " '``. Everything else (digits included) is deleted.

    arguments:
        text: "text" of type "String".

    return:
        value: "text" containing only the kept characters.
    """
    # define the pattern to keep
    # 'A-Z' (not 'A-z'): the old range also spanned the ASCII symbols that
    # lie between 'Z' and 'a' ([ \ ] ^ _ `), so they were silently preserved.
    pattern = r'[^a-zA-Z.,!?/:;\"\'\s]'
    return re.sub(pattern, '', text)

def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

# Shared NLTK helpers, created once and reused by every lemmatization() call.
lemmatizer = WordNetLemmatizer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
def lemmatization(text):
    """Reduce each whitespace-separated word of the text to its verb lemma.

    Unlike stemming, words are mapped to dictionary root forms rather than
    being cut down.

    arguments:
         text: "text" of type "String".

    return:
        value: The lemmatized words joined back together with single spaces.

    Example:
    Input : text reduced
    Output :  text reduce

    """
    # Every token is lemmatized as a verb ('v') for now.
    tokens = w_tokenizer.tokenize(text)
    return " ".join(lemmatizer.lemmatize(token, 'v') for token in tokens)

# Fetch the NLTK resources 'wordnet' and 'omw-1.4' (presumably the data the
# WordNetLemmatizer created in this cell relies on - verify).
nltk.download('wordnet')
# Being the last expression of the cell, this call's return value is echoed
# as the cell output.
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     /export/home/aneezahm001/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /export/home/aneezahm001/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [36]:
def preprocess_tweet(tweet):
    """Normalise one raw tweet into a cleaned, lemmatized string of tokens.

    Steps, in order:
      1. lower-case the tweet,
      2. drop hashtags, ``http...`` links, `` word.com`` fragments and
         ``@handles``,
      3. replace stand-alone emoticons by their label from
         ``load_dict_smileys()`` (case-insensitively),
      4. turn emojis into text with ``emoji.demojize``,
      5. transliterate accented characters and expand contractions,
      6. keep only ``\\w+`` runs (drops punctuation and other symbols),
      7. lemmatize every token as a verb.

    arguments:
        tweet: the raw tweet; non-string values are converted with ``str``.

    return:
        value: the processed tweet as a single space-separated string.
    """
    if not isinstance(tweet, str):
        tweet = str(tweet)
    text = tweet.lower()

    # Code to remove the Hashtags from the text
    text = re.sub(r'\B#\S+', '', text)

    # Code to remove the links from the text
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"\ [A-Za-z]*\.com", " ", text)
    # Code to substitute the multiple spaces with single spaces
    text = re.sub(r'\s+', ' ', text)
    # Remove the twitter handlers (raw string: '\s' is not a valid str escape)
    text = re.sub(r'@[^\s]+', '', text)

    # Emoticons -> words. The text is already lower-cased at this point, so
    # the lookup table is lower-cased too; otherwise keys such as ':D', 'XD'
    # or ':P' could never match.
    # (convert_emoticons() is the alternative, currently unused, approach.)
    smileys = {emoticon.lower(): label
               for emoticon, label in load_dict_smileys().items()}
    text = " ".join(smileys.get(word, word) for word in text.split())

    # Emojis -> words (convert_emojis() is the alternative; can check which
    # version is better).
    text = emoji.demojize(text)
    text = accented_characters_removal(text)
    text = expand_contractions(text)

    # Code to remove the Special characters from the text
    text = ' '.join(re.findall(r'\w+', text))

    # remove_numbers(), spelling_correction() and clean_ascii() are available
    # but intentionally not applied here.
    text = lemmatization(text)

    return text

In [40]:
# Single place to change when the data lives elsewhere; both splits are read
# from this directory.
DATA_DIR = "/export/home/aneezahm001/IR/data/hand-labelled-old-new-2/inference"

# index_col=0: the first CSV column is used as the DataFrame index.
train = pd.read_csv(f"{DATA_DIR}/train_data_inference.csv", index_col=0)
test = pd.read_csv(f"{DATA_DIR}/test_data_inference.csv", index_col=0)

In [43]:
# Display the column labels of the test split.
test.columns

Index(['processed_content', 'sentiment', 'username', 'content', 'date',
       'country', 'replyCount', 'retweetCount', 'likeCount', 'url',
       'textblob_class', 'vader_class', 'inference'],
      dtype='object')

In [44]:
# Display the column labels of the train split.
train.columns

Index(['processed_content', 'sentiment', 'username', 'content', 'date',
       'country', 'replyCount', 'retweetCount', 'likeCount', 'url',
       'textblob_class', 'vader_class', 'inference'],
      dtype='object')

In [6]:
#Preprocess tweets and save in new column
def _preprocess_contents(contents, report_every=500):
    """Run ``preprocess_tweet`` over every entry of ``contents``.

    arguments:
        contents: iterable of raw tweets (e.g. a DataFrame column).
        report_every: print the running count each time this many items
                      have been processed.

    return:
        value: list with one processed string per input item, in order.
    """
    cleaned = []
    for count, item in enumerate(contents, start=1):
        # Non-string cells (the original code handled floats) are stringified
        # so that preprocess_tweet can call str methods on them.
        # NOTE(review): a float NaN therefore becomes the token "nan" -
        # confirm that this is the desired treatment of missing tweets.
        if not isinstance(item, str):
            item = str(item)
        cleaned.append(preprocess_tweet(item))
        if count % report_every == 0:
            print(count)
    return cleaned


train['processed_content'] = _preprocess_contents(train['content'])
test['processed_content'] = _preprocess_contents(test['content'])

500
1000
1500
2000
2500
3000
3500
