In [82]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import nltk.tokenize as token
from nltk.tokenize import WhitespaceTokenizer, WordPunctTokenizer, TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import time
import emoji
import string
from spellchecker import SpellChecker
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 
import numpy as np
from string import punctuation

# Module-level stop-word set consulted by remove_stopwords().
# Starts from NLTK's English stop-word list, materialised as a set.
stop_words = set(stopwords.words('english'))
# Drop "very" from the stop list so that it is NOT filtered out of reviews.
# (.remove raises KeyError if the word is absent from the list.)
stop_words.remove("very")
# Treat "th" as a stop word as well.
# NOTE(review): presumably the residue of ordinals such as "5th" once digits are stripped — confirm.
stop_words.add("th")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/development/nihars/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/development/nihars/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [84]:
# stop_words

In [99]:
def convert_to_lower(text):
    """Lower-case every item of an iterable of strings.

    Differently-cased spellings of the same word ('The' vs 'the') would
    otherwise be treated as distinct tokens, so everything is normalised to
    lower case.

    Parameters
    ----------
    text : iterable of str
        The reviews to normalise. Note that a bare string is itself an
        iterable of characters and yields a list of single characters.

    Returns
    -------
    list of str
        The lower-cased items, in the original order.
    """
    return [review.lower() for review in text]

def remove_punctuation(text):
    """Replace punctuation and digit runs in each review with a single space.

    Regex reference: https://www3.ntu.edu.sg/home/ehchua/programming/howto/Regexe.html

    Parameters
    ----------
    text : iterable of str
        The reviews to clean.

    Returns
    -------
    list of str
        One cleaned string per input item. Every match is replaced by a
        space (not deleted), so runs of spaces can appear in the result.
    """
    # Alternatives are tried left to right at each position:
    #   [^\w\s]   any character that is neither a word character nor whitespace
    #   ^\s\d+\s  a whitespace-delimited number at the very start of the string
    #   \s\d+     a digit run together with the whitespace character before it
    #   \d+       any remaining digit run
    #   \s\d+$    a trailing number (listed last in the alternation)
    noise_pattern = re.compile(r'[^\w\s]|^\s\d+\s|\s\d+|\d+|\s\d+$')
    return [noise_pattern.sub(' ', review) for review in text]

# def remove_punctuation(text):
#     text = text.translate(str.maketrans('', '', string.punctuation))
#     return text

def remove_stopwords(text):
    """Tokenize each review and drop the tokens found in ``stop_words``.

    Stop words are the most common words of a language ('is', 'the', 'that',
    ...). Removing them reduces noise, but it can also change the meaning of
    a sentence, e.g. when a negation is dropped.

    Parameters
    ----------
    text : iterable of str
        The reviews to filter.

    Returns
    -------
    list of list of str
        For every review, the list of surviving tokens (the reviews are NOT
        joined back into strings).

    Notes
    -----
    Filtering is a plain set-membership test against the module-level
    ``stop_words`` set, so it is exact and case-sensitive.
    """
    return [
        [word for word in token.word_tokenize(review) if word not in stop_words]
        for review in text
    ]

def remove_URLs(text):
    """Delete URLs from a string.

    A URL is anything that starts with ``http://``, ``https://`` or ``www.``
    and runs up to the next whitespace character. Surrounding whitespace is
    left in place.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        ``text`` with every URL removed.
    """
    url_pattern = re.compile(r"https?://\S+|www\.\S+")
    return url_pattern.sub("", text)

def remove_digits(text):
    """Strip every ASCII digit (0-9) from a string.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        ``text`` without the characters '0' through '9'; nothing is inserted
        in their place.
    """
    digit_pattern = re.compile(r'[0-9]')
    return digit_pattern.sub('', text)

def remove_spaces(text):
    """Collapse each run of space characters into a single space.

    Only the literal space character is affected; tabs and newlines are left
    untouched, and leading/trailing spaces are collapsed but not stripped.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        ``text`` with consecutive spaces squeezed to one.
    """
    space_run = re.compile(' +')
    return space_run.sub(' ', text)

def perform_tokenization(text):
    """Split a review into tokens with NLTK's ``word_tokenize``.

    Parameters
    ----------
    text : str
        The review to tokenize.

    Returns
    -------
    list of str
        The tokens produced by ``nltk.tokenize.word_tokenize``.
    """
    # WhitespaceTokenizer, WordPunctTokenizer and TreebankWordTokenizer are
    # imported at the top of the notebook as alternatives; word_tokenize is
    # the one actually used.
    return token.word_tokenize(text)

def perform_padding(data, maxlen=30):
    """Pad the reviews to a fixed length with ``pad_sequences``.

    Parameters
    ----------
    data : sequence of sequences
        The encoded reviews, forwarded unchanged to ``pad_sequences``.
    maxlen : int, optional
        Target length of every sequence. Defaults to 30, the value that was
        previously hard-coded.

    Returns
    -------
    Whatever ``pad_sequences`` returns for ``padding="post"`` (padding is
    appended after each sequence).
    """
    # NOTE(review): `pad_sequences` is not imported in the visible part of
    # this notebook; it presumably comes from Keras
    # (keras.preprocessing.sequence / keras.utils) — confirm that an import
    # exists before this function is called, otherwise this raises NameError.
    return pad_sequences(data, maxlen=maxlen, padding="post")

def correct_spellings(text):
    """Replace misspelled words in ``text`` with the spell checker's best guess.

    Textual data such as social-media posts is prone to spelling errors,
    which should be rectified early in the clean-up phase.

    Parameters
    ----------
    text : str
        Whitespace-separated input text.

    Returns
    -------
    str
        The words of ``text`` joined by single spaces, with every word the
        checker reports as unknown replaced by its suggested correction. A
        word for which the checker has no suggestion is kept unchanged.
    """
    spell = SpellChecker()
    words = text.split()  # split once; reused for lookup and for the rebuild
    misspelled_words = spell.unknown(words)
    corrected_words = []
    for word in words:
        if word in misspelled_words:
            suggestion = spell.correction(word)
            # correction() may return None when no candidate exists; joining
            # a None would raise TypeError, so fall back to the original word.
            corrected_words.append(word if suggestion is None else suggestion)
        else:
            corrected_words.append(word)
    return " ".join(corrected_words)

def convert_emoji(text):
    """Replace emoji characters in ``text`` with their textual names.

    Thin wrapper around ``emoji.demojize``.

    Parameters
    ----------
    text : str
        Input string, possibly containing emoji.

    Returns
    -------
    str
        The result of ``emoji.demojize(text)``.
    """
    return emoji.demojize(text)

def convert_to_antonym(sentence):
    """Rewrite "not <word>" as a WordNet antonym of <word>.

    The sentence is tokenized; every 'not' token is dropped and the token
    that follows it is replaced by the first antonym WordNet offers. When
    WordNet has no antonym, the token is prefixed instead (e.g. 'not_happy').

    Parameters
    ----------
    sentence : str
        Input sentence.

    Returns
    -------
    str
        The rewritten tokens joined by single spaces.
    """
    rewritten = []
    negation_pending = False  # True while a 'not' is waiting for its target
    for tok in nltk.word_tokenize(sentence):
        if tok == 'not':
            negation_pending = True
        elif negation_pending:
            # Collect antonyms across every synset/lemma, in WordNet order.
            candidates = [
                antonym.name()
                for synset in wordnet.synsets(tok)
                for lemma in synset.lemmas()
                for antonym in lemma.antonyms()
            ]
            tok = candidates[0] if candidates else 'not_' + tok
            negation_pending = False
        # 'not' itself is never emitted.
        if tok != 'not':
            rewritten.append(tok)
    return ' '.join(rewritten)

# Shared, module-level NLP helpers used by stem_words() and lemma_words().
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Maps the first letter of a POS tag to the matching WordNet POS constant.
# lemma_words() falls back to NOUN for any tag that is not listed here.
wordnet_map = {
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "J": wordnet.ADJ,
    "R": wordnet.ADV,
}

def stem_words(text):
    """Stem every whitespace-separated word of ``text``.

    Stemming removes or replaces suffixes to obtain a root form of the word.
    The module-level Porter stemmer used here is rule based (e.g. dogs -> dog).

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    stems = map(stemmer.stem, text.split())
    return " ".join(stems)

def lemma_words(text):
    """Lemmatize every whitespace-separated word of ``text`` using its POS tag.

    Each word is POS-tagged with ``nltk.pos_tag``; the first letter of the tag
    is looked up in ``wordnet_map`` to choose the WordNet part of speech, and
    tags that are not in the map are lemmatized as nouns.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        The lemmas joined by single spaces.
    """
    lemmas = []
    for word, tag in nltk.pos_tag(text.split()):
        wordnet_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(word, wordnet_pos))
    return " ".join(lemmas)


In [103]:
# Sample inputs for trying out the helper functions above one at a time.
# Exactly one assignment is left active; uncomment another line to switch
# to a different sample sentence.
# text = 'Python is 👍'
# text = "He was not happy with the score of team"
# text = " David wanted to go with Alfa but Alfa went with Charli so David is going with Bravo"
# text = "This is not the most important topic"
text = 'Shall I search the answer in www.google.com ?'
# text ="Being no 1 team is more important or being no 3 but with fair play "
# text = "This! sentence, contains so: many - punctuations."
# text = "Correcting   double  space  text "
# text = "Spelling correctin is proprly perfrmed"
# text = "Biden shouldn't interfere in India's foreign policy."

In [104]:
# Demo: strip the URL from the active sample sentence. The whitespace around
# the removed URL is kept, so two consecutive spaces remain in the result.
remove_URLs(text)

'Shall I search the answer in  ?'

In [107]:
# Dataset locations, relative to the notebook's working directory.
train_file = "./train.csv"
test_file = "./test.csv"
gold_file = "./gold_test.csv"

In [111]:
# Load the training reviews, then discard every column whose name starts with
# 'Unnamed' (presumably an index column written out by an earlier export).
train = pd.read_csv(train_file)
is_unnamed_column = train.columns.str.contains('^Unnamed')
train = train.loc[:, ~is_unnamed_column]
train.head()

Unnamed: 0,reviews,ratings
0,"This book was very informative, covering all a...",4
1,I am already a baseball fan and knew a bit abo...,5
2,I didn't like this product it smudged all unde...,1
3,I simply love the product. I appreciate print ...,5
4,It goes on very easily and makes my eyes look ...,5


In [110]:
def clean_df(temp_df):
    """Strip social-media and HTML noise from the ``reviews`` column.

    The substitutions are applied in this order:

    1. ``@name`` mentions (plus trailing whitespace) are removed.
    2. HTML numeric-entity noise such as ``&#1334`` (plus trailing
       whitespace) is removed.
    3. ``http://`` / ``https://`` URLs are removed.
    4. Every character that is not a word character, whitespace, ``.`` or
       ``?`` is removed.
    5. The literal text ``RT`` (retweet marker) is removed.
    6. Newlines are replaced by a space.

    Parameters
    ----------
    temp_df : pandas.DataFrame
        Frame with a string column named ``reviews``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; its ``reviews`` column is
        overwritten in place.
    """
    # (pattern, replacement, is_regex). The regex flag is passed explicitly
    # because the default of Series.str.replace changed from regex=True to
    # regex=False in pandas 2.0; without it the patterns below would be
    # matched as literal text and silently clean nothing.
    substitutions = [
        (r'(@\w+\s*)', '', True),                 # usernames, e.g. @name
        (r'(&#\w+\s*)', '', True),                # html noise, e.g. &#1334
        (r'https?://[A-Za-z0-9./]+', '', True),   # URLs
        (r'[^\.\?\w\s]', '', True),               # punctuation except '.' and '?'
        # NOTE(review): literal match, so 'RT' is also removed from inside
        # upper-case words (e.g. 'START'); kept as-is to preserve behavior.
        ('RT', '', False),                        # 'RT' (retweet)
        ('\n', ' ', False),                       # newlines
    ]

    reviews = temp_df['reviews']
    for pattern, replacement, is_regex in substitutions:
        reviews = reviews.str.replace(pattern, replacement, regex=is_regex)
    temp_df['reviews'] = reviews
#   temp_df = temp_df.drop_duplicates(subset = [ 'ratings', 'reviews'])   # remove duplicate rows

    return temp_df

In [112]:
# Inspect one raw row before cleaning. Per the recorded output, the review at
# position 49995 ends in a run of '(' characters, which makes it a handy
# before/after check for clean_df().
train.iloc[49995]

reviews    it does not work((((((((((((
ratings                               1
Name: 49995, dtype: object

In [113]:
# Clean the reviews. clean_df() overwrites the 'reviews' column of the frame
# it receives and returns that same frame, so `train` is modified here.
train = clean_df(train)
# Re-inspect the same row to confirm that the trailing punctuation is gone.
train.iloc[49995]

reviews    it does not work
ratings                   1
Name: 49995, dtype: object