Importing required packages

In [1]:
# Install NLTK into the notebook environment (shell command run by the notebook).
!pip install nltk
import nltk
import string
import re
# Download the NLTK data packages requested by this notebook: the 'punkt'
# tokenizer data and the 'wordnet' corpus.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [2]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

Initialising the paragraph

In [3]:
# Sample input text: it contains an elongated word ("helloooooo"), contractions
# ("i'm", "Today's") and an embedded newline between its two sentences.
paragraph = "helloooooo, i'm currently not studying in Surat.\nToday's my bday."

In [4]:
# Ordered (regex, replacement) pairs used to expand English contractions.
# Order matters: the specific forms ("won't", "can't", "i'm") must run before
# the generic "(\w+)n't" / "(\w+)'s" rules, otherwise e.g. "can't" would be
# rewritten to "ca not".
# Replacement templates are raw strings so that "\g<1>" reaches the regex
# engine verbatim instead of relying on Python leaving the invalid "\g"
# escape untouched (which triggers a warning on recent Python versions).
R_patterns = [
   (r"won't", 'will not'),
   (r"can't", 'cannot'),
   (r"i'm", 'i am'),
   (r"(\w+)'ll", r'\g<1> will'),
   (r"(\w+)n't", r'\g<1> not'),
   (r"(\w+)'ve", r'\g<1> have'),
   (r"(\w+)'s", r'\g<1> is'),
   (r"(\w+)'re", r'\g<1> are'),
]
class REReplacer(object):
  """Apply a sequence of regex substitutions to a piece of text.

  Args:
    patterns: iterable of (regex, replacement) pairs; defaults to R_patterns.
      Each regex is compiled once at construction time.
  """
  def __init__(self, patterns=R_patterns):
    self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]
  def replace(self, text):
    """Return `text` after applying every substitution, in list order."""
    s = text
    for (pattern, repl) in self.patterns:
      # The patterns are already compiled, so call their own sub() directly.
      s = pattern.sub(repl, s)
    return s
# Build a replacer with the default pattern list and expand the contractions
# in the sample paragraph.
rep_word = REReplacer()
rep_paragraph = rep_word.replace(paragraph)
print(rep_paragraph)

helloooooo, i am currently not studying in Surat.
Today is my bday.


Tokenizing the paragraph

In [5]:
# Split the contraction-expanded paragraph into tokens with NLTK's
# word_tokenize; punctuation marks come back as separate tokens.
words = word_tokenize(rep_paragraph)
print(words)

['helloooooo', ',', 'i', 'am', 'currently', 'not', 'studying', 'in', 'Surat', '.', 'Today', 'is', 'my', 'bday', '.']


Removing punctuation

In [6]:
# Drop every token that consists solely of punctuation characters.
# A plain `w in string.punctuation` is a substring test against the
# punctuation string, so multi-character tokens such as '...', '--' or the
# quote tokens `` and '' would slip through; checking character by character
# removes them as well.
only_words = [w for w in words if not all(ch in string.punctuation for ch in w)]
print(only_words)

['helloooooo', 'i', 'am', 'currently', 'not', 'studying', 'in', 'Surat', 'Today', 'is', 'my', 'bday']


Removing repeated letters in words

In [7]:
from nltk.corpus import wordnet

class Rep_word_removal(object):
  """Shorten words with repeated letters until WordNet recognises them.

  One doubled character is removed per pass. The process stops as soon as
  the current form has at least one WordNet synset, or when no doubled
  character is left to remove.
  """
  def __init__(self):
    # Prefix, a character immediately followed by itself, and the suffix.
    self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
    # Rebuild the word keeping only one copy of the doubled character.
    self.repl = r'\1\2\3'
  def replace(self, word):
    """Return `word` with surplus repeated letters stripped."""
    current = word
    while True:
      # A form known to WordNet is returned untouched.
      if wordnet.synsets(current):
        return current
      reduced = self.repeat_regexp.sub(self.repl, current)
      # Nothing changed: no doubled character remains.
      if reduced == current:
        return reduced
      current = reduced
# Normalise every punctuation-free token through the repeated-letter remover.
rep_word = Rep_word_removal()
rep_words = [rep_word.replace(token) for token in only_words]
print(rep_words)

['hello', 'i', 'am', 'currently', 'not', 'studying', 'in', 'Surat', 'Today', 'is', 'my', 'bday']


Importing CSVs

In [8]:
import csv
import pandas as pd
import requests

In [18]:
from google.colab import files
from google.colab import drive
# Mount Google Drive so the lookup tables stored there can be read.
drive.mount('/content/drive')
# Both files are read with header=None, so their first row is treated as data.
filesyn = pd.read_csv('/content/drive/My Drive/synonyms.csv', header=None)
fileant = pd.read_csv('/content/drive/My Drive/antonyms.csv', header=None)
# Print each table under its heading.
for heading, table in (('\nSynonyms', filesyn), ('\nAntonyms', fileant)):
  print(heading)
  print(table)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
           0          1
0       bday   birthday
1      hello         hi
2  currently  presently

          0         1
0  studying  sleeping


Replacing Synonyms

In [13]:
class csv_synonym_replacer(object):
  """Look up replacement words in a two-column table.

  Column 0 of the table holds the word to match and column 1 the word that
  replaces it.
  """
  def __init__(self, fname):
    """Build the word -> synonym mapping.

    Args:
      fname: despite its name, a pandas DataFrame (not a file name); only
        its first two columns are read.
    """
    self.word_map = {
        row[0]: row[1] for row in fname.itertuples(index=False, name=None)
    }

  def replace(self, word):
    """Return the mapped synonym, or `word` itself when it has no entry."""
    return self.word_map.get(word, word)
# Swap each normalised token for its synonym when the table has one.
syn_replacer = csv_synonym_replacer(filesyn)
syn_words = [syn_replacer.replace(token) for token in rep_words]
print(syn_words)

['hi', 'i', 'am', 'presently', 'not', 'studying', 'in', 'Surat', 'Today', 'is', 'my', 'birthday']


Replacing Antonyms

In [19]:
class csv_antonym_replacer(object):
  """Replace negated words with antonyms taken from a two-column table.

  Column 0 of the table holds a word and column 1 the antonym substituted
  for "not <word>".
  """

  def __init__(self, fname):
    """Build the word -> antonym mapping.

    Args:
      fname: despite its name, a pandas DataFrame (not a file name); only
        its first two columns are read.
    """
    self.word_map = {
        row[0]: row[1] for row in fname.itertuples(index=False, name=None)
    }

  def replace(self, word):
    """Return the mapped antonym, or `word` itself when it has no entry."""
    return self.word_map.get(word, word)

  def replace_negations(self, sent):
    """Collapse each "not <word>" pair into the antonym of <word>.

    Args:
      sent: sequence of word tokens (a list of strings, not a raw string).

    Returns:
      A new list of tokens. A "not <word>" pair is replaced by the antonym
      only when <word> has an entry in the table; otherwise both tokens are
      kept so the negation is not lost.
    """
    i, l = 0, len(sent)
    words = []

    while i < l:
      word = sent[i]

      if word == 'not' and i + 1 < l:
        # Query the map directly: replace() falls back to the input word,
        # which would make every lookup look successful and silently drop
        # 'not' in front of words that have no antonym.
        ant = self.word_map.get(sent[i + 1])

        if ant is not None:
          words.append(ant)
          i += 2
          continue

      words.append(word)
      i += 1

    return words

ant_replacer = csv_antonym_replacer(fileant)
# replace_negations works on the whole token list because it needs to see
# the token that follows each 'not'. The former per-word loop passed single
# strings and its result was immediately overwritten, so it has been removed.
ant_words = ant_replacer.replace_negations(rep_words)
print(ant_words)

['hello', 'i', 'am', 'currently', 'sleeping', 'in', 'Surat', 'Today', 'is', 'my', 'bday']
