In [1]:
# !pip install emot
# !pip install symspellpy
# !pip install setuptools wheel
# !pip install spacy
# !pip install pyspellchecker
# !pip install spellchecker
# !pip install inflect
# !pip install langdetect
# !pip install google_trans_new
# !pip install deep-translator
# !pip install ekphrasis
# !pip install cdifflib
# !pip install gensim
# !pip install wordninja

In [2]:
import nltk
from nltk.corpus import PlaintextCorpusReader
import re, string, json

from symspellpy.symspellpy import SymSpell, Verbosity
import pkg_resources
import spacy
from tqdm.notebook import tqdm
from spellchecker import SpellChecker
import inflect

import difflib
import gensim

import pandas as pd

## Text File & Corpus Creation

In [3]:
# create txt file for each text review

# def CreateCorpusFromDataFrame(corpusfolder, df):
#     for index, r in df.iterrows():
#     # print (r['text'])
#     # print (str(index) + '.txt')
#         text = r['translated']
#         fname = str(index)+'.txt'
#         corpusfile = open(corpusfolder+'/'+fname, "w", encoding="utf-8")
#         corpusfile.write(str(text))
#         corpusfile.close()

# CreateCorpusFromDataFrame('../text-corpus-india-v1', df)

# reference: https://stackoverflow.com/questions/49088978/how-to-create-corpus-from-pandas-data-frame-to-operate-with-nltk/49104725

In [4]:
# Folder holding the corpus: one .txt file per text review, relative to this notebook.
file_directory = '../../text-corpus-india-v1'
# Regex selecting the corpus files. Written as a raw string so that `\.` reaches
# the regex engine as an escaped dot without Python flagging an invalid escape
# sequence; the string value is unchanged.
filename_pattern = r'.+\.txt'

my_corpus = PlaintextCorpusReader(file_directory, filename_pattern)

# Sanity check: print the tokens of one specific file and how many there are
test = my_corpus.words('0.txt')
print(test)
print(len(test))

['soniya', 'mummy', 'hoti', 'toh', 'covid', 'me', ...]
26


In [5]:
# Total number of text files in the corpus, i.e. the total number of reviews
print(len(my_corpus.fileids()))

95188


In [6]:
# Frequency distribution of the tokens in `test` (the words of '0.txt');
# print the ten most common tokens with their counts.
fdist = nltk.FreqDist(test)
print(fdist.most_common(10))

[('soniya', 1), ('mummy', 1), ('hoti', 1), ('toh', 1), ('covid', 1), ('me', 1), ('magic', 1), ('stick', 1), ('ghuma', 1), ('ke', 1)]


## Tokenization

In [7]:
# my_corpus.fileids() does not return the texts in consecutive order, so the
# file names of this batch (38000.txt .. 40999.txt) are generated explicitly.
sorted_list = list(map('{}.txt'.format, range(38000, 41000)))

In [8]:
# Tokenise every file of the batch, keeping the order given by sorted_list
tokenised_words = list(map(my_corpus.words, sorted_list))
tokenised_words

[['finally', 'covid', '19', 'vaccine'],
 ['assam', 't', 'covid', 'r', 'vaccine', 'ahi', ...],
 ['first', 'batch', '#', 'covidvaccine', 'ne', ...],
 ['state', 'task', 'force', 'stf', 'held', 'it', 'is', ...],
 ['strengthening', 'efforts', 'health', 'counselors', ...],
 ['covid', 'vaccine', 'arrive', 'at', '20', 'cartons', ...],
 ['#', 'new_video_in_hindi', 'The', 'government', ...],
 ['Corona', 'vaccine', '#', 'covidvaccine', '#', ...],
 ['this', 'is', 'what', 'entire', 'country', 'India', ...],
 ['lowest', 'case', '#', 'vaccine', 'is', 'ready', '#', ...],
 ['I', 'want', 'to', 'avoid', 'children', 'getting', ...],
 ['if', 'bjp', 'will', 'come', 'power', ',', 'every', ...],
 ['#', '4newstrains', '...', 'is', 'new', 'challenge', ...],
 ['Something', 'happy', 'every', 'time', 'I', 'visit', ...],
 ['beware', 'indian', 'cargo', 'planes', 'with', ...],
 ['live', 'update', 'out', '6', '#', 'covidvaccine', ...],
 ['yes', ',', 'students', 'can', 'go', 'attend', ...],
 ['great', 'effort', 'eradic

In [9]:
# Number of tokenised texts in the batch
print(len(tokenised_words))

3000


In [10]:
# Tokens of the last text in the batch
tokenised_words[-1]

['party_popper', 'will', 'get', 'it', 'now', '.', ...]

## Converting British to American English

In [11]:
#https://stackoverflow.com/questions/42329766/python-nlp-british-english-vs-american-english

import requests

# Location of the British -> American spelling table (a JSON object whose
# items are iterated as british_spelling -> american_spelling).
_BRITISH_SPELLINGS_URL = "https://raw.githubusercontent.com/hyperreality/American-British-English-Translator/master/data/british_spellings.json"

# Downloaded table; filled on first use so the file is fetched once per kernel
# session instead of once per americanize() call.
_british_to_american_cache = None


def _load_british_to_american():
    """Return the British -> American spelling dict, downloading it on first use."""
    global _british_to_american_cache
    if _british_to_american_cache is None:
        response = requests.get(_BRITISH_SPELLINGS_URL, timeout=30)
        # Fail loudly on an HTTP error instead of trying to parse an error page
        response.raise_for_status()
        _british_to_american_cache = response.json()
    return _british_to_american_cache


def americanize(word, british_to_american_dict=None):
    """Replace British spellings in ``word`` with their American spellings.

    Each run of word characters is looked up as a whole in the spelling
    table, so only exact, case-sensitive matches are replaced. Plain substring
    replacement would also rewrite table entries that merely occur inside a
    longer word (e.g. with an entry 'programme' -> 'program' it would turn
    'programmer' into 'programr').

    Parameters
    ----------
    word : str
        A single token; longer text is handled word by word.
    british_to_american_dict : dict, optional
        Mapping of British to American spellings. When omitted, the table is
        downloaded from ``_BRITISH_SPELLINGS_URL`` on the first call and
        reused afterwards.

    Returns
    -------
    str
        ``word`` with every matching whole word replaced; all other
        characters are left untouched.

    Raises
    ------
    requests.RequestException
        If the spelling table has to be downloaded and the request fails.
    """
    if british_to_american_dict is None:
        british_to_american_dict = _load_british_to_american()

    def _swap(match):
        token = match.group(0)
        return british_to_american_dict.get(token, token)

    # `re` is imported in the notebook's import cell
    return re.sub(r"\w+", _swap, word)

In [12]:
# import requests

# def britishize(string):
#     url ="https://raw.githubusercontent.com/hyperreality/American-British-English-Translator/master/data/american_spellings.json"
#     american_to_british_dict = requests.get(url).json()    

#     for american_spelling, british_spelling in american_to_british_dict.items():
#         string = string.replace(american_spelling, british_spelling)
  
#     return string

In [13]:
# Feed British spellings so that the cell actually exercises the conversion
# (American inputs would come back unchanged whether or not it works).
test_british = [['flavour', 'colour'], ['apologise', 'travelled']]
test_americanize = [[americanize(w) for w in each] for each in test_british]
test_americanize

[['flavor', 'color'], ['apologize', 'traveled']]

In [14]:
# test_britishize = [['flavor', 'color'], ['apologize', 'traveled']]
# test_britishize = [[britishize(w) for w in each] for each in test_britishize]
# test_britishize

In [15]:
# len(tokenised_removed_campaigns)

In [16]:
# Apply americanize to every token of every tokenised text in the batch
tokenised_amer = [
    [americanize(token) for token in tokens]
    for tokens in tokenised_words
]
tokenised_amer

[['finally', 'covid', '19', 'vaccine'],
 ['assam', 't', 'covid', 'r', 'vaccine', 'ahi', 'golslightly_smiling_face'],
 ['first',
  'batch',
  '#',
  'covidvaccine',
  'ne',
  'reached',
  'lgbi',
  'airport',
  ',',
  '#',
  'guwahati',
  '.',
  'around',
  '20',
  'boxes',
  'vaccines',
  'received',
  'today',
  'at',
  'guwahati',
  ',',
  'out',
  'which',
  '17',
  'are',
  'assam',
  '3',
  'are',
  'meghalaya',
  '.'],
 ['state',
  'task',
  'force',
  'stf',
  'held',
  'it',
  'is',
  'second',
  'meeting',
  'covid',
  '-',
  '19',
  'vaccine',
  '.',
  '#',
  'nagaland',
  'read',
  '-'],
 ['strengthening',
  'efforts',
  'health',
  'counselors',
  'effective',
  'roll',
  'out',
  'covid',
  'vaccine'],
 ['covid',
  'vaccine',
  'arrive',
  'at',
  '20',
  'cartons',
  'vaccines',
  'arrive',
  'at'],
 ['#',
  'new_video_in_hindi',
  'The',
  'government',
  'should',
  'rethink',
  'again',
  '#',
  'covidvaccine',
  ',',
  'do',
  'not',
  'take',
  'any',
  'step',
  'in

In [17]:
# Flatten each token list into one ", "-separated string so the batch can be
# stored as a single-column CSV (one row per text).
# NOTE: a token that itself contains ", " cannot be told apart from the
# separator when the strings are split again.
convert_to_str = [", ".join(each) for each in tokenised_amer]

converted_df = pd.DataFrame(convert_to_str)

converted_df.to_csv('american-38k-41k.csv')

## Conversion of Numerals

In [18]:
# new_df = pd.read_csv("american-2000-4000.csv")

In [19]:
# new_df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [20]:
# thelist = []

# for row in new_df['0']:
#     alist = row.split(", ")
#     thelist.append(alist)

In [21]:
# thelist

In [22]:
#check if text has numbers 
def hasNumbers(inputList):
    """Return True if any token in any of the token lists is made up only of digits.

    inputList is an iterable of token sequences. A token counts as a number
    only when str.isdigit() is true for the whole token, so mixed tokens
    such as 'covid19' do not count.
    """
    return any(token.isdigit() for tokens in inputList for token in tokens)

# Does the raw tokenised batch contain any all-digit tokens?
hasNumbers(tokenised_words)

True

In [23]:
# Since there are numbers in the text, they need to be converted to words, e.g. "2" to "two"

# inflect engine used by convert_number to spell out digits
p = inflect.engine()
  
# convert number into words
def convert_number(text):
    """Spell out every all-digit word of ``text``.

    The text is split on whitespace; each piece for which str.isdigit() is
    true is passed to the module-level inflect engine ``p`` and replaced by
    the result of number_to_words, every other piece is kept as it is. The
    pieces are then joined back together with single spaces.
    """
    converted = [
        p.number_to_words(piece) if piece.isdigit() else piece
        for piece in text.split()
    ]
    return ' '.join(converted)

In [24]:
# Example inputs containing numbers: a full sentence and a bare number
input_list = ['There are 3 balls in this bag, and 12 in the other one.', '29']
result = list(map(convert_number, input_list))
result

['There are three balls in this bag, and twelve in the other one.',
 'twenty-nine']

In [25]:
# Spell out the all-digit tokens of the Americanized batch.
# NOTE: the input variable has to be changed if the preceding spelling
# conversion step produces a differently named result.

tokenised_numerals_converted = [
    [convert_number(token) for token in tokens]
    for tokens in tokenised_amer
]

In [26]:
#check if text has numbers 
def hasNumbers(inputList):
    """Report whether any token list in ``inputList`` holds an all-digit token.

    This repeats the hasNumbers definition given earlier in the notebook, with
    the same behaviour: True as soon as one token satisfies str.isdigit(),
    False when none does.
    """
    for tokens in inputList:
        if any(token.isdigit() for token in tokens):
            return True
    return False

# Re-run the check on the converted batch to confirm no all-digit tokens remain
hasNumbers(tokenised_numerals_converted)

False

## Misspellings

In [27]:
# Shared default checker, created on first use so that a SpellChecker is not
# rebuilt for every token list that is corrected.
_default_spell_checker = None


def correct_typo(tokens, spell=None):
    """Return ``tokens`` with unknown words replaced by their suggested spelling.

    Parameters
    ----------
    tokens : iterable of str
        The tokens of one text.
    spell : SpellChecker, optional
        Checker to use; any object offering ``unknown(words)`` and
        ``correction(word)`` works. Pass a checker whose vocabulary has been
        extended when domain terms must be left alone. When omitted, one
        default ``SpellChecker()`` is created lazily and shared by all calls.

    Returns
    -------
    list of str
        One entry per input token. A token is replaced only when the checker
        reports it as unknown and offers a correction; if the checker has no
        suggestion (``correction`` returns None) the original token is kept,
        so the result never contains None.
    """
    global _default_spell_checker
    if spell is None:
        if _default_spell_checker is None:
            _default_spell_checker = SpellChecker()
        spell = _default_spell_checker

    corrected = []
    for token in tokens:
        if len(spell.unknown([token])) > 0:
            suggestion = spell.correction(token)
            # Keep the original token when there is no candidate correction
            corrected.append(token if suggestion is None else suggestion)
        else:
            corrected.append(token)
    return corrected

In [28]:
# Quick check of correct_typo on a few deliberately misspelled words
misspelled_test = ['greot', 'coyor', 'and', 'misspeling']
correct_typo(misspelled_test)

['great', 'color', 'and', 'misspelling']

In [29]:
# Spell-check every token list of the numeral-converted batch
tokenised_spellchecked = list(map(correct_typo, tokenised_numerals_converted))

In [30]:
# Inspect the spell-checked batch.
# NOTE(review): the displayed output shows domain terms being altered by the
# spell checker (e.g. 'covid' -> 'couid', 'assam' -> 'assay', 'stf' -> 'sta').
tokenised_spellchecked

[['finally', 'couid', 'nineteen', 'vaccine'],
 ['assay', 'i', 'couid', 'i', 'vaccine', 'ahi', 'golslightly_smiling_face'],
 ['first',
  'batch',
  '#',
  'covidvaccine',
  'ne',
  'reached',
  'lgby',
  'airport',
  ',',
  '#',
  'guwahati',
  '.',
  'around',
  'twenty',
  'boxes',
  'vaccines',
  'received',
  'today',
  'at',
  'guwahati',
  ',',
  'out',
  'which',
  'seventeen',
  'are',
  'assay',
  'three',
  'are',
  'meghalaya',
  '.'],
 ['state',
  'task',
  'force',
  'sta',
  'held',
  'it',
  'is',
  'second',
  'meeting',
  'couid',
  '-',
  'nineteen',
  'vaccine',
  '.',
  '#',
  'nagaland',
  'read',
  '-'],
 ['strengthening',
  'efforts',
  'health',
  'counselors',
  'effective',
  'roll',
  'out',
  'couid',
  'vaccine'],
 ['couid',
  'vaccine',
  'arrive',
  'at',
  'twenty',
  'cartons',
  'vaccines',
  'arrive',
  'at'],
 ['#',
  'new_video_in_hindi',
  'The',
  'government',
  'should',
  'rethink',
  'again',
  '#',
  'covidvaccine',
  ',',
  'do',
  'not',
  '

In [31]:
# Flatten each spell-checked token list into one ", "-separated string so the
# batch can be stored as a single-column CSV (one row per text).
# NOTE: a token that itself contains ", " cannot be told apart from the
# separator when the strings are split again.
convert_to_str = [", ".join(each) for each in tokenised_spellchecked]

converted_df = pd.DataFrame(convert_to_str)

converted_df.to_csv('corrected-38k-41k.csv')