In [41]:
import re
import string 
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

#NLTK POS tagger
import nltk
nltk.download('averaged_perceptron_tagger')

#initialize stopwords
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords
from html.parser import HTMLParser
import html

from autocorrect import Speller
from langdetect import detect
import itertools

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\metes\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\metes\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:
text = " #Tesla is reevaluating the wy it sels electric cars in China? its second-largest market,Beijing where traffic plunged during COVID restrictions. Francis Maguire reports. "


In [7]:
def split_into_sentences(text):
    """Split ``text`` into sentences.

    Args:
        text: The raw string to segment.

    Returns:
        The list of sentence strings produced by NLTK's ``sent_tokenize``.
    """
    sentences = sent_tokenize(text)
    return sentences

split_into_sentences(text)

['Tesla is reevaluating the way it sells electric cars in China.',
 'its second-largest market,Beijing where traffic plunged during COVID restrictions.',
 'Francis Maguire reports.']

In [8]:
def split_into_words(text):
    """Split ``text`` into word and punctuation tokens.

    Args:
        text: The raw string to tokenize.

    Returns:
        The list of token strings produced by NLTK's ``word_tokenize``.
    """
    tokens = word_tokenize(text)
    return tokens
split_into_words(text)

['Tesla',
 'is',
 'reevaluating',
 'the',
 'way',
 'it',
 'sells',
 'electric',
 'cars',
 'in',
 'China',
 '.',
 'its',
 'second-largest',
 'market',
 ',',
 'Beijing',
 'where',
 'traffic',
 'plunged',
 'during',
 'COVID',
 'restrictions',
 '.',
 'Francis',
 'Maguire',
 'reports',
 '.']

In [9]:
def lower_case_text(text):
    """Return a lower-cased copy of ``text``.

    Args:
        text: The string to convert.

    Returns:
        ``text`` with every cased character converted to lower case.
    """
    lowered = text.lower()
    return lowered

lower_case_text(text)

'tesla is reevaluating the way it sells electric cars in china. its second-largest market,beijing where traffic plunged during covid restrictions. francis maguire reports.'

In [13]:
def remove_punctuation(text):
    """Drop tokens that consist solely of punctuation characters.

    The text is tokenized with ``word_tokenize`` and the surviving tokens are
    re-joined with single spaces. Tokens that merely *contain* punctuation
    (for example ``second-largest``) are kept unchanged.

    Args:
        text: The string to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # A set of single characters: testing ``token in string.punctuation``
    # would be a substring test, which misses multi-character punctuation
    # tokens such as "...", "``" or "''".
    punctuation_chars = set(string.punctuation)
    kept_tokens = [
        token
        for token in word_tokenize(text)
        if not all(char in punctuation_chars for char in token)
    ]
    return " ".join(kept_tokens)

remove_punctuation(text)

'Tesla is reevaluating the way it sells electric cars in China its second-largest market Beijing where traffic plunged during COVID restrictions Francis Maguire reports'

In [16]:

def remove_unicode(text):
    """Strip mentions, URLs, a leading retweet marker and non-alphanumerics.

    Every match of the pattern below is replaced by a space, after which
    whitespace is collapsed. The result therefore contains only ASCII
    letters, digits and single spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string with no leading, trailing or repeated whitespace.
    """
    noise_pattern = (
        r"(@[A-Za-z0-9]+)"       # whole @mention (the bracket must not be escaped)
        r"|([^0-9A-Za-z \t])"    # any char that is not ASCII alphanumeric, space or tab
        r"|(\w+:\/\/\S+)"        # scheme://... URLs
        r"|^rt\b"                # a leading "rt" marker, but not words that merely start with "rt"
        r"|http.+?"              # "http" plus one following char when no "://" follows
    )
    text = re.sub(noise_pattern, " ", text)

    # split()/join collapses the inserted spaces (and any tabs) to single spaces.
    return " ".join(text.split())

remove_unicode(text)

'Tesla is reevaluating the way it sells electric cars in China its second largest market Beijing where traffic plunged during COVID restrictions Francis Maguire reports'

In [19]:
def remove_leading_trailing_whitespaces(text):
    """Remove whitespace from both ends of ``text``.

    Interior whitespace is left untouched.

    Args:
        text: The string to trim.

    Returns:
        ``text`` without leading or trailing whitespace.
    """
    # str.strip() runs in linear time; the former regex ``^\s+|\s+$``
    # backtracks quadratically on long whitespace runs inside the string.
    return text.strip()

remove_leading_trailing_whitespaces(text)

'Tesla is reevaluating the way it sells electric cars in China? its second-largest market,Beijing where traffic plunged during COVID restrictions. Francis Maguire reports.'

In [25]:
def remove_duplicate_whitespaces(text):
    """Collapse every run of whitespace in ``text`` to a single space.

    Because the text is split on whitespace and re-joined, leading and
    trailing whitespace disappears as well.

    Args:
        text: The string to normalise.

    Returns:
        The whitespace-separated pieces of ``text`` joined by single spaces.
    """
    pieces = text.split()
    return " ".join(pieces)
remove_duplicate_whitespaces(text)

'Tesla is reevaluating the way it sells electric cars in China? its second-largest market,Beijing where traffic plunged during COVID restrictions. Francis Maguire reports.'

In [26]:
def detect_language(text):
    """Detect the language of ``text`` with ``langdetect.detect``.

    Args:
        text: The string whose language should be identified.

    Returns:
        The value returned by ``detect`` (``'en'`` for the sample text), or
        ``None`` when detection fails.
    """
    try:
        return detect(text)
    except Exception:
        # Best effort: report the failure and return None. Previously the
        # function fell through to ``return language`` with the name unbound,
        # which raised UnboundLocalError.
        print("language can not be detected")
        return None

detect_language(text)

'en'

In [29]:
def correct_grammar(text, spell_checker=None):
    """Squeeze over-repeated letters and run the text through a spell checker.

    Args:
        text: The string to correct.
        spell_checker: Optional callable taking and returning a string. When
            omitted, a new ``Speller(lang='en')`` is created for this call;
            pass an existing instance to avoid rebuilding it for every text.

    Returns:
        The spell-checked string.
    """
    # A letter should not appear more than twice in a row ("soooo" -> "soo").
    # Only alphabetic runs are squeezed, so numbers ("10000"), ellipses and
    # whitespace runs pass through untouched.
    squeezed = "".join(
        "".join(run)[:2] if char.isalpha() else "".join(run)
        for char, run in itertools.groupby(text)
    )
    if spell_checker is None:
        spell_checker = Speller(lang='en')
    return spell_checker(squeezed)
correct_grammar(text)

' Tesla is evaluating the wy it self electric cars in China? its second-largest market,Beijing where traffic plunged during COVID restrictions. Francis Mature reports. '

In [33]:
def remove_stopwrods(text):
    """Remove English stop words from ``text``.

    The text is tokenized with ``word_tokenize``; tokens found in NLTK's
    English stop-word list are dropped and the rest are joined with single
    spaces. Membership is an exact, case-sensitive string comparison.

    Args:
        text: The string to filter.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # A set gives constant-time membership tests instead of scanning the
    # whole stop-word list once per token.
    stop_set = frozenset(stopwords.words('english'))
    return " ".join(
        token for token in word_tokenize(text) if token not in stop_set
    )
remove_stopwrods(text)

'Tesla reevaluating wy sels electric cars China ? second-largest market , Beijing traffic plunged COVID restrictions . Francis Maguire reports .'

In [34]:
def apply_stemming(text):
    """Stem every token of ``text`` with a Porter stemmer.

    Args:
        text: The string to process.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    porter = PorterStemmer()
    stems = []
    for token in word_tokenize(text):
        stems.append(porter.stem(token))
    return " ".join(stems)

apply_stemming(text)

'tesla is reevalu the wy it sel electr car in china ? it second-largest market , beij where traffic plung dure covid restrict . franci maguir report .'

In [35]:
def apply_lammatization(text):
    """Lemmatize every token of ``text`` with the WordNet lemmatizer.

    ``lemmatize`` is called without a part-of-speech argument, so each token
    is processed with the lemmatizer's default setting.

    Args:
        text: The string to process.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(wordnet_lemmatizer.lemmatize(token))
    return " ".join(lemmas)
apply_lammatization(text)

'Tesla is reevaluating the wy it sels electric car in China ? it second-largest market , Beijing where traffic plunged during COVID restriction . Francis Maguire report .'

In [37]:
def remove_hashtags(text):
    """Delete every ``#`` character from ``text``.

    Only the hash sign itself is removed; the word that follows it stays.

    Args:
        text: The string to clean.

    Returns:
        ``text`` without any ``#`` characters.
    """
    hash_sign = re.compile(r'#')
    return hash_sign.sub('', text)

remove_hashtags(text)

' Tesla is reevaluating the wy it sels electric cars in China? its second-largest market,Beijing where traffic plunged during COVID restrictions. Francis Maguire reports. '

In [42]:
def clean_html_code(text):
    """Convert HTML character references in ``text`` to plain characters.

    Text collected from the web often contains entities such as ``&apos;``,
    ``&amp;`` or ``&lt;``; they are decoded with ``html.unescape``.

    Args:
        text (str): The string that may contain HTML entities.

    Returns:
        str: ``text`` with its character references replaced by the
        characters they stand for.
    """
    return html.unescape(text)

clean_html_code(text)

' #Tesla is reevaluating the wy it sels electric cars in China? its second-largest market,Beijing where traffic plunged during COVID restrictions. Francis Maguire reports. '

In [43]:
def replace_contraction(text):
    """Expand common English contractions in ``text``.

    Irregular forms whose stem changes ("won't", "can't", "shan't") are
    expanded first; afterwards the generic suffix rules ("n't", "'m", "'ll",
    ...) are applied. Both the straight (') and the typographic (’)
    apostrophe are recognised.

    Note that "'s" is always expanded to " is", so possessives such as
    "Tesla's" are rewritten as well.

    Args:
        text: The string to expand.

    Returns:
        ``text`` with the known contractions replaced.
    """
    # Must run before the generic "n't" rule, which would otherwise yield
    # "wo not" / "ca not" / "sha not".
    irregular_dict = {"won't": "will not", "can't": "can not",
                      "shan't": "shall not"}
    apostrophe_dict = {"'s": " is", "n't": " not", "'m": " am", "'ll": " will",
                       "'d": " would", "'ve": " have", "'re": " are"}

    rules = []
    for key, value in irregular_dict.items():
        rules.append((key, value))
        # Sentence-initial variant, e.g. "Won't" -> "Will not".
        rules.append((key.capitalize(), value.capitalize()))
    rules.extend(apostrophe_dict.items())

    # replace the contractions
    for key, value in rules:
        for apostrophe in ("'", "\u2019"):
            text = text.replace(key.replace("'", apostrophe), value)
    return text

replace_contraction(text)

' #Tesla is reevaluating the wy it sels electric cars in China? its second-largest market,Beijing where traffic plunged during COVID restrictions. Francis Maguire reports. '

In [44]:
 
         







 



def get_pos_tags(text):
    """Tag every token of ``text`` with its part of speech.

    Args:
        text: The string to tokenize and tag.

    Returns:
        The result of ``nltk.pos_tag`` for the tokens of ``text``: a list of
        ``(token, tag)`` tuples.
    """
    return nltk.pos_tag(nltk.word_tokenize(text))
get_pos_tags(text)

[('#', '#'),
 ('Tesla', 'NNP'),
 ('is', 'VBZ'),
 ('reevaluating', 'VBG'),
 ('the', 'DT'),
 ('wy', 'NN'),
 ('it', 'PRP'),
 ('sels', 'VBZ'),
 ('electric', 'JJ'),
 ('cars', 'NNS'),
 ('in', 'IN'),
 ('China', 'NNP'),
 ('?', '.'),
 ('its', 'PRP$'),
 ('second-largest', 'JJ'),
 ('market', 'NN'),
 (',', ','),
 ('Beijing', 'NNP'),
 ('where', 'WRB'),
 ('traffic', 'NN'),
 ('plunged', 'VBD'),
 ('during', 'IN'),
 ('COVID', 'NNP'),
 ('restrictions', 'NNS'),
 ('.', '.'),
 ('Francis', 'NNP'),
 ('Maguire', 'NNP'),
 ('reports', 'NNS'),
 ('.', '.')]