<a href="https://colab.research.google.com/github/ritwiks9635/NLP_Project_File/blob/main/Text__Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Preprocessing

In [None]:
import numpy as np
import pandas as pd
import string
import nltk
import re

In [None]:
data = pd.read_csv("twcs/twcs.csv", nrows = 5000)
data.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2.0,3.0
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so th...,3.0,5.0
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0


In [None]:
data.shape

(5000, 7)

In [None]:
data = data[["text"]].astype(str)

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5000 non-null   object
dtypes: object(1)
memory usage: 39.2+ KB


In [None]:
data["lower_text"] = data["text"].str.lower()
data.head()

Unnamed: 0,text,lower_text
0,@115712 I understand. I would like to assist y...,@115712 i understand. i would like to assist y...
1,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that
2,@sprintcare I have sent several private messag...,@sprintcare i have sent several private messag...
3,@115712 Please send us a Private Message so th...,@115712 please send us a private message so th...
4,@sprintcare I did.,@sprintcare i did.


In [None]:
# Characters treated as punctuation (ASCII punctuation from the stdlib).
punct = string.punctuation


def punct_text(text):
  """Return `text` with every character listed in `punct` deleted.

  All other characters, including whitespace, are left untouched.
  """
  # Mapping each punctuation character to None makes translate() drop it.
  deletion_table = str.maketrans(dict.fromkeys(punct))
  return text.translate(deletion_table)

In [None]:
# Punctuation-free copy of each tweet.
data["punct_text"] = data["text"].apply(punct_text)

In [None]:
data.head()

Unnamed: 0,text,lower_text,punct_text
0,@115712 I understand. I would like to assist y...,@115712 i understand. i would like to assist y...,115712 I understand I would like to assist you...
1,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that,sprintcare and how do you propose we do that
2,@sprintcare I have sent several private messag...,@sprintcare i have sent several private messag...,sprintcare I have sent several private message...
3,@115712 Please send us a Private Message so th...,@115712 please send us a private message so th...,115712 Please send us a Private Message so tha...
4,@sprintcare I did.,@sprintcare i did.,sprintcare I did


In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Build the English stop-word set. The name `stopwords` (imported above as the
# corpus reader) is rebound to a plain set here, so the reader is reached through
# `nltk.corpus` instead: that keeps this cell safe to re-run, whereas
# `stopwords.words(...)` would fail once `stopwords` is already a set.
stopwords = set(nltk.corpus.stopwords.words("english"))

In [None]:
def remove_word(text):
  """Return `text` with stop words removed.

  The input is cast to `str` and split on whitespace; tokens whose
  lower-cased form is in the module-level `stopwords` set are dropped and
  the remaining tokens are re-joined with single spaces.

  The comparison is case-insensitive because the stop-word list is
  lower-case: without it, capitalised tokens such as "I" or "We" are
  never removed.
  """
  return " ".join([word for word in str(text).split() if word.lower() not in stopwords])

In [None]:
# Copy of each tweet with stop words removed.
data["stopword"] = data["text"].apply(remove_word)

In [None]:
data.head()

Unnamed: 0,text,lower_text,punct_text,stopword
0,@115712 I understand. I would like to assist y...,@115712 i understand. i would like to assist y...,115712 I understand I would like to assist you...,@115712 I understand. I would like assist you....
1,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that,sprintcare and how do you propose we do that,@sprintcare propose
2,@sprintcare I have sent several private messag...,@sprintcare i have sent several private messag...,sprintcare I have sent several private message...,@sprintcare I sent several private messages on...
3,@115712 Please send us a Private Message so th...,@115712 please send us a private message so th...,115712 Please send us a Private Message so tha...,@115712 Please send us Private Message assist ...
4,@sprintcare I did.,@sprintcare i did.,sprintcare I did,@sprintcare I did.


In [None]:
from collections import Counter
# Token frequencies over the stop-word-filtered tweets.
cnt = Counter()
for tweet in data["stopword"].values:
  cnt.update(tweet.split())
cnt.most_common(10)

[('I', 1431),
 ('us', 658),
 ('DM', 436),
 ('Please', 372),
 ('We', 338),
 ('get', 277),
 ('like', 228),
 ('@Tesco', 222),
 ("I'm", 221),
 ('Hi', 218)]

In [None]:
# The ten most frequent tokens according to `cnt`.
frequent = {token for token, _count in cnt.most_common(10)}


def remove_freq(text):
  """Return `text` without the tokens listed in `frequent`.

  The input is cast to `str`, split on whitespace, filtered, and re-joined
  with single spaces.
  """
  kept = []
  for token in str(text).split():
    if token in frequent:
      continue
    kept.append(token)
  return " ".join(kept)

In [None]:
# Copy of each tweet with the most frequent tokens removed.
data["freq_text"] = data["text"].apply(remove_freq)

In [None]:
data.head()

Unnamed: 0,text,lower_text,punct_text,stopword,freq_text
0,@115712 I understand. I would like to assist y...,@115712 i understand. i would like to assist y...,115712 I understand I would like to assist you...,@115712 I understand. I would like assist you....,@115712 understand. would to assist you. would...
1,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that,sprintcare and how do you propose we do that,@sprintcare propose,@sprintcare and how do you propose we do that
2,@sprintcare I have sent several private messag...,@sprintcare i have sent several private messag...,sprintcare I have sent several private message...,@sprintcare I sent several private messages on...,@sprintcare have sent several private messages...
3,@115712 Please send us a Private Message so th...,@115712 please send us a private message so th...,115712 Please send us a Private Message so tha...,@115712 Please send us Private Message assist ...,@115712 send a Private Message so that we can ...
4,@sprintcare I did.,@sprintcare i did.,sprintcare I did,@sprintcare I did.,@sprintcare did.


In [None]:
n_rare_words = 10

In [None]:
# The `n_rare_words` least frequent tokens: most_common() is sorted by
# descending count, so the reversed tail slice picks them from the end.
rare_word = {token for token, _count in cnt.most_common()[:-n_rare_words - 1:-1]}

In [None]:
def remove_rare(text):
  """Return `text` without the tokens listed in `rare_word`.

  The input is cast to `str`, split on whitespace, filtered, and re-joined
  with single spaces.
  """
  tokens = str(text).split()
  return " ".join(filter(lambda token: token not in rare_word, tokens))

In [None]:
# Copy of each tweet with the rarest tokens removed.
data["rare_text"] = data["text"].apply(remove_rare)

In [None]:
data.head()

Unnamed: 0,text,lower_text,punct_text,stopword,freq_text,rare_text
0,@115712 I understand. I would like to assist y...,@115712 i understand. i would like to assist y...,115712 I understand I would like to assist you...,@115712 I understand. I would like assist you....,@115712 understand. would to assist you. would...,@115712 I understand. I would like to assist y...
1,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that,sprintcare and how do you propose we do that,@sprintcare propose,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that
2,@sprintcare I have sent several private messag...,@sprintcare i have sent several private messag...,sprintcare I have sent several private message...,@sprintcare I sent several private messages on...,@sprintcare have sent several private messages...,@sprintcare I have sent several private messag...
3,@115712 Please send us a Private Message so th...,@115712 please send us a private message so th...,115712 Please send us a Private Message so tha...,@115712 Please send us Private Message assist ...,@115712 send a Private Message so that we can ...,@115712 Please send us a Private Message so th...
4,@sprintcare I did.,@sprintcare i did.,sprintcare I did,@sprintcare I did.,@sprintcare did.,@sprintcare I did.


In [None]:
from nltk.stem.porter import PorterStemmer
# Shared Porter stemmer instance (variable name kept as in the original cell).
steamer = PorterStemmer()


def stem_text(text):
  """Return `text` with each whitespace-separated token Porter-stemmed.

  Stemmed tokens are re-joined with single spaces.
  """
  return " ".join(map(steamer.stem, text.split()))

In [None]:
# Stemmed copy of each tweet.
data["stem_text"] = data["text"].apply(stem_text)

In [None]:
data.head()

Unnamed: 0,text,lower_text,punct_text,stopword,freq_text,rare_text,stem_text
0,@115712 I understand. I would like to assist y...,@115712 i understand. i would like to assist y...,115712 I understand I would like to assist you...,@115712 I understand. I would like assist you....,@115712 understand. would to assist you. would...,@115712 I understand. I would like to assist y...,@115712 i understand. i would like to assist y...
1,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that,sprintcare and how do you propose we do that,@sprintcare propose,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that,@sprintcar and how do you propos we do that
2,@sprintcare I have sent several private messag...,@sprintcare i have sent several private messag...,sprintcare I have sent several private message...,@sprintcare I sent several private messages on...,@sprintcare have sent several private messages...,@sprintcare I have sent several private messag...,@sprintcar i have sent sever privat messag and...
3,@115712 Please send us a Private Message so th...,@115712 please send us a private message so th...,115712 Please send us a Private Message so tha...,@115712 Please send us Private Message assist ...,@115712 send a Private Message so that we can ...,@115712 Please send us a Private Message so th...,@115712 pleas send us a privat messag so that ...
4,@sprintcare I did.,@sprintcare i did.,sprintcare I did,@sprintcare I did.,@sprintcare did.,@sprintcare I did.,@sprintcar i did.


In [None]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()


def lemmatize_words(text):
    """Lemmatize each whitespace-separated token of `text`.

    Every token is passed to `lemmatizer.lemmatize` without a part-of-speech
    argument; results are re-joined with single spaces.

    Note: a later cell redefines `lemmatize_words` with a POS-aware version.
    """
    return " ".join(map(lemmatizer.lemmatize, text.split()))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Lemmatized copy of each tweet (no part-of-speech information).
data["lemmatize_text"] = data["text"].apply(lemmatize_words)

In [None]:
data.head()

Unnamed: 0,text,lower_text,punct_text,stopword,freq_text,rare_text,stem_text,lemmatize_text
0,@115712 I understand. I would like to assist y...,@115712 i understand. i would like to assist y...,115712 I understand I would like to assist you...,@115712 I understand. I would like assist you....,@115712 understand. would to assist you. would...,@115712 I understand. I would like to assist y...,@115712 i understand. i would like to assist y...,@115712 I understand. I would like to assist y...
1,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that,sprintcare and how do you propose we do that,@sprintcare propose,@sprintcare and how do you propose we do that,@sprintcare and how do you propose we do that,@sprintcar and how do you propos we do that,@sprintcare and how do you propose we do that
2,@sprintcare I have sent several private messag...,@sprintcare i have sent several private messag...,sprintcare I have sent several private message...,@sprintcare I sent several private messages on...,@sprintcare have sent several private messages...,@sprintcare I have sent several private messag...,@sprintcar i have sent sever privat messag and...,@sprintcare I have sent several private messag...
3,@115712 Please send us a Private Message so th...,@115712 please send us a private message so th...,115712 Please send us a Private Message so tha...,@115712 Please send us Private Message assist ...,@115712 send a Private Message so that we can ...,@115712 Please send us a Private Message so th...,@115712 pleas send us a privat messag so that ...,@115712 Please send u a Private Message so tha...
4,@sprintcare I did.,@sprintcare i did.,sprintcare I did,@sprintcare I did.,@sprintcare did.,@sprintcare I did.,@sprintcar i did.,@sprintcare I did.


In [None]:
print("Word is : stripes")
print("Lemma result for verb : ",lemmatizer.lemmatize("stripes", 'v'))
print("Lemma result for noun : ",lemmatizer.lemmatize("stripes", 'n'))

Word is : stripes
Lemma result for verb :  strip
Lemma result for noun :  stripe


In [None]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')

lemmatizer = WordNetLemmatizer()

# First letter of the tagger's POS tag -> WordNet part-of-speech constant.
wordnet_map = {
    "N": wordnet.NOUN,
    "V": wordnet.VERB,
    "J": wordnet.ADJ,
    "R": wordnet.ADV,
}


def lemmatize_words(text):
    """Lemmatize each token of `text` using its part-of-speech tag.

    Tokens are obtained by whitespace splitting and tagged with
    `nltk.pos_tag`. The first character of each tag is looked up in
    `wordnet_map`; tags with no entry fall back to `wordnet.NOUN`.
    Lemmas are re-joined with single spaces.
    """
    lemmas = []
    for token, tag in nltk.pos_tag(text.split()):
        wn_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wn_pos))
    return " ".join(lemmas)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# POS-aware lemmatized copy of each tweet.
data["lemmatize_words"] = data["text"].apply(lemmatize_words)

In [None]:
# Compiled once at definition time. The ranges are listed explicitly: a single
# catch-all range from U+24C2 to U+1F251 would also span CJK ideographs, kana
# and Hangul, and so delete ordinary non-Latin text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F170-\U0001F251"  # enclosed alphanumerics / ideographs
    "\U0001F004\U0001F0CF"   # mahjong red dragon, playing-card joker
    "\u2600-\u27BF"          # miscellaneous symbols and dingbats
    "\u2B05-\u2B07\u2B1B\u2B1C\u2B50\u2B55"  # arrows, squares, star, circle
    "\u3030\u303D\u3297\u3299"  # wavy dash, part alternation mark, circled ideographs
    "\u24C2"                 # circled M
    "\uFE0F"                 # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(string):
    """Return `string` with emoji characters removed.

    Runs of characters that fall in the emoji ranges of `_EMOJI_PATTERN`
    are deleted; everything else (including surrounding whitespace and
    non-Latin text) is kept.

    Note: the parameter name shadows the `string` module inside this
    function; it is kept for compatibility with keyword callers.
    """
    return _EMOJI_PATTERN.sub(r'', string)

In [None]:
remove_emoji("this is good 🔥🔥")

'this is good '

In [None]:
remove_emoji("this is good 😄😁")

'this is good '

In [None]:
# First copy the `EMOTICONS` dictionary from
# https://github.com/NeelShah18/emot/blob/master/emot/emo_unicode.py
# into this notebook, then run the cells below.

In [None]:
def remove_emoticons(text):
    """Return `text` with every emoticon matched by an `EMOTICONS` key removed.

    The keys of `EMOTICONS` are joined with `|` into a single alternation
    and used as a regular expression, so they are interpreted as regex
    patterns rather than literal strings.
    """
    alternatives = u'|'.join(EMOTICONS)
    return re.sub(u'(' + alternatives + u')', r'', text)

remove_emoticons("Hello :-)")

In [None]:
def convert_emoticons(text):
    """Replace emoticons in `text` with a word label taken from `EMOTICONS`.

    Each key of `EMOTICONS` is used as a regular expression; its value has
    commas removed and its words joined with underscores to form the
    replacement.
    """
    for pattern, meaning in EMOTICONS.items():
        label = "_".join(meaning.replace(",", "").split())
        text = re.sub(u'(' + pattern + ')', label, text)
    return text

text = "Hello :-) :-)"
convert_emoticons(text)

In [None]:
# First copy the emoji dictionary (referenced below as `UNICODE_EMO`) from
# https://github.com/NeelShah18/emot/blob/master/emot/emo_unicode.py
# into this notebook, then run the cell below.

In [None]:
def convert_emojis(text, emoji_map=None):
    """Replace emoji in `text` with a word label.

    Args:
        text: Input string.
        emoji_map: Optional mapping of emoji -> description. When omitted,
            the module-level `UNICODE_EMO` mapping is used.

    Returns:
        `text` with each occurrence of a mapping key replaced by its
        description, with commas and colons stripped and words joined by
        underscores.

    Keys are matched as literal text, not as regular expressions, so a key
    that contains a regex metacharacter (for example `*`) cannot raise
    `re.error` or match unintended text.
    """
    if emoji_map is None:
        emoji_map = UNICODE_EMO
    for emoji_char, description in emoji_map.items():
        label = "_".join(description.replace(",", "").replace(":", "").split())
        text = text.replace(emoji_char, label)
    return text

text = "game is on 🔥"
convert_emojis(text)

In [None]:
def remove_urls(text):
    """Return `text` with URLs removed.

    A URL is anything starting with `http://`, `https://` or `www.` and
    running up to the next whitespace character. Surrounding whitespace is
    left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

In [None]:
def remove_html_tag(text):
    """Return `text` with HTML-style tags removed.

    Anything between `<` and the next `>` on the same line is deleted
    (non-greedy match); the text between tags is kept as-is.
    """
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

text = """<div>
<h1> H2O</h1>
<p> AutoML</p>
<a href="https://www.h2o.ai/products/h2o-driverless-ai/"> Driverless AI</a>
</div>"""

# Call the function defined in this cell; `remove_html` is only defined in a
# later cell and would raise NameError on a fresh kernel.
print(remove_html_tag(text))

In [None]:
# Alternative: strip HTML with BeautifulSoup (see the next cell).

In [None]:
from bs4 import BeautifulSoup

def remove_html(text):
    """Return the text content of the HTML in `text`.

    The markup is parsed by BeautifulSoup with the "lxml" parser and the
    document's `.text` is returned.
    """
    soup = BeautifulSoup(text, "lxml")
    return soup.text

text = """<div>
<h1> H2O</h1>
<p> AutoML</p>
<a href="https://www.h2o.ai/products/h2o-driverless-ai/"> Driverless AI</a>
</div>
"""

print(remove_html(text))

In [None]:
chat_words_str = """
AFAIK=As Far As I Know
AFK=Away From Keyboard
ASAP=As Soon As Possible
ATK=At The Keyboard
ATM=At The Moment
A3=Anytime, Anywhere, Anyplace
BAK=Back At Keyboard
BBL=Be Back Later
BBS=Be Back Soon
BFN=Bye For Now
B4N=Bye For Now
BRB=Be Right Back
BRT=Be Right There
BTW=By The Way
B4=Before
B4N=Bye For Now
CU=See You
CUL8R=See You Later
CYA=See You
FAQ=Frequently Asked Questions
FC=Fingers Crossed
FWIW=For What It's Worth
FYI=For Your Information
GAL=Get A Life
GG=Good Game
GN=Good Night
GMTA=Great Minds Think Alike
GR8=Great!
G9=Genius
IC=I See
ICQ=I Seek you (also a chat program)
ILU=ILU: I Love You
IMHO=In My Honest/Humble Opinion
IMO=In My Opinion
IOW=In Other Words
IRL=In Real Life
KISS=Keep It Simple, Stupid
LDR=Long Distance Relationship
LMAO=Laugh My A.. Off
LOL=Laughing Out Loud
LTNS=Long Time No See
L8R=Later
MTE=My Thoughts Exactly
M8=Mate
NRN=No Reply Necessary
OIC=Oh I See
PITA=Pain In The A..
PRT=Party
PRW=Parents Are Watching
ROFL=Rolling On The Floor Laughing
ROFLOL=Rolling On The Floor Laughing Out Loud
ROTFLMAO=Rolling On The Floor Laughing My A.. Off
SK8=Skate
STATS=Your sex and age
ASL=Age, Sex, Location
THX=Thank You
TTFN=Ta-Ta For Now!
TTYL=Talk To You Later
U=You
U2=You Too
U4E=Yours For Ever
WB=Welcome Back
WTF=What The F...
WTG=Way To Go!
WUF=Where Are You From?
W8=Wait...
7K=Sick:-D Laugher
"""

In [None]:
# Parse the "ABBR=Expansion" lines of `chat_words_str` into a lookup table.
chat_words_map_dict = {}
for entry in chat_words_str.split("\n"):
    if entry == "":
        continue  # skip the blank lines at the start and end of the literal
    fields = entry.split("=")
    # A repeated abbreviation overwrites the earlier expansion.
    chat_words_map_dict[fields[0]] = fields[1]
# Set of known abbreviations for membership tests.
chat_words_list = set(chat_words_map_dict)

def chat_words_conversion(text):
    """Expand chat abbreviations in `text`.

    Each whitespace-separated token is upper-cased and looked up in
    `chat_words_list`; known abbreviations are replaced by their expansion
    from `chat_words_map_dict`, other tokens are kept unchanged. Tokens are
    re-joined with single spaces.
    """
    expanded = [
        chat_words_map_dict[token.upper()] if token.upper() in chat_words_list else token
        for token in text.split()
    ]
    return " ".join(expanded)

chat_words_conversion("one minute BRB")

In [None]:
### Spelling Correction ###

In [None]:
!pip install pyspellchecker

In [None]:
from spellchecker import SpellChecker

spell = SpellChecker()
def correct_spellings(text):
    """Return `text` with misspelled words replaced by the checker's suggestion.

    The text is split on whitespace; tokens reported by `spell.unknown`
    are replaced with `spell.correction(token)`, all others are kept.
    Tokens are re-joined with single spaces.

    If the checker has no suggestion for a token (`correction` returns
    None in recent pyspellchecker releases), the original token is kept
    so that the final join cannot fail on a None entry.
    """
    words = text.split()
    misspelled_words = spell.unknown(words)
    corrected_text = []
    for word in words:
        if word in misspelled_words:
            suggestion = spell.correction(word)
            corrected_text.append(word if suggestion is None else suggestion)
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

text = "speling correctin"
correct_spellings(text)