### Objective of this kernel is to understand the various text preprocessing steps with code examples.

Some of the common text preprocessing / cleaning steps are:

- Lower casing
- Removal of Punctuations
- Removal of Stopwords
- Removal of Frequent words
- Removal of Rare words
- Stemming
- Lemmatization
- Removal of URLs
- Removal of HTML tags
- Chat words conversion
- Spelling correction

In [50]:
import pandas as pd
import numpy as np
import re
import nltk
import spacy
import string



In [51]:
data = pd.read_csv('sample.csv')
data.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,119237,105834,True,Wed Oct 11 06:55:44 +0000 2017,@AppleSupport causing the reply to be disregar...,119236.0,
1,119238,ChaseSupport,False,Wed Oct 11 13:25:49 +0000 2017,@105835 Your business means a lot to us. Pleas...,,119239.0
2,119239,105835,True,Wed Oct 11 13:00:09 +0000 2017,@76328 I really hope you all change but I'm su...,119238.0,
3,119240,VirginTrains,False,Tue Oct 10 15:16:08 +0000 2017,@105836 LiveChat is online at the moment - htt...,119241.0,119242.0
4,119241,105836,True,Tue Oct 10 15:17:21 +0000 2017,@VirginTrains see attached error message. I've...,119243.0,119240.0


In [52]:
data.shape

(93, 7)

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93 entries, 0 to 92
Data columns (total 7 columns):
tweet_id                   93 non-null int64
author_id                  93 non-null object
inbound                    93 non-null bool
created_at                 93 non-null object
text                       93 non-null object
response_tweet_id          65 non-null object
in_response_to_tweet_id    68 non-null float64
dtypes: bool(1), float64(1), int64(1), object(4)
memory usage: 4.5+ KB


In [10]:
data['text'][0:5].values

array(['@AppleSupport causing the reply to be disregarded and the tapped notification under the keyboard is opened😡😡😡',
       '@105835 Your business means a lot to us. Please DM your name, zip code and additional details about your concern. ^RR https://t.co/znUu1VJn9r',
       "@76328 I really hope you all change but I'm sure you won't! Because you don't have to!",
       '@105836 LiveChat is online at the moment - https://t.co/SY94VtU8Kq or contact 03331 031 031 option 1, 4, 3 (Leave a message) to request a call back',
       "@VirginTrains see attached error message. I've tried leaving a voicemail several times in the past week https://t.co/NxVZjlYx1k"],
      dtype=object)

### Lower Casing

In [11]:
data_lower = data.copy()

In [12]:
data_lower['text'] = data_lower['text'].str.lower()
data_lower.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,119237,105834,True,Wed Oct 11 06:55:44 +0000 2017,@applesupport causing the reply to be disregar...,119236.0,
1,119238,ChaseSupport,False,Wed Oct 11 13:25:49 +0000 2017,@105835 your business means a lot to us. pleas...,,119239.0
2,119239,105835,True,Wed Oct 11 13:00:09 +0000 2017,@76328 i really hope you all change but i'm su...,119238.0,
3,119240,VirginTrains,False,Tue Oct 10 15:16:08 +0000 2017,@105836 livechat is online at the moment - htt...,119241.0,119242.0
4,119241,105836,True,Tue Oct 10 15:17:21 +0000 2017,@virgintrains see attached error message. i've...,119243.0,119240.0


### Removal of Punctuation

In [13]:
data_punc = data.copy()

In [14]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [15]:
def rem_punc(text):
    for i in text:
        if i in list(string.punctuation):
            text = text.replace(i, ' ')
    return text

In [16]:
rem_punc("Hello, Boss!")

'Hello  Boss '

In [17]:
data_punc['text'] = data['text'].apply(rem_punc)

In [18]:
data_punc.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,119237,105834,True,Wed Oct 11 06:55:44 +0000 2017,AppleSupport causing the reply to be disregar...,119236.0,
1,119238,ChaseSupport,False,Wed Oct 11 13:25:49 +0000 2017,105835 Your business means a lot to us Pleas...,,119239.0
2,119239,105835,True,Wed Oct 11 13:00:09 +0000 2017,76328 I really hope you all change but I m su...,119238.0,
3,119240,VirginTrains,False,Tue Oct 10 15:16:08 +0000 2017,105836 LiveChat is online at the moment htt...,119241.0,119242.0
4,119241,105836,True,Tue Oct 10 15:17:21 +0000 2017,VirginTrains see attached error message I ve...,119243.0,119240.0


### Removal of stopwords

In [19]:
data_stop = data.copy()

In [20]:
from nltk.corpus import stopwords

In [21]:
stop_words = stopwords.words('English')
print(stop_words 
     )

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [22]:
len(stop_words)

179

In [23]:
def rem_stop(text):
    mylist = text.split()
    [mylist.remove(i) for i in mylist  if i in stop_words]
    return " ".join(mylist)
        

In [24]:
rem_stop('Hello are you waiting in the lobby are')

'Hello you waiting the lobby'

In [25]:
data_stop['text'] = data_stop['text'].apply(rem_stop)  # Removed Stopwords
data_stop['text'] = data_stop['text'].apply(rem_punc)  # Removed Punctuation
data_stop.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,119237,105834,True,Wed Oct 11 06:55:44 +0000 2017,AppleSupport causing reply be disregarded the...,119236.0,
1,119238,ChaseSupport,False,Wed Oct 11 13:25:49 +0000 2017,105835 Your business means lot us Please DM ...,,119239.0
2,119239,105835,True,Wed Oct 11 13:00:09 +0000 2017,76328 I really hope all change I m sure won t...,119238.0,
3,119240,VirginTrains,False,Tue Oct 10 15:16:08 +0000 2017,105836 LiveChat online the moment https t...,119241.0,119242.0
4,119241,105836,True,Tue Oct 10 15:17:21 +0000 2017,VirginTrains see attached error message I ve...,119243.0,119240.0


### Remove Frequent Words

In [26]:
data_frequent = data.copy()


In [27]:
from collections import Counter

In [28]:
# Joining all text from 'text' column

a = list(data_frequent['text'].values)
a = " ".join(a)
a = a.split()
a = Counter(a)
a.most_common(10)

[('the', 57),
 ('to', 39),
 ('a', 35),
 ('I', 34),
 ('you', 31),
 ('and', 30),
 ('for', 26),
 ('your', 21),
 ('is', 20),
 ('this', 20)]

In [29]:
freq_words = a.most_common(10)

In [30]:
freq_words

[('the', 57),
 ('to', 39),
 ('a', 35),
 ('I', 34),
 ('you', 31),
 ('and', 30),
 ('for', 26),
 ('your', 21),
 ('is', 20),
 ('this', 20)]

In [31]:
def rem_freq(text):
    mylist = text.split()
    for i in mylist:
        for k, v in freq_words:
            if i == k:
                mylist.remove(i)
        
    return " ".join(mylist)

In [32]:
rem_freq('I really hope change I sure the Because')

'really hope change sure Because'

In [33]:
data_frequent['text'] = data_frequent['text'].apply(rem_freq)
data_frequent.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,119237,105834,True,Wed Oct 11 06:55:44 +0000 2017,@AppleSupport causing reply be disregarded tap...,119236.0,
1,119238,ChaseSupport,False,Wed Oct 11 13:25:49 +0000 2017,@105835 Your business means lot us. Please DM ...,,119239.0
2,119239,105835,True,Wed Oct 11 13:00:09 +0000 2017,@76328 really hope all change but I'm sure won...,119238.0,
3,119240,VirginTrains,False,Tue Oct 10 15:16:08 +0000 2017,@105836 LiveChat online at moment - https://t....,119241.0,119242.0
4,119241,105836,True,Tue Oct 10 15:17:21 +0000 2017,@VirginTrains see attached error message. I've...,119243.0,119240.0


### Removal of Rare Words

In [34]:
data_rare = data.copy()

In [35]:
rare_words = a.most_common()[-10:]
rare_words

[('slowdown.', 1),
 ('keen', 1),
 ('thin', 1),
 ('green', 1),
 ('line', 1),
 ('https://t.co/9281OKEebk', 1),
 ('including', 1),
 ('browser', 1),
 ('log', 1),
 ('Lee', 1)]

In [36]:
def rem_rare(text):
    mylist = text.split()
    for k, v in rare_words:
        for item in mylist:
            if k == item:
                mylist.remove(k)
    return " ".join(mylist)

In [37]:
rem_rare('Lee log in log browser it was green')

'in it was'

In [38]:
data_rare['text'] = data_rare['text'].apply(rem_rare)
data_rare.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,119237,105834,True,Wed Oct 11 06:55:44 +0000 2017,@AppleSupport causing the reply to be disregar...,119236.0,
1,119238,ChaseSupport,False,Wed Oct 11 13:25:49 +0000 2017,@105835 Your business means a lot to us. Pleas...,,119239.0
2,119239,105835,True,Wed Oct 11 13:00:09 +0000 2017,@76328 I really hope you all change but I'm su...,119238.0,
3,119240,VirginTrains,False,Tue Oct 10 15:16:08 +0000 2017,@105836 LiveChat is online at the moment - htt...,119241.0,119242.0
4,119241,105836,True,Tue Oct 10 15:17:21 +0000 2017,@VirginTrains see attached error message. I've...,119243.0,119240.0


### Stemming

In [39]:
data_stem = data.copy()


In [40]:
from nltk.stem.porter  import PorterStemmer

In [41]:
stemmer = PorterStemmer()

In [42]:
def stem_words(text):
    mylist = [stemmer.stem(word) for word in text.split()]
    mytext = " ".join(mylist)
    return mytext

In [43]:
stem_words('i was going to play')

'i wa go to play'

In [44]:
data_stem['stem_text'] = data_stem['text'].apply(stem_words)
data_stem.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,stem_text
0,119237,105834,True,Wed Oct 11 06:55:44 +0000 2017,@AppleSupport causing the reply to be disregar...,119236.0,,@applesupport caus the repli to be disregard a...
1,119238,ChaseSupport,False,Wed Oct 11 13:25:49 +0000 2017,@105835 Your business means a lot to us. Pleas...,,119239.0,@105835 your busi mean a lot to us. pleas DM y...
2,119239,105835,True,Wed Oct 11 13:00:09 +0000 2017,@76328 I really hope you all change but I'm su...,119238.0,,@76328 I realli hope you all chang but i'm sur...
3,119240,VirginTrains,False,Tue Oct 10 15:16:08 +0000 2017,@105836 LiveChat is online at the moment - htt...,119241.0,119242.0,@105836 livechat is onlin at the moment - http...
4,119241,105836,True,Tue Oct 10 15:17:21 +0000 2017,@VirginTrains see attached error message. I've...,119243.0,119240.0,@virgintrain see attach error message. i'v tri...


### Lemmatization

In [45]:
data_lemma = data.copy()

In [46]:
from nltk.stem import WordNetLemmatizer

In [47]:
lemma = WordNetLemmatizer()

In [48]:
def lemmatize_word(text):
    mylist = [lemma.lemmatize(word, 'n') for word in text.split() ]      # Lemmatize Nouns
    mylist = [lemma.lemmatize(word, pos = 'v') for word in mylist ]      # Lemmatize Verbs
    mytext = " ".join(mylist)
    return mytext

In [53]:
lemmatize_word('I saw a cat running faster than a dog. Beginning to stripes')

'I saw a cat run faster than a dog. Beginning to stripe'

In [54]:
data_lemma['text_lemma'] = data_lemma['text'].apply(lemmatize_word)
data_lemma.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,text_lemma
0,119237,105834,True,Wed Oct 11 06:55:44 +0000 2017,@AppleSupport causing the reply to be disregar...,119236.0,,@AppleSupport cause the reply to be disregard ...
1,119238,ChaseSupport,False,Wed Oct 11 13:25:49 +0000 2017,@105835 Your business means a lot to us. Pleas...,,119239.0,@105835 Your business mean a lot to us. Please...
2,119239,105835,True,Wed Oct 11 13:00:09 +0000 2017,@76328 I really hope you all change but I'm su...,119238.0,,@76328 I really hope you all change but I'm su...
3,119240,VirginTrains,False,Tue Oct 10 15:16:08 +0000 2017,@105836 LiveChat is online at the moment - htt...,119241.0,119242.0,@105836 LiveChat be online at the moment - htt...
4,119241,105836,True,Tue Oct 10 15:17:21 +0000 2017,@VirginTrains see attached error message. I've...,119243.0,119240.0,@VirginTrains see attach error message. I've t...


#### Using  Wordnet

In [55]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [56]:
lemmatizer = WordNetLemmatizer()

In [57]:
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}

def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    mylist = [lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text]
    return " ".join(mylist)

In [58]:
lemmatize_words('I saw a cat running faster than a dog. beginning to stripes')


'I saw a cat run fast than a dog. begin to strip'

In [59]:
data_lemma['lemma_text2'] = data_lemma['text'].apply(lemmatize_word)
data_lemma.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,text_lemma,lemma_text2
0,119237,105834,True,Wed Oct 11 06:55:44 +0000 2017,@AppleSupport causing the reply to be disregar...,119236.0,,@AppleSupport cause the reply to be disregard ...,@AppleSupport cause the reply to be disregard ...
1,119238,ChaseSupport,False,Wed Oct 11 13:25:49 +0000 2017,@105835 Your business means a lot to us. Pleas...,,119239.0,@105835 Your business mean a lot to us. Please...,@105835 Your business mean a lot to us. Please...
2,119239,105835,True,Wed Oct 11 13:00:09 +0000 2017,@76328 I really hope you all change but I'm su...,119238.0,,@76328 I really hope you all change but I'm su...,@76328 I really hope you all change but I'm su...
3,119240,VirginTrains,False,Tue Oct 10 15:16:08 +0000 2017,@105836 LiveChat is online at the moment - htt...,119241.0,119242.0,@105836 LiveChat be online at the moment - htt...,@105836 LiveChat be online at the moment - htt...
4,119241,105836,True,Tue Oct 10 15:17:21 +0000 2017,@VirginTrains see attached error message. I've...,119243.0,119240.0,@VirginTrains see attach error message. I've t...,@VirginTrains see attach error message. I've t...


### Removal of URLs

In [81]:
data_url = data.copy()

In [106]:
def rem_url(text):
    pattern = re.compile(r'https?://\S* | www\.\S+')
    return pattern.sub(r' ', text )

In [107]:
rem_url('http://gmail is a good site than www.yahoo.com ')

' is a good site than  '

In [108]:
text = "Driverless AI NLP blog post on https://www.h2o.ai/blog/detecting-sarcasm-is-difficult-but-ai-may-have-an-answer "
rem_url(text)

'Driverless AI NLP blog post on  '

In [109]:
data_url['url_text'] = data_url['text'].apply(rem_url)

In [110]:
data_url.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,url_text
0,119237,105834,True,Wed Oct 11 06:55:44 +0000 2017,@AppleSupport causing the reply to be disregar...,119236.0,,@AppleSupport causing the reply to be disregar...
1,119238,ChaseSupport,False,Wed Oct 11 13:25:49 +0000 2017,@105835 Your business means a lot to us. Pleas...,,119239.0,@105835 Your business means a lot to us. Pleas...
2,119239,105835,True,Wed Oct 11 13:00:09 +0000 2017,@76328 I really hope you all change but I'm su...,119238.0,,@76328 I really hope you all change but I'm su...
3,119240,VirginTrains,False,Tue Oct 10 15:16:08 +0000 2017,@105836 LiveChat is online at the moment - htt...,119241.0,119242.0,@105836 LiveChat is online at the moment - or...
4,119241,105836,True,Tue Oct 10 15:17:21 +0000 2017,@VirginTrains see attached error message. I've...,119243.0,119240.0,@VirginTrains see attached error message. I've...


### Removal of HTML Tags

In [111]:
data_htmltag = data.copy()

In [114]:
def rem_html(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r' ', text)

    
text = """<div>
<h1> H2O</h1>
<p> AutoML</p>
<a href="https://www.h2o.ai/products/h2o-driverless-ai/"> Driverless AI</a>
</div>"""

In [116]:
print(rem_html(text))

 
  H2O 
  AutoML 
  Driverless AI 
 


In [117]:
from bs4 import BeautifulSoup

In [118]:
def rem_tag(text):
    return BeautifulSoup(text, 'lxml').text

In [120]:
print(rem_tag(text))


 H2O
 AutoML
 Driverless AI



### Chat Word Conversion

In [136]:
chat_words_str = """
AFAIK=As Far As I Know
AFK=Away From Keyboard
ASAP=As Soon As Possible
ATK=At The Keyboard
ATM=At The Moment
A3=Anytime, Anywhere, Anyplace
BAK=Back At Keyboard
BBL=Be Back Later
BBS=Be Back Soon
BFN=Bye For Now
B4N=Bye For Now
BRB=Be Right Back
BRT=Be Right There
BTW=By The Way
B4=Before
B4N=Bye For Now
CU=See You
CUL8R=See You Later
CYA=See You
FAQ=Frequently Asked Questions
FC=Fingers Crossed
FWIW=For What It's Worth
FYI=For Your Information
GAL=Get A Life
GG=Good Game
GN=Good Night
GMTA=Great Minds Think Alike
GR8=Great!
G9=Genius
IC=I See
ICQ=I Seek you (also a chat program)
ILU=ILU: I Love You
IMHO=In My Honest/Humble Opinion
IMO=In My Opinion
IOW=In Other Words
IRL=In Real Life
KISS=Keep It Simple, Stupid
LDR=Long Distance Relationship
LMAO=Laugh My A.. Off
LOL=Laughing Out Loud
LTNS=Long Time No See
L8R=Later
MTE=My Thoughts Exactly
M8=Mate
NRN=No Reply Necessary
OIC=Oh I See
PITA=Pain In The A..
PRT=Party
PRW=Parents Are Watching
ROFL=Rolling On The Floor Laughing
ROFLOL=Rolling On The Floor Laughing Out Loud
ROTFLMAO=Rolling On The Floor Laughing My A.. Off
SK8=Skate
STATS=Your sex and age
ASL=Age, Sex, Location
THX=Thank You
THNX=Thanks
TTFN=Ta-Ta For Now!
TTYL=Talk To You Later
U=You
U2=You Too
U4E=Yours For Ever
WB=Welcome Back
WTF=What The F...
WTG=Way To Go!
WUF=Where Are You From?
W8=Wait...
7K=Sick:-D Laugher
"""

In [140]:
mylist = chat_words_str.split('\n')
mylist

['',
 'AFAIK=As Far As I Know',
 'AFK=Away From Keyboard',
 'ASAP=As Soon As Possible',
 'ATK=At The Keyboard',
 'ATM=At The Moment',
 'A3=Anytime, Anywhere, Anyplace',
 'BAK=Back At Keyboard',
 'BBL=Be Back Later',
 'BBS=Be Back Soon',
 'BFN=Bye For Now',
 'B4N=Bye For Now',
 'BRB=Be Right Back',
 'BRT=Be Right There',
 'BTW=By The Way',
 'B4=Before',
 'B4N=Bye For Now',
 'CU=See You',
 'CUL8R=See You Later',
 'CYA=See You',
 'FAQ=Frequently Asked Questions',
 'FC=Fingers Crossed',
 "FWIW=For What It's Worth",
 'FYI=For Your Information',
 'GAL=Get A Life',
 'GG=Good Game',
 'GN=Good Night',
 'GMTA=Great Minds Think Alike',
 'GR8=Great!',
 'G9=Genius',
 'IC=I See',
 'ICQ=I Seek you (also a chat program)',
 'ILU=ILU: I Love You',
 'IMHO=In My Honest/Humble Opinion',
 'IMO=In My Opinion',
 'IOW=In Other Words',
 'IRL=In Real Life',
 'KISS=Keep It Simple, Stupid',
 'LDR=Long Distance Relationship',
 'LMAO=Laugh My A.. Off',
 'LOL=Laughing Out Loud',
 'LTNS=Long Time No See',
 'L8R=Later'

In [141]:
mylist = mylist[1:-1]
mylist = [i.lower().split('=') for i in mylist]
mydict = {}
for j in mylist:
    mydict[j[0]] = j[1]
mydict

{'afaik': 'as far as i know',
 'afk': 'away from keyboard',
 'asap': 'as soon as possible',
 'atk': 'at the keyboard',
 'atm': 'at the moment',
 'a3': 'anytime, anywhere, anyplace',
 'bak': 'back at keyboard',
 'bbl': 'be back later',
 'bbs': 'be back soon',
 'bfn': 'bye for now',
 'b4n': 'bye for now',
 'brb': 'be right back',
 'brt': 'be right there',
 'btw': 'by the way',
 'b4': 'before',
 'cu': 'see you',
 'cul8r': 'see you later',
 'cya': 'see you',
 'faq': 'frequently asked questions',
 'fc': 'fingers crossed',
 'fwiw': "for what it's worth",
 'fyi': 'for your information',
 'gal': 'get a life',
 'gg': 'good game',
 'gn': 'good night',
 'gmta': 'great minds think alike',
 'gr8': 'great!',
 'g9': 'genius',
 'ic': 'i see',
 'icq': 'i seek you (also a chat program)',
 'ilu': 'ilu: i love you',
 'imho': 'in my honest/humble opinion',
 'imo': 'in my opinion',
 'iow': 'in other words',
 'irl': 'in real life',
 'kiss': 'keep it simple, stupid',
 'ldr': 'long distance relationship',
 'lm

In [142]:
list(mydict.keys())

['afaik',
 'afk',
 'asap',
 'atk',
 'atm',
 'a3',
 'bak',
 'bbl',
 'bbs',
 'bfn',
 'b4n',
 'brb',
 'brt',
 'btw',
 'b4',
 'cu',
 'cul8r',
 'cya',
 'faq',
 'fc',
 'fwiw',
 'fyi',
 'gal',
 'gg',
 'gn',
 'gmta',
 'gr8',
 'g9',
 'ic',
 'icq',
 'ilu',
 'imho',
 'imo',
 'iow',
 'irl',
 'kiss',
 'ldr',
 'lmao',
 'lol',
 'ltns',
 'l8r',
 'mte',
 'm8',
 'nrn',
 'oic',
 'pita',
 'prt',
 'prw',
 'rofl',
 'roflol',
 'rotflmao',
 'sk8',
 'stats',
 'asl',
 'thx',
 'thnx',
 'ttfn',
 'ttyl',
 'u',
 'u2',
 'u4e',
 'wb',
 'wtf',
 'wtg',
 'wuf',
 'w8',
 '7k']

In [150]:
def chat_word_conversion(text):
    mylist = text.split()
    mylist = [i.lower() for i in mylist]
    mylist = [  mydict[j]  if j in list(mydict.keys()) else j for j in mylist ]
    return ' '.join(mylist)

In [151]:
chat_word_conversion('I was happy and Lol')

'i was happy and laughing out loud'

In [154]:
chat_word_conversion("imo this is awesome, one minute BRB")

'in my opinion this is awesome, one minute be right back'

### Spelling Correction

In [155]:
from spellchecker import SpellChecker

In [156]:
spell = SpellChecker()

def correct_spellings(text):
    corrected_text = []
    misspelled_words = spell.unknown(text.split())
    
    for word in text.split():
        if word in misspelled_words:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
    
    return " ".join(corrected_text)
        
text = "speling correctin"
correct_spellings(text)

'spelling correction'

In [161]:
spell.unknown('hello bosss how ar you'.split())

{'bosss'}

In [163]:
spell.unknown(['Helo'])

{'helo'}

In [164]:
text = "thnks for readin the notebook"
correct_spellings(text)

'thanks for reading the notebook'

In [None]:
Thanks Kaggle and SRK The Great :-)