## Bag of Words

In [26]:
import pandas as pd
import numpy as np

In [27]:
msg=pd.read_csv("SMSSpamCollection", delimiter="\t",names=["label","message"])


In [28]:
msg.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [29]:
## Data Cleaning and Preprocessing
import nltk
import re
from nltk.corpus import stopwords


In [30]:
# changing the words to lower case
msg['message'] = msg['message'].str.lower()

In [14]:
msg.head()

Unnamed: 0,label,message
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."


In [15]:
# Removing Urls
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

In [16]:
msg['message'] = msg['message'].apply(remove_url)

In [17]:
# Removing HTML tags
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

In [18]:
msg['message'] = msg['message'].apply(remove_html_tags)

In [19]:
# Removing Punctuations
import string
punct = string.punctuation
def remove_punct(text):
    return text.translate(str.maketrans('', '', punct))

In [20]:
msg['message'] = msg['message'].apply(remove_punct)

In [21]:
# Removing Short Convo / Chat Words
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "B4N": "Bye For Now",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "ILU: I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My A.. Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don't care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Can't stop laughing"
}

In [22]:
def chat_conversion(text):
    new_text = []
    for i in text.split():
        if i.upper() in chat_words:
            new_text.append(chat_words[i.upper()])
        else:
            new_text.append(i)
    return " ".join(new_text)

In [23]:
msg['message'] = msg['message'].apply(chat_conversion)

In [24]:
# Spelling Correction
from textblob import TextBlob
def correct_spell(text):
    return str(TextBlob(text).correct())



In [25]:
msg['message'] = msg['message'].apply(correct_spell)

KeyboardInterrupt: 

In [37]:
# Handling Stop Words
from nltk.corpus import stopwords
stopword = set(stopwords.words('english'))

def remove_stopwords(text):
    new_text = []
    
    for word in text.split():
        if word in stopword:
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [38]:
msg["message"] = msg["message"].apply(remove_stopwords)

In [39]:
# Remove Emojis	
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [40]:
msg['message'] = msg['message'].apply(remove_emoji)

In [41]:
msg.head()

Unnamed: 0,label,message
0,ham,"go jurong point, crazy.. available bugis n ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry 2 wkly comp win fa cup final tkt...
3,ham,u dun say early hor... u c already say...
4,ham,"nah think goes usf, lives around though"


In [42]:
# Stemming
from nltk.stem import PorterStemmer
ps = PorterStemmer()


In [43]:
corpus = []
# just apply stemming i have already done text preprocessing
for i in range(0, len(msg)):
    # Keep only alphabetic characters and split into words
    review = re.sub('[^a-zA-Z]', ' ', msg['message'][i])
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopword]
    review = ' '.join(review)
    corpus.append(review)

In [44]:
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl week word back like fun still tb ok xxx std chg send rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun',
 'winner valu network custom select receivea prize reward claim call claim code kl valid hour',
 'mobil month u r entitl updat latest colour mobil camera free call mobil updat co free',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash pound txt csh send cost p day day tsandc appli repli hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw',
 'search right word thank breather

In [45]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500,binary=True)


In [46]:
X = cv.fit_transform(corpus).toarray()

In [47]:
# display all values of X in form of a table like bag of words
pd.DataFrame(X)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2490,2491,2492,2493,2494,2495,2496,2497,2498,2499
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
X.shape

(5572, 2500)

### N-grams

In [61]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=500,binary=True,ngram_range=(2,2))
X = cv.fit_transform(corpus).toarray()

In [62]:
cv.vocabulary_


{'free entri': 122,
 'rate appli': 330,
 'per request': 301,
 'claim call': 53,
 'call claim': 23,
 'claim code': 54,
 'entitl updat': 108,
 'updat latest': 435,
 'latest colour': 209,
 'free call': 120,
 'call mobil': 31,
 'mobil updat': 260,
 'chanc win': 52,
 'win cash': 481,
 'repli hl': 339,
 'hl info': 183,
 'txt word': 430,
 'dont miss': 98,
 'let know': 215,
 'feel like': 113,
 'mobil charg': 255,
 'repli ye': 341,
 'go home': 145,
 'anyth lor': 3,
 'call repli': 35,
 'nokia mobil': 280,
 'mobil free': 257,
 'free camcord': 121,
 'pleas call': 312,
 'deliveri tomorrow': 92,
 'lt gt': 231,
 'miss call': 252,
 'want go': 464,
 'first time': 118,
 'like lt': 219,
 'sm ac': 373,
 'bx ip': 20,
 'sorri call': 376,
 'call later': 29,
 'later meet': 206,
 'ok come': 289,
 'award bonu': 9,
 'prize call': 326,
 'ur award': 436,
 'hi babe': 180,
 'call free': 25,
 'that cool': 407,
 'hi hi': 182,
 'wen ur': 478,
 'call custom': 24,
 'custom servic': 84,
 'servic repres': 363,
 'repres pm'