### TF-IDF

In [3]:
import pandas as pd
import numpy as np

In [4]:
msg=pd.read_csv("SMSSpamCollection", delimiter="\t",names=["label","message"])


In [5]:
msg.head(10)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [6]:
## Data Cleaning and Preprocessing
import nltk
import re
from nltk.corpus import stopwords


In [7]:
# changing the words to lower case
msg['message'] = msg['message'].str.lower()

In [8]:
msg.head()

Unnamed: 0,label,message
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."


In [9]:
# Removing Urls
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

In [10]:
msg['message'] = msg['message'].apply(remove_url)

In [11]:
# Removing HTML tags
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

In [12]:
msg['message'] = msg['message'].apply(remove_html_tags)

In [13]:
# Removing Punctuations
import string
punct = string.punctuation
def remove_punct(text):
    return text.translate(str.maketrans('', '', punct))

In [14]:
msg['message'] = msg['message'].apply(remove_punct)

In [15]:
# Removing Short Convo / Chat Words
chat_words = {
    "AFAIK": "As Far As I Know",
    "AFK": "Away From Keyboard",
    "ASAP": "As Soon As Possible",
    "ATK": "At The Keyboard",
    "ATM": "At The Moment",
    "A3": "Anytime, Anywhere, Anyplace",
    "BAK": "Back At Keyboard",
    "BBL": "Be Back Later",
    "BBS": "Be Back Soon",
    "BFN": "Bye For Now",
    "B4N": "Bye For Now",
    "BRB": "Be Right Back",
    "BRT": "Be Right There",
    "BTW": "By The Way",
    "B4": "Before",
    "B4N": "Bye For Now",
    "CU": "See You",
    "CUL8R": "See You Later",
    "CYA": "See You",
    "FAQ": "Frequently Asked Questions",
    "FC": "Fingers Crossed",
    "FWIW": "For What It's Worth",
    "FYI": "For Your Information",
    "GAL": "Get A Life",
    "GG": "Good Game",
    "GN": "Good Night",
    "GMTA": "Great Minds Think Alike",
    "GR8": "Great!",
    "G9": "Genius",
    "IC": "I See",
    "ICQ": "I Seek you (also a chat program)",
    "ILU": "ILU: I Love You",
    "IMHO": "In My Honest/Humble Opinion",
    "IMO": "In My Opinion",
    "IOW": "In Other Words",
    "IRL": "In Real Life",
    "KISS": "Keep It Simple, Stupid",
    "LDR": "Long Distance Relationship",
    "LMAO": "Laugh My A.. Off",
    "LOL": "Laughing Out Loud",
    "LTNS": "Long Time No See",
    "L8R": "Later",
    "MTE": "My Thoughts Exactly",
    "M8": "Mate",
    "NRN": "No Reply Necessary",
    "OIC": "Oh I See",
    "PITA": "Pain In The A..",
    "PRT": "Party",
    "PRW": "Parents Are Watching",
    "QPSA?": "Que Pasa?",
    "ROFL": "Rolling On The Floor Laughing",
    "ROFLOL": "Rolling On The Floor Laughing Out Loud",
    "ROTFLMAO": "Rolling On The Floor Laughing My A.. Off",
    "SK8": "Skate",
    "STATS": "Your sex and age",
    "ASL": "Age, Sex, Location",
    "THX": "Thank You",
    "TTFN": "Ta-Ta For Now!",
    "TTYL": "Talk To You Later",
    "U": "You",
    "U2": "You Too",
    "U4E": "Yours For Ever",
    "WB": "Welcome Back",
    "WTF": "What The F...",
    "WTG": "Way To Go!",
    "WUF": "Where Are You From?",
    "W8": "Wait...",
    "7K": "Sick:-D Laugher",
    "TFW": "That feeling when",
    "MFW": "My face when",
    "MRW": "My reaction when",
    "IFYP": "I feel your pain",
    "TNTL": "Trying not to laugh",
    "JK": "Just kidding",
    "IDC": "I don't care",
    "ILY": "I love you",
    "IMU": "I miss you",
    "ADIH": "Another day in hell",
    "ZZZ": "Sleeping, bored, tired",
    "WYWH": "Wish you were here",
    "TIME": "Tears in my eyes",
    "BAE": "Before anyone else",
    "FIMH": "Forever in my heart",
    "BSAAW": "Big smile and a wink",
    "BWL": "Bursting with laughter",
    "BFF": "Best friends forever",
    "CSL": "Can't stop laughing"
}

In [16]:
def chat_conversion(text):
    new_text = []
    for i in text.split():
        if i.upper() in chat_words:
            new_text.append(chat_words[i.upper()])
        else:
            new_text.append(i)
    return " ".join(new_text)

In [17]:
msg['message'] = msg['message'].apply(chat_conversion)

In [18]:
# Spelling Correction
from textblob import TextBlob
def correct_spell(text):
    return str(TextBlob(text).correct())



In [None]:
msg['message'] = msg['message'].apply(correct_spell)

KeyboardInterrupt: 

In [19]:
# Handling Stop Words
from nltk.corpus import stopwords
stopword = set(stopwords.words('english'))

def remove_stopwords(text):
    new_text = []
    
    for word in text.split():
        if word in stopword:
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [20]:
msg["message"] = msg["message"].apply(remove_stopwords)

In [21]:
# Remove Emojis	
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [22]:
msg['message'] = msg['message'].apply(remove_emoji)

In [23]:
msg.head()

Unnamed: 0,label,message
0,ham,go jurong point crazy available bugis n gre...
1,ham,ok lar joking wif You oni
2,spam,free entry 2 wkly comp win fa cup final tkt...
3,ham,You dun say early hor You c already say
4,ham,nah dont think goes usf lives around though


In [25]:
# Lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [26]:
corpus = []
# just apply stemming i have already done text preprocessing
for i in range(0, len(msg)):
    # Keep only alphabetic characters and split into words
    review = re.sub('[^a-zA-Z]', ' ', msg['message'][i])
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if word not in stopword]
    review = ' '.join(review)
    corpus.append(review)

In [27]:
corpus

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif You oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry questionstd txt ratetcs apply',
 'You dun say early hor You c already say',
 'nah dont think go usf life around though',
 'freemsg hey darling week word back id like fun still tb ok xxx std chgs send rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune',
 'winner valued network customer selected receivea prize reward claim call claim code kl valid hour',
 'mobile month You r entitled update latest colour mobile camera free call mobile update co free',
 'im gonna home soon dont want talk stuff anymore tonight k ive cried enough today',
 'six chance win cash pound txt csh send cost pday day tsandcs apply reply hl info',
 'urgent week free membership prize jackpot txt word claim tc lccltd

In [55]:
# TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=100,ngram_range=(2,2))


In [67]:
mt=tfidf.fit_transform(corpus).toarray()
# Creating a DataFrame
df = pd.DataFrame(mt)


In [60]:
tfidf.vocabulary_

{'free entry': 26,
 'dont think': 21,
 'claim call': 13,
 'call claim': 3,
 'free call': 25,
 'im gonna': 42,
 'chance win': 12,
 'lor you': 52,
 'laughing out': 48,
 'out loud': 59,
 'let know': 49,
 'please call': 61,
 'you dont': 92,
 'dont know': 20,
 'want go': 87,
 'tears eye': 78,
 'like ltgt': 50,
 'sorry ill': 75,
 'ill call': 40,
 'call later': 8,
 'ur awarded': 82,
 'im home': 43,
 'you call': 91,
 'hi hi': 37,
 'call customer': 4,
 'customer service': 18,
 'po box': 63,
 'trying contact': 80,
 'draw show': 22,
 'prize guaranteed': 67,
 'guaranteed call': 33,
 'valid hr': 85,
 'selected receive': 70,
 'private account': 65,
 'account statement': 0,
 'statement show': 76,
 'show unredeemed': 73,
 'call identifier': 5,
 'identifier code': 39,
 'code expires': 15,
 'urgent mobile': 84,
 'contact you': 17,
 'call landline': 7,
 'wat tears': 88,
 'you know': 94,
 'ur mob': 83,
 'you get': 93,
 'gud ni': 35,
 'new year': 56,
 'send stop': 72,
 'you still': 95,
 'great day': 32,
 '

In [68]:
df.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.709826,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
