In [1]:
import nltk

In [None]:
#nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


# Tokenization Example

In [3]:
from nltk.tokenize import word_tokenize
input = "I am starting with NLP learning"
print(word_tokenize(input))

['I', 'am', 'starting', 'with', 'NLP', 'learning']


# Reading Text

In [4]:
#Can be done with open("").read()
import pandas as pd

data = pd.read_csv("SMSSpamCollection", sep='\t', header=None)
data.columns = ['Label','SMS']
data.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Exploring the Dataset

In [5]:
data.describe()

Unnamed: 0,Label,SMS
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [6]:
print("Size of data is - ",len(data))
print("Size of spam = ",len(data[data['Label']=='spam']))
print("Size of ham = ",len(data[data['Label']=='ham']))

Size of data is -  5572
Size of spam =  747
Size of ham =  4825


In [7]:
print("Nulls in label = ",data['Label'].isnull().sum())
print("Nulls in SMS = ",data['SMS'].isnull().sum())

Nulls in label =  0
Nulls in SMS =  0


# Removing Punctuation

In [8]:
pd.set_option('display.max_colwidth',100)
data.head()

Unnamed: 0,Label,SMS
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [9]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
def removePunc(sentence):
    return "".join([s for s in sentence if s not in string.punctuation])

In [11]:
data['SMS_Clean'] = data['SMS'].apply(lambda x:removePunc(x))
data.head()

Unnamed: 0,Label,SMS,SMS_Clean
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though


# Tokenization

In [12]:
import re

def tokenize(text):
    return re.split('\W+', text.lower())

In [13]:
data['SMS_Tokenized'] = data['SMS_Clean'].apply(lambda x:tokenize(x))
data.head()

Unnamed: 0,Label,SMS,SMS_Clean,SMS_Tokenized
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, ci..."
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]"


# Remove Stop Words

In [14]:
stopwords = nltk.corpus.stopwords.words('english')
stopwords[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [15]:
def remove_stopwords(tockenized_text):
    return [word for word in tockenized_text if word not in stopwords]

In [16]:
data['SMS_NoStopWords'] = data['SMS_Tokenized'].apply(lambda x : remove_stopwords(x))
data.head(5)

Unnamed: 0,Label,SMS,SMS_Clean,SMS_Tokenized,SMS_NoStopWords
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go until jurong point crazy Available only in bugis n great world la e buffet Cine there got amo...,"[go, until, jurong, point, crazy, available, only, in, bugis, n, great, world, la, e, buffet, ci...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]"
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, joking, wif, u, oni]","[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005 Text FA to 87121 to receive e...,"[free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv..."
3,ham,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, then, say]","[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think he goes to usf he lives around here though,"[nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]","[nah, dont, think, goes, usf, lives, around, though]"


# Stemming

In [17]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [18]:
def stemWord(tokenized_text):
    return [ps.stem(word) for word in tokenized_text]

In [19]:
data.drop(columns=['SMS_Clean','SMS_Tokenized'], inplace=True)

In [20]:
data['SMS_Stem']=data['SMS_NoStopWords'].apply(lambda x:stemWord(x))
data.head(5)

Unnamed: 0,Label,SMS,SMS_NoStopWords,SMS_Stem
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]","[go, jurong, point, crazi, avail, bugi, n, great, world, la, e, buffet, cine, got, amor, wat]"
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]"


# Lemmatization

In [21]:
wn = nltk.WordNetLemmatizer()

In [22]:
def lemmatization(tockenized_text):
    return [wn.lemmatize(word) for word in tockenized_text]

In [23]:
data['SMS_Lemma'] = data['SMS_NoStopWords'].apply(lambda x:lemmatization(x))
data.tail(5)

Unnamed: 0,Label,SMS,SMS_NoStopWords,SMS_Stem,SMS_Lemma
5567,spam,This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy...,"[2nd, time, tried, 2, contact, u, u, 750, pound, prize, 2, claim, easy, call, 087187272008, now1...","[2nd, time, tri, 2, contact, u, u, 750, pound, prize, 2, claim, easi, call, 087187272008, now1, ...","[2nd, time, tried, 2, contact, u, u, 750, pound, prize, 2, claim, easy, call, 087187272008, now1..."
5568,ham,Will ü b going to esplanade fr home?,"[ü, b, going, esplanade, fr, home]","[ü, b, go, esplanad, fr, home]","[ü, b, going, esplanade, fr, home]"
5569,ham,"Pity, * was in mood for that. So...any other suggestions?","[pity, mood, soany, suggestions]","[piti, mood, soani, suggest]","[pity, mood, soany, suggestion]"
5570,ham,The guy did some bitching but I acted like i'd be interested in buying something else next week ...,"[guy, bitching, acted, like, id, interested, buying, something, else, next, week, gave, us, free]","[guy, bitch, act, like, id, interest, buy, someth, els, next, week, gave, us, free]","[guy, bitching, acted, like, id, interested, buying, something, else, next, week, gave, u, free]"
5571,ham,Rofl. Its true to its name,"[rofl, true, name]","[rofl, true, name]","[rofl, true, name]"


# Count Vectorization

In [24]:
#This is just an exampe to understand CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

corpus = ["This is a sentence is",
         "This is another sentence",
         "Third document is here"]

# Learns a vocabulary of all tokens in raw document
X = cv.fit(corpus)
print("Vocabulary word dict from raw--> ",X.vocabulary_)
print("Feature Names-->",X.get_feature_names())

Vocabulary word dict from raw-->  {'this': 6, 'is': 3, 'sentence': 4, 'another': 0, 'third': 5, 'document': 1, 'here': 2}
Feature Names--> ['another', 'document', 'here', 'is', 'sentence', 'third', 'this']


In [25]:
#Transforms documents to document-term-matrix
X = cv.transform(corpus)
#Learns a vocabulary of all tokens in raw document and Transforms documents to document-term-matrix
#X = cv.fit_transform()

print("Shape--> ",X.shape)
print("---------------------------------")
print("Document term Matrix-->\n",X.toarray())
print("---------------------------------")
print("Sparse Matrix of shape (n_samples, n_features)-->\n",X)

Shape-->  (3, 7)
---------------------------------
Document term Matrix-->
 [[0 0 0 2 1 0 1]
 [1 0 0 1 1 0 1]
 [0 1 1 1 0 1 0]]
---------------------------------
Sparse Matrix of shape (n_samples, n_features)-->
   (0, 3)	2
  (0, 4)	1
  (0, 6)	1
  (1, 0)	1
  (1, 3)	1
  (1, 4)	1
  (1, 6)	1
  (2, 1)	1
  (2, 2)	1
  (2, 3)	1
  (2, 5)	1


# Cleaning Text

In [28]:
def clean_text(txt):
    noPuncTxt = "".join([s for s in txt if s not in string.punctuation])
    splitTxt = re.split('\W+', noPuncTxt.lower())
    noStpWord = [word for word in splitTxt if word not in stopwords]
    return noStpWord

# Count Vectorizer of SMS Spam Dataset

In [29]:
vect = CountVectorizer(analyzer=clean_text)
X = vect.fit_transform(data['SMS'])
print("Shape--> ",X.shape)

Shape-->  (5572, 9446)


In [32]:
df = pd.DataFrame(X.toarray(),columns=vect.get_feature_names())

In [33]:
df.describe()

Unnamed: 0,Unnamed: 1,0,008704050406,0089my,0121,01223585236,01223585334,0125698789,02,020603,...,zindgi,zoe,zogtorius,zoom,zouk,zyada,é,ü,üll,〨ud
count,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,...,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0,5572.0
mean,0.075915,0.000538,0.000359,0.000179,0.000179,0.000179,0.000359,0.000179,0.000179,0.000718,...,0.000179,0.000359,0.000179,0.000179,0.000179,0.000179,0.000538,0.03033,0.000538,0.000179
std,0.270253,0.023199,0.018944,0.013397,0.013397,0.013397,0.018944,0.013397,0.013397,0.026786,...,0.013397,0.018944,0.013397,0.013397,0.013397,0.013397,0.04019,0.214312,0.023199,0.013397
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,3.0,4.0,1.0,1.0


In [34]:
df.head()

Unnamed: 0,Unnamed: 1,0,008704050406,0089my,0121,01223585236,01223585334,0125698789,02,020603,...,zindgi,zoe,zogtorius,zoom,zouk,zyada,é,ü,üll,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# N-Grams Vectorization

In [42]:
#This is just an exampe to understand NgramsVectorizer
from sklearn.feature_extraction.text import CountVectorizer
#(2,2) --> will give only bigrams, (2,4) --> Will look for all bi, tri and 4-grams
cv2 = CountVectorizer(ngram_range=(2,2))

corpus = ["This is a sentence is",
         "This is another sentence",
         "Third document is here"]

# Learns a vocabulary of all tokens in raw document
X2 = cv2.fit_transform(corpus)
print("Vocabulary word dict from raw--> ",cv2.vocabulary_)
print("Feature Names-->",cv2.get_feature_names())

Vocabulary word dict from raw-->  {'this is': 7, 'is sentence': 4, 'sentence is': 5, 'is another': 2, 'another sentence': 0, 'third document': 6, 'document is': 1, 'is here': 3}
Feature Names--> ['another sentence', 'document is', 'is another', 'is here', 'is sentence', 'sentence is', 'third document', 'this is']


In [43]:
print("Shape--> ",X2.shape)
print("---------------------------------")
print("Document term Matrix-->\n",X2.toarray())
print("---------------------------------")
print("Sparse Matrix of shape (n_samples, n_features)-->\n",X2)

Shape-->  (3, 8)
---------------------------------
Document term Matrix-->
 [[0 0 0 0 1 1 0 1]
 [1 0 1 0 0 0 0 1]
 [0 1 0 1 0 0 1 0]]
---------------------------------
Sparse Matrix of shape (n_samples, n_features)-->
   (0, 7)	1
  (0, 4)	1
  (0, 5)	1
  (1, 7)	1
  (1, 2)	1
  (1, 0)	1
  (2, 6)	1
  (2, 1)	1
  (2, 3)	1


In [38]:
#This requires complete sentences to work with, hence there would be sloght modification in the cleaning function
#Instead of getting a list of all words we would be preserving the sentence with a join in stopwords
def cleaning_text(txt):
    noPuncTxt = "".join([s for s in txt if s not in string.punctuation])
    splitTxt = re.split('\W+', noPuncTxt.lower())
    noStpWord = " ".join([word for word in splitTxt if word not in stopwords])
    return noStpWord

data['SMS_Clean'] = data['SMS'].apply(lambda x: cleaning_text(x))
data.head()

Unnamed: 0,Label,SMS,SMS_NoStopWords,SMS_Stem,SMS_Lemma,SMS_Clean
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]","[go, jurong, point, crazi, avail, bugi, n, great, world, la, e, buffet, cine, got, amor, wat]","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]",go jurong point crazy available bugis n great world la e buffet cine got amore wat
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]","[ok, lar, joking, wif, u, oni]",ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...",free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry questions...
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]","[u, dun, say, early, hor, u, c, already, say]",u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]","[nah, dont, think, go, usf, life, around, though]",nah dont think goes usf lives around though


# NGrams Vectorizer of SMS Spam Dataset

In [46]:
data_sample = data[0:10]
data_sample.head()

Unnamed: 0,Label,SMS,SMS_NoStopWords,SMS_Stem,SMS_Lemma,SMS_Clean
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]","[go, jurong, point, crazi, avail, bugi, n, great, world, la, e, buffet, cine, got, amor, wat]","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]",go jurong point crazy available bugis n great world la e buffet cine got amore wat
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]","[ok, lar, joking, wif, u, oni]",ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...",free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry questions...
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]","[u, dun, say, early, hor, u, c, already, say]",u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]","[nah, dont, think, go, usf, life, around, though]",nah dont think goes usf lives around though


In [47]:
vect2 = CountVectorizer(ngram_range=(2,2))
X3 = vect2.fit_transform(data_sample['SMS_Clean'])
print("Shape--> ",X3.shape)

Shape-->  (10, 120)


In [48]:
df = pd.DataFrame(X3.toarray(),columns=vect2.get_feature_names())
df.head()

Unnamed: 0,09061701461 claim,11 months,12 hours,150 rcv,2005 text,21st may,87121 receive,900 prize,aids patent,already say,...,valued network,vettam set,weeks word,wif oni,win fa,winner valued,wkly comp,word back,world la,xxx std
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,1,1,1,0,0,0,...,0,0,0,0,1,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# TF-IDF Vectorizer

In [50]:
#This is just an exampe to understand TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
cv3 = TfidfVectorizer()

corpus = ["This is a sentence is",
         "This is another sentence",
         "Third document is here"]

# Learns a vocabulary of all tokens in raw document
X3 = cv3.fit_transform(corpus)
print("Vocabulary word dict from raw--> ",cv3.vocabulary_)
print("Feature Names-->",cv3.get_feature_names())

Vocabulary word dict from raw-->  {'this': 6, 'is': 3, 'sentence': 4, 'another': 0, 'third': 5, 'document': 1, 'here': 2}
Feature Names--> ['another', 'document', 'here', 'is', 'sentence', 'third', 'this']


In [51]:
print("Shape--> ",X3.shape)
print("---------------------------------")
print("Document term Matrix-->\n",X3.toarray())
print("---------------------------------")
print("Sparse Matrix of shape (n_samples, n_features)-->\n",X3)

Shape-->  (3, 7)
---------------------------------
Document term Matrix-->
 [[0.         0.         0.         0.73941068 0.47606294 0.
  0.47606294]
 [0.63174505 0.         0.         0.37311881 0.4804584  0.
  0.4804584 ]
 [0.         0.54645401 0.54645401 0.32274454 0.         0.54645401
  0.        ]]
---------------------------------
Sparse Matrix of shape (n_samples, n_features)-->
   (0, 4)	0.47606293927679294
  (0, 3)	0.7394106813498715
  (0, 6)	0.47606293927679294
  (1, 0)	0.6317450542765208
  (1, 4)	0.4804583972923858
  (1, 3)	0.3731188059313277
  (1, 6)	0.4804583972923858
  (2, 2)	0.546454011634009
  (2, 1)	0.546454011634009
  (2, 5)	0.546454011634009
  (2, 3)	0.3227445421804912


In [52]:
sampledf = pd.DataFrame(X3.toarray(),columns=cv3.get_feature_names())
print(sampledf)

    another  document      here        is  sentence     third      this
0  0.000000  0.000000  0.000000  0.739411  0.476063  0.000000  0.476063
1  0.631745  0.000000  0.000000  0.373119  0.480458  0.000000  0.480458
2  0.000000  0.546454  0.546454  0.322745  0.000000  0.546454  0.000000


# TF-IDF Vectorizer of SMS Spam Dataset

In [53]:
def cleaning_texttfidf(txt):
    noPuncTxt = "".join([s for s in txt if s not in string.punctuation])
    splitTxt = re.split('\W+', noPuncTxt.lower())
    noStpWord = [word for word in splitTxt if word not in stopwords]
    return noStpWord

In [54]:
data_sample = data[0:10]
data_sample.head()

Unnamed: 0,Label,SMS,SMS_NoStopWords,SMS_Stem,SMS_Lemma,SMS_Clean
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]","[go, jurong, point, crazi, avail, bugi, n, great, world, la, e, buffet, cine, got, amor, wat]","[go, jurong, point, crazy, available, bugis, n, great, world, la, e, buffet, cine, got, amore, wat]",go jurong point crazy available bugis n great world la e buffet cine got amore wat
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]","[ok, lar, joke, wif, u, oni]","[ok, lar, joking, wif, u, oni]",ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...","[free, entri, 2, wkli, comp, win, fa, cup, final, tkt, 21st, may, 2005, text, fa, 87121, receiv,...","[free, entry, 2, wkly, comp, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, 87121, receiv...",free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry questions...
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]","[u, dun, say, earli, hor, u, c, alreadi, say]","[u, dun, say, early, hor, u, c, already, say]",u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, dont, think, goes, usf, lives, around, though]","[nah, dont, think, goe, usf, live, around, though]","[nah, dont, think, go, usf, life, around, though]",nah dont think goes usf lives around though


In [55]:
tfidf = TfidfVectorizer(analyzer=cleaning_texttfidf)
tf = tfidf.fit_transform(data_sample['SMS'])
print("Shape--> ",tf.shape)

Shape-->  (10, 124)


In [56]:
df = pd.DataFrame(tf.toarray(),columns=tfidf.get_feature_names())
df.head()

Unnamed: 0,08002986030,08452810075over18s,09061701461,11,12,150,2,2005,21st,3,...,vettam,wat,weeks,wif,win,winner,wkly,word,world,xxx
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.435368,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.193446,0.0,0.0,0.0,0.0,0.193446,0.193446,0.193446,0.0,...,0.0,0.0,0.0,0.0,0.193446,0.0,0.193446,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Expand contractions

In [16]:
import contractions

text = '''She'd like to know how I'd done that! 
          She's going to the park and I don't think I'll be home for dinner.
          Theyre going to the zoo and she'll be home for dinner.'''
  
contractions.fix(text)

ModuleNotFoundError: No module named 'contractions'

# # Remove Emails

In [2]:
import re

sampleText="This is an email id hemanth@gmail.com, akndg@yahoo.com"

output = re.findall(r"([a-zA-Z0-9+._-]+@[a-zA-Z0-9+._-]+\.[a-zA-Z0-9+_-]+)",sampleText)
print(output)

#Substitute with another value then use re.sub(r"([a-zA-Z0-9+._-]+@[a-zA-Z0-9+._-]+\.[a-zA-Z0-9+_-]+)","", sampleText)

['hemanth@gmail.com', 'akndg@yahoo.com']


# Remove URLs

In [4]:
urltext = "this is a url http://jhjahdf.com/asdg  and another url https://sdf.aafa.in/asdfg/cas"

urls = re.findall(r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?",urltext)
print(urls)

[('http', 'jhjahdf.com', '/asdg'), ('https', 'sdf.aafa.in', '/asdfg/cas')]


In [6]:
text = '<p>Contents :</p><a href="https://w3resource.com/asdgsdag/asdsd">Python Examples</a><a href="http://github.com">Even More Examples</a>'
urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', urltext)
print("Original string: ",text)
print("Urls: ",urls)

Original string:  <p>Contents :</p><a href="https://w3resource.com/asdgsdag/asdsd">Python Examples</a><a href="http://github.com">Even More Examples</a>
Urls:  ['http://jhjahdf.com/asdg', 'https://sdf.aafa.in/asdfg/cas']


# Remove html tags

In [1]:
htmltag = """<html>
<body>
<br/><br>
<p>This text is normal.</p>

<p><em>This text is emphasized.</em></p>

</body>
</html>
"""

In [2]:
from bs4 import BeautifulSoup
BeautifulSoup(htmltag,"lxml").get_text()

'\n\n\nThis text is normal.\nThis text is emphasized.\n\n\n'

In [20]:
corpus = ["This is a sentence is",
         "This is another sentence",
         "Third document is here"]
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(1,1))
words = cv.fit_transform(corpus)

In [21]:
words.toarray()

array([[0, 0, 0, 2, 1, 0, 1],
       [1, 0, 0, 1, 1, 0, 1],
       [0, 1, 1, 1, 0, 1, 0]], dtype=int64)

In [22]:
import pandas as pd

bow = pd.DataFrame(words.toarray(), columns=cv.get_feature_names())

In [23]:
bow

Unnamed: 0,another,document,here,is,sentence,third,this
0,0,0,0,2,1,0,1
1,1,0,0,1,1,0,1
2,0,1,1,1,0,1,0


In [59]:
tf = bow.copy()

In [60]:
for index, row in enumerate(tf.iterrows()):
    for col in row[1].index:
        tf.loc[index,col] = tf.loc[index,col]/sum(row[1].values)

In [61]:
tf

Unnamed: 0,another,document,here,is,sentence,third,this
0,0.0,0.0,0.0,0.5,0.25,0.0,0.25
1,0.25,0.0,0.0,0.25,0.25,0.0,0.25
2,0.0,0.25,0.25,0.25,0.0,0.25,0.0
