# NLP basics

In [1]:
import nltk
# nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rohit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [101]:
import zipfile
zipfile.ZipFile('smsspamcollection.zip', 'r').extractall()

In [2]:
para = "A paragraph is a series of related sentences developing a central idea, called the topic. Try to think about paragraphs in terms of thematic unity: a paragraph is a sentence or a group of sentences that supports one central, unified idea. Paragraphs add one idea at a time to your broader argument."

In [3]:
sentences = nltk.sent_tokenize(para)

In [4]:
words = nltk.word_tokenize(para)

In [5]:
print(sentences)
print(len(sentences))

['A paragraph is a series of related sentences developing a central idea, called the topic.', 'Try to think about paragraphs in terms of thematic unity: a paragraph is a sentence or a group of sentences that supports one central, unified idea.', 'Paragraphs add one idea at a time to your broader argument.']
3


In [6]:
print(words)
print(len(words))

['A', 'paragraph', 'is', 'a', 'series', 'of', 'related', 'sentences', 'developing', 'a', 'central', 'idea', ',', 'called', 'the', 'topic', '.', 'Try', 'to', 'think', 'about', 'paragraphs', 'in', 'terms', 'of', 'thematic', 'unity', ':', 'a', 'paragraph', 'is', 'a', 'sentence', 'or', 'a', 'group', 'of', 'sentences', 'that', 'supports', 'one', 'central', ',', 'unified', 'idea', '.', 'Paragraphs', 'add', 'one', 'idea', 'at', 'a', 'time', 'to', 'your', 'broader', 'argument', '.']
58


# stemming and lematisation

## Stemming: 
### => [finally, final, finalized] == fina (May noy have meaning, just has the stem word, doesn't take much time)

## Lemmatization:
### => [finally, final, finalized] == final (will have some meaning, takes time, used for getting meaningful output in chatbots)

In [8]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [16]:
stemmed_sentences = []
stemmer = PorterStemmer()
for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    words = [stemmer.stem(word) for word in words if word not in set(stopwords.words('english'))]
    stemmed_sentences.append(' '.join(words))

### stopwords.words('language') gives us the list of words that doesn't further add any meaning

In [13]:
print(stemmed_sentences)

['a paragraph seri relat sentenc develop central idea , call topic .', 'tri think paragraph term themat uniti : paragraph sentenc group sentenc support one central , unifi idea .', 'paragraph add one idea time broader argument .']


In [25]:
from nltk.stem import WordNetLemmatizer
lematiser = WordNetLemmatizer()
from nltk.corpus import stopwords

for i in range(len(sentences)):
    words=nltk.word_tokenize(sentences[i])
    words=[lematiser.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    sentences[i] = ' '.join(words)

In [24]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rohit\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [26]:
sentences

['A paragraph series related sentence developing central idea , called topic .',
 'Try think paragraph term thematic unity : paragraph sentence group sentence support one central , unified idea .',
 'Paragraphs add one idea time broader argument .']

In [27]:
lemmatized_sent = ' '.join(sentences)

In [28]:
lemmatized_sent

'A paragraph series related sentence developing central idea , called topic . Try think paragraph term thematic unity : paragraph sentence group sentence support one central , unified idea . Paragraphs add one idea time broader argument .'

In [29]:
wordx = nltk.word_tokenize(lemmatized_sent)

In [31]:
len(wordx)

38

## Bag of words
### This method gives out list of words and their frequencies, doesn't give importance to main words, all the words have the same weightage. Can't use this for sentiment analysis

In [43]:
# cleaning the text

import re # regular expression
from nltk.stem import WordNetLemmatizer
lemmatizer= WordNetLemmatizer()
sentences = nltk.sent_tokenize(para)
for i in range(len(sentences)):
    sentence = re.sub('[^a-zA-Z]', ' ', sentences[i])
    words=sentence.split()
    words=[word for word in words if word not in set(stopwords.words('english'))]
    words=[lemmatizer.lemmatize(word.lower()) for word in words]
    sentences[i]=' '.join(words)

In [39]:
print(sentences)

['a paragraph series related sentences developing central idea called topic', 'try think paragraphs terms thematic unity paragraph sentence group sentences supports one central unified idea', 'paragraphs add one idea time broader argument']


In [44]:
sentences

['a paragraph series related sentence developing central idea called topic',
 'try think paragraph term thematic unity paragraph sentence group sentence support one central unified idea',
 'paragraph add one idea time broader argument']

In [47]:
# creating bag of words
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
cntv = CountVectorizer()
bow = cntv.fit_transform(sentences)

In [49]:
type(bow)

scipy.sparse.csr.csr_matrix

In [50]:
bow = bow.toarray()

In [51]:
type(bow)

numpy.ndarray

In [52]:
bow

array([[0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 1, 1, 1, 2, 0, 2, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1],
       [1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]],
      dtype=int64)

# Term Frequency & Inverse Document Frequency

### Term freq = (#times the word repeats)/(#of words in sentence) <-- word level in a sentence
### Inverse Document Freq = log[(#of sentences)/(#of sentences containing that word)] <--sentence level in a doc

finally we multiply the both

In [97]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
corpus=[]
Words=[]
sentences=nltk.sent_tokenize(para)
for sentence in sentences:
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)
    print(sentence)
    words= nltk.word_tokenize(sentence)
    words=[word for word in words if word not in set(stopwords.words('english'))]
    words=[lemmatizer.lemmatize(word.lower()) for word in words if word not in set(stopwords.words()) ]
    Words.append(word for word in words)
    corpus.append(" ".join(words))

corpus = tfidf.fit_transform(corpus)

A paragraph is a series of related sentences developing a central idea  called the topic 
Try to think about paragraphs in terms of thematic unity  a paragraph is a sentence or a group of sentences that supports one central  unified idea 
Paragraphs add one idea at a time to your broader argument 


In [98]:
corpus = corpus.toarray()

In [99]:
print(corpus)

[[0.         0.         0.         0.38195621 0.29048754 0.38195621
  0.         0.22558949 0.22558949 0.38195621 0.29048754 0.38195621
  0.         0.         0.         0.         0.         0.38195621
  0.         0.         0.        ]
 [0.         0.         0.         0.         0.21394731 0.
  0.281315   0.16614917 0.33229834 0.         0.42789461 0.
  0.281315   0.281315   0.281315   0.281315   0.         0.
  0.281315   0.281315   0.281315  ]
 [0.46138073 0.46138073 0.46138073 0.         0.         0.
  0.         0.27249889 0.27249889 0.         0.         0.
  0.         0.         0.         0.         0.46138073 0.
  0.         0.         0.        ]]


In [103]:
import pandas as pd
data = pd.read_csv('SMSSpamCollection', sep='\t', names=['labels', 'message'])

In [104]:
print(data.head())

  labels                                            message
0    ham  Go until jurong point, crazy.. Available only ...
1    ham                      Ok lar... Joking wif u oni...
2   spam  Free entry in 2 a wkly comp to win FA Cup fina...
3    ham  U dun say so early hor... U c already then say...
4    ham  Nah I don't think he goes to usf, he lives aro...


### here ham=not spam, spam=spam message

In [118]:
messages=[]
for i in range(len(data)):
    words=re.sub('[^a-zA-Z]',' ', data['message'][i])
    words=nltk.word_tokenize(words)
    words=[PorterStemmer().stem(word) for word in words if word not in set(stopwords.words('english'))]
    
    message = ' '.join(words)
    messages.append(message)

In [119]:
for m in messages:
    print(m)

go jurong point crazi avail bugi n great world la e buffet cine got amor wat
ok lar joke wif u oni
free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate t c appli
u dun say earli hor u c alreadi say
nah i think goe usf live around though
freemsg hey darl week word back i like fun still tb ok xxx std chg send rcv
even brother like speak they treat like aid patent
as per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun
winner as valu network custom select receivea prize reward to claim call claim code kl valid hour
had mobil month u r entitl updat latest colour mobil camera free call the mobil updat co free
i gon na home soon want talk stuff anymor tonight k i cri enough today
six chanc win cash from pound txt csh send cost p day day tsandc appli repli hl info
urgent you week free membership prize jackpot txt word claim no t c www dbuk net lccltd pobox ldnw a rw
i search right word thank breather i 

ya go restaur
it ok askd u knw tht
free st week entri textpod chanc win gb ipod cash everi wk txt pod ts cs www textpod net custcar
those duck chinchilla
i marriag function
look like u wil b get headstart im leav bout ish u r desper compani i could head earlier goin meet rummer
don give fli monkey wot think i certainli mind ani friend mine
as regist optin subscrib ur draw gift voucher enter receipt correct an what no bbc chart
say thank
msg rajini come
ya take ure practic lesson i start june
that good i need drug
stupid possibl
can decid faster co si go home liao
summer final fanci chat flirt sexi singl yr area to get match repli summer free join optout txt stop help
u sleep or go take haha i got spi wat me onlin check n repli mail lor
clair havin borin time alon u wan na cum nite chat hope c u luv clair xx call minmoremobsemspobox po wa
fight world easi u either win lose bt fightng close u dificult u lose u lose u win u still lose
bought one rington get text cost pound offer tone etc


great i church holla get
back brum thank put us keep us happi see soon
i donno scorabl
lt gt great loxahatche xma tree burn updat total see star
ye dont care i need bad princess
the guy kadeem sell sinc break i know one guy paranoid fuck like sell without i til late tonight
sorri i call later
tmr brin lar aiya later come n c lar mayb neva set properli got da help sheet wif
do u knw di lt gt
then dun believ wat
k give back thank
i know complain num bettr directli go bsnl offc nd appli
okay i seen so pick friday
how much pay suganya
left dessert u wan go suntec look u
abeg make profit but start are use get sponsor next event
onum ela pa normal
k k sister kid
cool i text i way
nope meanwhil talk say make greet
cant talk call dont keep call
anyth lar
rose need water season need chang poet need imagin my phone need ur sm need ur love frndship forev
good afternoon babe how goe day ani job prospect yet i miss love sigh
they pick drop car problem
s think wast rr
he world famamu
is come friday 

what time thinkin goin
get free mobil video player free movi to collect text go it free extra film order c appli yr
save money wed lingeri www bridal petticoatdream co uk choos superb select nation deliveri brought weddingfriend
your board work fine the issu overh also reslov but still softwar inst pend i come around clock
ye i care caus i know
wiskey brandi rum gin beer vodka scotch shampain wine kudi yarasu dhina vaazhthukk
mon oki lor haha best cheap n gd food la ex oso oki depend whether wana eat western chines food den u prefer
sit ard noth lor u leh busi w work
it lt gt k oh should send home sale
sorri mail
ya tell abt tht incid
ye outsid like hour and i call whole famili wake caus start
ugh got outta class
nowaday peopl notixiqu laxinorf opportun bambl entrop have ever oblising opt ur book masteriast amplikat fidalf it champlaxig think atroci wotz ur opinion junna
i dont file bag work call tell find anyth room
no need lar ju test e phone card dunno network gd thk me wait si fini

want grasp pretti booti
i got tea sure flavour
i go orchard lareadi reach soon u reach
dear deni word pleas
you know old dom i told yesterday hi name roger he got touch last night want meet today pm
come back to tampa ffffuuuuuuu
celebr b day els
merri christma u anni
pleas tell special stock talk
i sent do like
urgent ur award complimentari trip eurodisinc trav aco entri or to claim txt di morefrmmob shracomorsglsuplt ls aj
awesom minut
and problem you walk around julianaland oblivi go around i say thing constantli go one ear go whatev want it know i upset listen tell what go upset then want surpris i mad
i told everyth stop just dont let get dehydr
or i guess lt gt min
i home ard wat time u reach
storm msg wen u lift phne u say hello do u knw wt real mean hello it name girl ye and u knw dat girl margaret hello she girlfrnd f grahmbel invnt telphon moral one get name person bt girlfrnd g n g h
if want mapquest someth look usf dogwood drive tini street park lot
aight i plan come later 

thank gener date brotha
by way make sure u get train worc foreg street shrub hill have fun night x
i thought get watch co that kind thing u get th and love much
you guarante award mayb even cash claim ur award call free it legitimat efreefon number wat u think
good morn at repair shop onli reason hour
and fine i got enough bud last night least
i back good journey let know need receipt shall tell like pendent
so take away money worri
aight pick open tonight
latest new polic station toilet stolen cop noth go
sac need carri
just sing hu i think also import find someon femal know place well prefer citizen also smart help navig even thing like choos phone plan requir guidanc when doubt ask especi girl
what hello wat talk email address
except there chick huge boob
im wonder right
wish beauti day each moment reveal even thing keep smile do enjoy
for sparkl shop break per person call visit www shortbreak org uk
arun u transfr amt
sorri i call later
if hear loud scream lt gt minut caus gyno sho

In [120]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()
# creating bag of words
x = cv.fit_transform(messages)

In [123]:
x=x.toarray()

In [125]:
print(x)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [127]:
y=data.iloc[:,1].values

In [128]:
y=pd.get_dummies(data['labels'])

In [129]:
print(y.head())

   ham  spam
0    1     0
1    1     0
2    0     1
3    1     0
4    1     0


In [131]:
y=y.iloc[:,1].values

In [149]:
from sklearn.model_selection import train_test_split

x_tr, x_t, y_tr, y_t = train_test_split(x, y, test_size=0.3, random_state=0)

In [150]:
print(y_tr)

[0 0 0 ... 1 0 0]


In [151]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB().fit(x_tr,y_tr)

In [152]:
y_pred = nb.predict(x_t)

In [153]:
from sklearn.metrics import confusion_matrix, accuracy_score
print(confusion_matrix(y_t, y_pred))
print(accuracy_score(y_t, y_pred))

[[1434   17]
 [  12  209]]
0.9826555023923444


# word2vec

### helps in maintaining semantic  or a relation between the words, which is missing in BAG OF WORDS and TFIDF, this converts all the words into vector of dimentions >= 32