# Types of word similarities

1. Cosine Similarity - Cosine of the angle between two vectors
2. Jaccard Similarity - score is calculated on union/intersection of words
3. Levenshtein DIstance - # of insertions, deletions and substitutions required to convert string a to b
4. Hamming distance - # of positions with the same symbols in both strings

In [4]:
documents = (
"I like NLP",
"I am exploring NLP",
"I am a beginner in NLP",
"I want to learn NLP",
"I like advanced NLP"
)

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
tfidf = TfidfVectorizer()

In [4]:
features = tfidf.fit_transform(documents)

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
cosine_similarity(features[0],features)

array([[1.        , 0.17682765, 0.14284054, 0.13489366, 0.68374784]])

# Phonetic Matching

In [76]:
#!pip install fuzzy

In [10]:
soundex = fuzzy.Soundex(4)

NameError: name 'fuzzy' is not defined

In [None]:
soundex('electrcty')

In [None]:
soundex('electricity')

# Tagging POS

In [6]:
from nltk import pos_tag, pos_tag_sents

In [22]:
k = pos_tag('I like artificial intelligence'.split())
k

[('I', 'PRP'), ('like', 'VBP'), ('artificial', 'JJ'), ('intelligence', 'NN')]

In [12]:
import pandas as pd

In [19]:
print(pd.DataFrame(k).to_html(index=False))

<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th>0</th>
      <th>1</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>I</td>
      <td>PRP</td>
    </tr>
    <tr>
      <td>like</td>
      <td>VBP</td>
    </tr>
    <tr>
      <td>artificial</td>
      <td>JJ</td>
    </tr>
    <tr>
      <td>intelligence</td>
      <td>NN</td>
    </tr>
  </tbody>
</table>


In [8]:
pos_tag_sents([sent.split() for sent in documents])

[[('I', 'PRP'), ('like', 'VBP'), ('NLP', 'NNP')],
 [('I', 'PRP'), ('am', 'VBP'), ('exploring', 'VBG'), ('NLP', 'NNP')],
 [('I', 'PRP'),
  ('am', 'VBP'),
  ('a', 'DT'),
  ('beginner', 'NN'),
  ('in', 'IN'),
  ('NLP', 'NNP')],
 [('I', 'PRP'),
  ('want', 'VBP'),
  ('to', 'TO'),
  ('learn', 'VB'),
  ('NLP', 'NNP')],
 [('I', 'PRP'), ('like', 'VBP'), ('advanced', 'JJ'), ('NLP', 'NNP')]]

# Entity Extraction

In [29]:
# Using NLTK

In [38]:
import nltk

In [30]:
from nltk import ne_chunk

In [47]:
from nltk import word_tokenize, pos_tag

In [49]:
sent = "John is studying at Stanford University in California"

In [50]:
# nltk.download('maxent_ne_chunker')

In [51]:
# nltk.download('words')

In [77]:
#ne_chunk(pos_tag(word_tokenize(sent)), binary=False)

In [54]:
# Using spacy

In [58]:
#!pip install spacy

In [59]:
import spacy

In [75]:
#!python -m spacy download en_core_web_sm

In [70]:
# nlp = spacy.load('en_core_web_sm') - didnt work
import en_core_web_sm

In [71]:
nlp = en_core_web_sm.load()

In [72]:
doc = nlp(u'Apple is ready to launch new phone worth $10000 in New york time square')

In [74]:
for e in doc.ents:
    print(e.text,e.label_)

Apple ORG
10000 MONEY
New york GPE


# Extracting topics - dropped

# Classifying Texts

1. Spam/ham detection
2. Sentiment Analysis
3. Disambiguating text
4. Speech to text
5. Text to speech
6. Language Detection and Translation

In [90]:
# Spam detection

In [91]:
import pandas as pd

In [126]:
dataset = pd.read_csv(r'spam.csv',encoding='latin1')

In [127]:
dataset

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [128]:
# cleaning dataset

In [129]:
dataset = dataset[['v2','v1']]

In [130]:
# Starting text preprocessin gpipeline
# Lowering text

In [131]:
dataset['v2'] = dataset['v2'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['v2'] = dataset['v2'].apply(lambda x: x.lower())


In [132]:
# Remove Stopwords

In [133]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [134]:
dataset['v2'] = dataset['v2'].apply(lambda x : ' '.join([word for word in x.split() if word not in stop]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['v2'] = dataset['v2'].apply(lambda x : ' '.join([word for word in x.split() if word not in stop]))


In [135]:
# Stemming

In [136]:
from nltk.stem import SnowballStemmer

In [137]:
stemmer = SnowballStemmer(language='english')

In [138]:
dataset['v2'] = dataset['v2'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['v2'] = dataset['v2'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))


In [140]:
# Lemmatize

In [146]:
from textblob import Word

In [147]:
dataset['v2'] = dataset['v2'].apply(lambda x: ' '.join([Word(word).lemmatize() for word in x.split()]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['v2'] = dataset['v2'].apply(lambda x: ' '.join([Word(word).lemmatize() for word in x.split()]))


In [148]:
dataset

Unnamed: 0,v2,v1
0,"go jurong point, crazy.. avail bugi n great wo...",ham
1,ok lar... joke wif u oni...,ham
2,free entri 2 wkli comp win fa cup final tkts 2...,spam
3,u dun say earli hor... u c alreadi say...,ham
4,"nah think goe usf, live around though",ham
...,...,...
5567,2nd time tri 2 contact u. u å£750 pound prize....,spam
5568,ì_ b go esplanad fr home?,ham
5569,"pity, * mood that. so...ani suggestions?",ham
5570,guy bitch act like i'd interest buy someth el ...,ham


In [151]:
# Train Test Split

In [152]:
from sklearn.model_selection import train_test_split

In [153]:
X_train, X_test, y_train, y_test = train_test_split(dataset['v2'],dataset['v1'], test_size = 0.1, shuffle = True)

In [154]:
#Label encoding target

In [157]:
from sklearn.preprocessing import LabelEncoder

In [159]:
lbl_enc = LabelEncoder()
y_train = lbl_enc.fit_transform(y_train)
y_test = lbl_enc.transform(y_test)

In [160]:
# Tfidf with features}

In [161]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [162]:
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

In [163]:
# Training a MultinomialNB

In [164]:
from sklearn.naive_bayes import MultinomialNB

In [165]:
mnb = MultinomialNB()

In [166]:
mnb.fit(X_train,y_train)
y_pred = mnb.predict(X_test)

In [167]:
from sklearn.metrics import accuracy_score

In [168]:
accuracy_score(y_test,y_pred)

0.978494623655914

In [169]:
from sklearn.linear_model import SGDClassifier

In [173]:
sgd = SGDClassifier()
sgd.fit(X_train,y_train)
y_pred = sgd.predict(X_test)
accuracy_score(y_test,y_pred)

0.9946236559139785

In [174]:
from sklearn.metrics import confusion_matrix

In [178]:
confusion_matrix(y_test,y_pred)

array([[487,   0],
       [  3,  68]], dtype=int64)

In [179]:
accuracy_score(y_train,sgd.predict(X_train))

0.9990027921818907

In [180]:
confusion_matrix(y_train,sgd.predict(X_train))

array([[4337,    1],
       [   4,  672]], dtype=int64)

# Sentiment Analysis

In [1]:
from textblob import TextBlob

In [2]:
sentence = 'coffee is so good at this place'

In [3]:
tb = TextBlob(sentence)

In [7]:
tb.sentiment

Sentiment(polarity=0.7, subjectivity=0.6000000000000001)

In [203]:
# Khatamm

# Disambiguating Text

In [1]:
sentences = ['I went to bank for money deposit',
             'a river bank for fishing']

In [15]:
sentences2 = ['I am eating an apple',
             'apple iphone costs 25000 usd']

In [16]:
#!pip install pywsd

In [17]:
from pywsd.lesk import simple_lesk

In [18]:
answer0 = simple_lesk(sentences2[0],'apple')

In [19]:
answer0.definition()

'native Eurasian tree widely cultivated in many varieties for its firm rounded edible fruits'

In [20]:
answer1 = simple_lesk(sentences2[1],'apple')

In [21]:
answer1.definition()

'native Eurasian tree widely cultivated in many varieties for its firm rounded edible fruits'

# Language Detection and Translation 

### Only runs while online and not offline

In [284]:
#!pip install goslate

In [253]:
import goslate

In [255]:
gs = goslate.Goslate()

In [256]:
text = 'bonjour le monde'

In [258]:
gs.detect(text)

'fr'

In [283]:
gs.translate(text,'en')

'Hi world'

In [None]:
#