## Count Vectorizer

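Background reading on class imbalance (not specific to `CountVectorizer`): [Focal loss for handling the issue of class imbalance](https://medium.com/data-science-ecom-express/focal-loss-for-handling-the-issue-of-class-imbalance-be7addebd856)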

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import pandas as pd
import numpy as np

In [None]:
# Create our vectorizer: a CountVectorizer with default settings.
# It is fitted on the training documents in a later cell.
vectorizer = CountVectorizer()


In [None]:
# All data: load the 20 Newsgroups train and test splits.
# The same `remove` tuple is passed for both splits so headers, footers and
# quotes are stripped consistently from train and test documents.
newsgroups_train = fetch_20newsgroups(subset='train',
                                      remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test',
                                     remove=('headers', 'footers', 'quotes'))

In [None]:
# Put the raw training documents into a DataFrame for inspection.
# No column name is given, so the text column is labelled 0.
df = pd.DataFrame(data=newsgroups_train['data'])

In [None]:
# Attach each document's class label as a `target` column, then preview the first rows.
df['target'] = newsgroups_train['target']
df.head()

Unnamed: 0,0,target
0,I was wondering if anyone out there could enli...,7
1,A fair number of brave souls who upgraded thei...,4
2,"well folks, my mac plus finally gave up the gh...",4
3,\nDo you have Weitek's address/phone number? ...,1
4,"From article <C5owCB.n3p@world.std.com>, by to...",14


In [None]:
# Number of documents in each split.
# Calling len() on the object returned by fetch_20newsgroups only counts its
# fields (which is where the misleading "5" came from), so measure the
# `data` list of documents instead.
print(len(newsgroups_train.data))
len(newsgroups_test.data)

5


5

In [None]:
# Get the training vectors: fit the vectorizer on the training documents
# and transform those same documents in one step.
vectors = vectorizer.fit_transform(newsgroups_train.data)

In [None]:
# Build a multinomial naive Bayes classifier (alpha=0.01) and train it
# on the count vectors of the training split in a single expression.
clf = MultinomialNB(alpha=0.01).fit(vectors, newsgroups_train.target)

# Show the fitted estimator as the cell output
clf

In [None]:
# Get the test vectors: only `transform` here, so the test documents are
# encoded by the vectorizer that was already fitted on the training split.
vectors_test = vectorizer.transform(newsgroups_test.data)

In [None]:
# Predict labels for the test vectors, then compute accuracy and macro-averaged F1
pred = clf.predict(vectors_test)
acc_score, f1_score = (
    metrics.accuracy_score(newsgroups_test.target, pred),
    metrics.f1_score(newsgroups_test.target, pred, average='macro'),
)

# Report both metrics, one line each
for template, score in (
    ('Total accuracy classification score: {}', acc_score),
    ('Total F1 classification score: {}', f1_score),
):
    print(template.format(score))

Total accuracy classification score: 0.6460435475305364
Total F1 classification score: 0.6203806145034193


## TF-IDF Vectorizer

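* [TF-IDF Vectorizer in scikit-learn](https://medium.com/@cmukesh8688/tf-idf-vectorizer-scikit-learn-dbc0244a911a)

* [Word2Vec research paper explained](https://towardsdatascience.com/word2vec-research-paper-explained-205cb7eecc30)

* [Related video (YouTube)](https://www.youtube.com/watch?v=LSS_bos_TPI)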





In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Create a TF-IDF vectorizer with default settings; it is fitted on the training documents in a later cell.
tfidfvectorizer = TfidfVectorizer()


In [None]:
# Fit the TF-IDF vectorizer on the training documents and transform them in one step.
tfidf_wm = tfidfvectorizer.fit_transform(newsgroups_train.data)


In [None]:
# Build a fresh multinomial naive Bayes classifier (alpha=0.01) and train it
# on the TF-IDF vectors of the training split. This rebinds `clf`.
clf = MultinomialNB(alpha=0.01).fit(tfidf_wm, newsgroups_train.target)

# Show the fitted estimator as the cell output
clf

In [None]:
# Get the test vectors, encoded by the TF-IDF vectorizer fitted on the training split.
# NOTE: despite its name, `tf_idf_train` holds the vectors of the TEST documents
# (newsgroups_test.data); the name is kept because the scoring cell reads it.
tf_idf_train = tfidfvectorizer.transform(newsgroups_test.data)

In [None]:
# Predict labels for the TF-IDF test vectors, then compute accuracy and macro-averaged F1
pred = clf.predict(tf_idf_train)
acc_score, f1_score = (
    metrics.accuracy_score(newsgroups_test.target, pred),
    metrics.f1_score(newsgroups_test.target, pred, average='macro'),
)

# Report both metrics, one line each
for template, score in (
    ('Total accuracy classification score: {}', acc_score),
    ('Total F1 classification score: {}', f1_score),
):
    print(template.format(score))

Total accuracy classification score: 0.7002124269782263
Total F1 classification score: 0.682861129525057


## Text Similarity


*   [Jaccard Similarity](https://www.educative.io/answers/what-is-the-jaccard-similarity-measure-in-nlp)
*   [Fuzzy String Matching](https://towardsdatascience.com/natural-language-processing-for-fuzzy-string-matching-with-python-6632b7824c49)
*   [Cosine Similarity](https://paulminogue.com/index.php/2019/09/29/introduction-to-cosine-similarity/)







In [None]:
def jaccard_similarity(a, b):
    """Return the Jaccard similarity of two collections.

    The similarity is the size of the intersection divided by the size of
    the union. Both arguments are converted to sets first, so duplicate
    items are ignored and a string is compared as its set of characters.

    Args:
        a: Any iterable of hashable items (list of tokens, string, set, ...).
        b: Any iterable of hashable items.

    Returns:
        A float between 0.0 (nothing shared) and 1.0 (identical sets).
        Two empty collections are treated as identical and yield 1.0
        rather than raising ZeroDivisionError.
    """
    # convert to set
    a = set(a)
    b = set(b)
    union = a | b
    if not union:
        # Both inputs are empty: the ratio would be 0/0, so define it as 1.0.
        return 1.0
    # calculate jaccard similarity
    return len(a & b) / len(union)

In [None]:
# Token lists: the repeated "no" counts once after conversion to a set,
# leaving one shared token ("doge") out of seven distinct tokens overall.
l1 = ["no", "high", "no", "low", "only", "doge"]
l2 = ["one", "word", "doge"]
jaccard_similarity(l1, l2)

0.14285714285714285

In [None]:
# Strings are turned into sets of characters, so identical strings score 1.0.
jaccard_similarity("morning", "morning")


1.0

In [None]:
!pip install fuzzywuzzy
!pip install python-Levenshtein

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting python-Levenshtein
  Downloading python_Levenshtein-0.21.1-py3-none-any.whl (9.4 kB)
Collecting Levenshtein==0.21.1 (from python-Levenshtein)
  Downloading Levenshtein-0.21.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (172 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.5/172.5 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz<4.0.0,>=2.3.0 (from Levenshtein==0.21.1->python-Levenshtein)
  Downloading rapidfuzz-3.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB

In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [None]:
# Same characters, but the second string has its spaces removed.
fuzz.ratio('string with space', 'stringwithspace')

94

In [None]:
# Exact match: both arguments are the same string.
fuzz.ratio('exact string', 'exact string')

100

In [None]:
# The inputs differ by a transposition ('sapce'), capitalization and a trailing space.
fuzz.ratio('string with sapce', 'String With Space ')


74

In [None]:
import nltk
# Download the complete 'all' NLTK data collection.
# Later cells read the stop-word list, the Brown corpus and the pruned
# word2vec sample through NLTK, and tokenize text with word_tokenize.
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   U

True

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Sentences to compare; replace these two strings to try other inputs.
X = "I love horror movies"
Y = "Lights out is a horror movie"

# Tokenize in lowercase. NLTK's English stop-word list is lowercase, so
# without lowercasing a capitalized stop word such as "I" is kept as a keyword.
X_list = word_tokenize(X.lower())
Y_list = word_tokenize(Y.lower())

# sw contains the stop words as a set for constant-time membership tests
sw = set(stopwords.words('english'))

# remove stop words from each sentence, keeping the distinct keywords
X_set = {w for w in X_list if w not in sw}
Y_set = {w for w in Y_list if w not in sw}

# form the shared vocabulary and one binary presence vector per sentence;
# both vectors iterate the same unmodified set, so their positions line up
rvector = X_set.union(Y_set)
l1 = [int(w in X_set) for w in rvector]
l2 = [int(w in Y_set) for w in rvector]

# cosine formula: dot product over the product of the vector lengths
# (for 0/1 vectors the squared length equals the number of ones)
c = sum(x * y for x, y in zip(l1, l2))
norm = float((sum(l1) * sum(l2)) ** 0.5)

# A sentence made only of stop words has a zero-length vector; report 0.0
# similarity in that case instead of dividing by zero.
cosine = c / norm if norm else 0.0
print("similarity: ", cosine)

similarity:  0.2886751345948129


In [None]:
# Run the setup helper from NLTK's gensim test fixtures before using gensim.
# NOTE(review): presumably this checks that gensim is importable -- confirm
# against the NLTK source; it is called here only for its side effect.
from nltk.test.gensim_fixt import setup_module
setup_module()

In [None]:
import gensim

In [None]:
# Train a Word2Vec model from scratch on the first 10,000 sentences of the
# Brown corpus, leaving every gensim hyperparameter at its default.
from nltk.corpus import brown
train_set = brown.sents()[:10000]
model = gensim.models.Word2Vec(train_set)

In [None]:
# Display the trained model object.
model

<gensim.models.word2vec.Word2Vec at 0x7f70561bf1c0>

In [None]:
# Similarity between the vectors of two words in the model trained above.
# NOTE(review): a value this close to 1 is likely an artifact of the small
# 10,000-sentence training set rather than real synonymy -- confirm.
model.wv.similarity('university','school')

0.99336874

In [None]:
# Locate the pruned word2vec sample shipped with the NLTK data and load it
# as text-format (binary=False) keyed vectors.
# NOTE: this rebinds `model`, replacing the Word2Vec instance trained on the
# Brown sentences with the loaded KeyedVectors object.
from nltk.data import find
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)

In [None]:
# Nearest neighbours of 'university' in the loaded pretrained vectors.
model.most_similar(positive=['university'])


[('universities', 0.7003918290138245),
 ('faculty', 0.6780906915664673),
 ('undergraduate', 0.6587096452713013),
 ('campus', 0.6434987783432007),
 ('college', 0.638526976108551),
 ('academic', 0.6317198276519775),
 ('professors', 0.6298646926879883),
 ('undergraduates', 0.6149812936782837),
 ('University', 0.6139305233955383),
 ('student', 0.600540041923523)]