In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Read the whole corpus file into a single string.
# The encoding is passed explicitly so the result does not depend on the
# platform's default text encoding.
with open('./text', 'r', encoding='utf-8') as f:
    data = f.read()

In [3]:
data

'virat smashed another ton\nworld cup start at 30th may 2019 in english\nnarendera modi visits andhra pradesh for election summit\ncurrently phase 9 election is completed in west bangal\ntemperature in india turning 50 due to global warming\n'

In [4]:
 Q = 'when worlscup starts ?'

# Word Embedding 
 - Bag of words
 - TFIDF

#  Word Embedding
 - Bag of Words (BOW)

Text Cleaning
- converting to lowercase
- remove special characters


In [5]:
data = data.lower()

In [6]:
import re
import nltk
# nltk.sent_tokenize(data)

In [7]:
# One document per non-empty line. Filtering blank lines removes the empty
# string produced by the file's trailing newline without hard-coding how many
# documents the corpus contains, and leaves `df` as a freshly built frame
# rather than a slice of another frame.
docs = [line for line in data.split('\n') if line.strip()]
df = pd.DataFrame(docs, columns=['Documents'])

# Text cleaning
 - Lower
 - remove special characters

In [8]:
def textprocess(data):
    """Normalise a piece of text for vectorisation.

    The text is lower-cased, then every run of characters other than
    ``a-z`` and ``0-9`` is replaced by a single space.
    """
    lowered = data.lower()
    return re.sub(r'[^a-z0-9]+', ' ', lowered)

In [9]:
df['Documents'] = df['Documents'].apply(textprocess)

In [10]:
df

Unnamed: 0,Documents
0,virat smashed another ton
1,world cup start at 30th may 2019 in english
2,narendera modi visits andhra pradesh for elect...
3,currently phase 9 election is completed in wes...
4,temperature in india turning 50 due to global ...


# Lemma 
 - taking root word

In [11]:
from nltk.stem import wordnet
from nltk import pos_tag # parts of speach taging

In [12]:
lema = wordnet.WordNetLemmatizer()

In [13]:
sent = df['Documents'].loc[0]
sent

'virat smashed another ton'

In [14]:
tokens = sent.split()

In [15]:
pos_tag(tokens)  # tokens are known as words

[('virat', 'NN'), ('smashed', 'VBD'), ('another', 'DT'), ('ton', 'NN')]

# CC	coordinating conjunction
# CD	cardinal digit
# DT	determiner
# EX	existential there (like: "there is" ... think of it like "there exists")
# FW	foreign word
# IN	preposition/subordinating conjunction
# JJ	adjective	'big'
# JJR	adjective, comparative	'bigger'
# JJS	adjective, superlative	'biggest'
# LS	list marker	1)
# MD	modal	could, will
# NN	noun, singular 'desk'
# NNS	noun plural	'desks'
# NNP	proper noun, singular	'Harrison'
# NNPS	proper noun, plural	'Americans'
# PDT	predeterminer	'all the kids'
# POS	possessive ending	parent's
# PRP	personal pronoun	I, he, she
# PRP$	possessive pronoun	my, his, hers
# RB	adverb	very, silently,
# RBR	adverb, comparative	better
# RBS	adverb, superlative	best
# RP	particle	give up
# TO	to	go 'to' the store.
# UH	interjection	errrrrrrrm
# VB	verb, base form	take
# VBD	verb, past tense	took
# VBG	verb, gerund/present participle	taking
# VBN	verb, past participle	taken
# VBP	verb, sing. present, non-3d	take
# VBZ	verb, 3rd person sing. present	takes
# WDT	wh-determiner	which
# WP	wh-pronoun	who, what
# WP$	possessive wh-pronoun	whose
# WRB	wh-adverb	where, when

In [16]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/praveen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:
lema.lemmatize('are',pos='v')

'be'

In [18]:
lema.lemmatize('smashed',pos='n') # lemmatized as a noun, the word comes back unchanged

'smashed'

In [19]:
wordnet.wordnet.NOUN

'n'

In [20]:
wordnet.wordnet.ADJ

'a'

In [21]:
wordnet.wordnet.ADV

'r'

In [22]:
wordnet.wordnet.VERB

'v'

In [23]:
# df = pd.DataFrame(pos_tag(tokens))


In [24]:
# df.columns = ['words','pos']

In [25]:
# poc1 = []
# for i in df['pos']:
#     if i == 'NN':
#         poc1.append('n')
#     elif i =='ADj':
#         poc1.append('a')
#     elif i =='ADV':
#         poc1.append('r')
    
        

# another method sirs

In [26]:
pos = pos_tag(tokens)

In [27]:
def lemmatizer(pos_tag):
    """Lemmatize a single ``(word, tag)`` pair with the module-level ``lema``.

    The start of the tag picks the part-of-speech code handed to the
    lemmatizer: ``R`` -> ``'r'``, ``V`` -> ``'v'``, ``J`` -> ``'a'``;
    any other tag falls back to ``'n'``.
    """
    token, tag = pos_tag
    prefix_to_code = (('R', 'r'), ('V', 'v'), ('J', 'a'))
    # First matching prefix wins; tags matching none of them use 'n'.
    code = next(
        (wn_code for prefix, wn_code in prefix_to_code if tag.startswith(prefix)),
        'n',
    )
    return lema.lemmatize(token, pos=code)

In [28]:
def joinlema(sent):
    """Return ``sent`` with every whitespace-separated word lemmatized.

    The words are POS-tagged with ``pos_tag``, each ``(word, tag)`` pair is
    passed to ``lemmatizer``, and the results are joined with single spaces.
    """
    tagged_words = pos_tag(sent.split())
    lemmas = [lemmatizer(pair) for pair in tagged_words]
    return " ".join(lemmas)

In [29]:
df['lemm'] = df['Documents'].apply(joinlema)

In [30]:
df

Unnamed: 0,Documents,lemm
0,virat smashed another ton,virat smash another ton
1,world cup start at 30th may 2019 in english,world cup start at 30th may 2019 in english
2,narendera modi visits andhra pradesh for elect...,narendera modi visit andhra pradesh for electi...
3,currently phase 9 election is completed in wes...,currently phase 9 election be complete in west...
4,temperature in india turning 50 due to global ...,temperature in india turn 50 due to global war...


# TF-IDF
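$\mathrm{tfidf} = \log(TF+1) \times \log\left(\frac{N}{DF+1}\right)$
 - TF = Term Frequency (number of times the term occurs in a document)
 - DF = Document Frequency (number of documents that contain the term)
 - N = total number of documents in the corpus
- term = word
### +1 is added so that neither the logarithm nor the division is evaluated at zero, which would make the value undefined (infinite)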

In [31]:
print(data)

virat smashed another ton
world cup start at 30th may 2019 in english
narendera modi visits andhra pradesh for election summit
currently phase 9 election is completed in west bangal
temperature in india turning 50 due to global warming



In [32]:
from nltk.corpus import stopwords

In [33]:
stop = stopwords.words('english')

# Instead of removing stopwords we apply TF-IDF

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [35]:
tfidf = TfidfVectorizer()

In [36]:
X = tfidf.fit_transform(df['lemm']).toarray()

In [48]:
Q = 'when world cup starts?'

In [49]:
# Clean and lemmatize the question, then transform it with the fitted
# TF-IDF vectorizer to get a dense query vector.
query = joinlema(textprocess(Q))
query_vector = tfidf.transform([query]).toarray()

In [50]:
query_vector.shape

(1, 35)

In [51]:
from sklearn.metrics.pairwise import cosine_similarity

In [53]:
cosine_similarity(X,query_vector)

array([[0.      ],
       [0.595896],
       [0.      ],
       [0.      ],
       [0.      ]])

In [54]:
# The problem with stemming is that it converts 'another' to 'anoth', which is not a dictionary word