# Tokenization

Tokenization is the process of breaking down the given text in natural language processing into the smallest unit in a sentence called a token. Punctuation marks, words, and numbers can be considered tokens.

In [1]:
text = 'Hi Everyone! This is Hackers Realm. We are learning Natural Language Processing. We reached 1000000 views.'

In [2]:
text.split(' ')

['Hi',
 'Everyone!',
 'This',
 'is',
 'Hackers',
 'Realm.',
 'We',
 'are',
 'learning',
 'Natural',
 'Language',
 'Processing.',
 'We',
 'reached',
 '1000000',
 'views.']

In [3]:
from nltk import sent_tokenize, word_tokenize

D:\shanu_user\Lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
D:\shanu_user\Lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


In [4]:
# split the text into sentences
sent_tokens = sent_tokenize(text)
sent_tokens

['Hi Everyone!',
 'This is Hackers Realm.',
 'We are learning Natural Language Processing.',
 'We reached 1000000 views.']

In [5]:
# split the text into words
word_tokens = word_tokenize(text)
word_tokens

['Hi',
 'Everyone',
 '!',
 'This',
 'is',
 'Hackers',
 'Realm',
 '.',
 'We',
 'are',
 'learning',
 'Natural',
 'Language',
 'Processing',
 '.',
 'We',
 'reached',
 '1000000',
 'views',
 '.']

# Stemming

Stemming is the process of finding the root of words. A word stem need not be the same root as a dictionary-based morphological root, it just is an equal to or smaller form of the word.

In [6]:
from nltk.stem import PorterStemmer, SnowballStemmer
ps = PorterStemmer()

In [7]:
word = ('eats')
ps.stem(word)

'eat'

In [8]:
word = ('eating')
ps.stem(word)

'eat'

In [9]:
word = ('eaten')
ps.stem(word)

'eaten'

In [10]:
text = 'Hi Everyone! This is Hackers Realm. We are learning Natural Language Processing. We reached 1000000 views.'

In [11]:
word_tokens = word_tokenize(text)

In [12]:
stemmed_sentence = " ".join(ps.stem(word) for word in word_tokens)
stemmed_sentence

'hi everyon ! thi is hacker realm . we are learn natur languag process . we reach 1000000 view .'

# Lemmatization

Lemmatization is the process of finding the form of the related word in the dictionary. It is different from Stemming. It involves longer processes to calculate than Stemming.

In [13]:
#import nltk
#nltk.download('wordnet')

In [14]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [15]:
lemmatizer.lemmatize('workers')

'worker'

In [16]:
lemmatizer.lemmatize('words')

'word'

In [17]:
lemmatizer.lemmatize('feet')

'foot'

In [18]:
lemmatizer.lemmatize('stripes', 'v')

'strip'

In [19]:
lemmatizer.lemmatize('stripes', 'n')

'stripe'

In [20]:
text = 'Hi Everyone! This is Hackers Realm. We are learning Natural Language Processing. We reached 1000000 views.'

In [21]:
word_tokens = word_tokenize(text)

In [22]:
lemmatized_sentence = " ".join(lemmatizer.lemmatize(word.lower()) for word in word_tokens)
lemmatized_sentence

'hi everyone ! this is hacker realm . we are learning natural language processing . we reached 1000000 view .'

# Part of Speech Tagging (POS)

Part of Speech Tagging is a process of converting a sentence to forms — list of words, list of tuples (where each tuple is having a form (word, tag)). The tag in case of is a part-of-speech tag, and signifies whether the word is a noun, adjective, verb, and so on.

https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [23]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [Errno 11001] getaddrinfo failed>


False

In [24]:
from nltk import pos_tag

In [25]:
pos_tag(['fighting'])

[('fighting', 'VBG')]

In [26]:
text = 'Hi Everyone! This is Hackers Realm. We are learning Natural Language Processing. We reached 1000000 views.'

In [27]:
word_tokens = word_tokenize(text)

In [28]:
pos_tag(word_tokens)

[('Hi', 'NNP'),
 ('Everyone', 'NN'),
 ('!', '.'),
 ('This', 'DT'),
 ('is', 'VBZ'),
 ('Hackers', 'NNP'),
 ('Realm', 'NNP'),
 ('.', '.'),
 ('We', 'PRP'),
 ('are', 'VBP'),
 ('learning', 'VBG'),
 ('Natural', 'NNP'),
 ('Language', 'NNP'),
 ('Processing', 'NNP'),
 ('.', '.'),
 ('We', 'PRP'),
 ('reached', 'VBD'),
 ('1000000', 'CD'),
 ('views', 'NNS'),
 ('.', '.')]

# Text Preprocessing (Clean Data)

In [29]:
import pandas as pd
import string
df = pd.read_csv('data/Twitter Sentiments.csv')
# drop the columns
df = df.drop(columns=['id', 'label'], axis=1)
df.head()

Unnamed: 0,tweet
0,@user when a father is dysfunctional and is s...
1,@user @user thanks for #lyft credit i can't us...
2,bihday your majesty
3,#model i love u take with u all the time in ...
4,factsguide: society now #motivation


## Convert to lowercase

In [30]:
df['clean_text'] = df['tweet'].str.lower()
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,@user when a father is dysfunctional and is s...
1,@user @user thanks for #lyft credit i can't us...,@user @user thanks for #lyft credit i can't us...
2,bihday your majesty,bihday your majesty
3,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
4,factsguide: society now #motivation,factsguide: society now #motivation


## Removal of Punctuations

In [31]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [32]:
def remove_punctuations(text):
    punctuations = string.punctuation
    return text.translate(str.maketrans('', '', punctuations))

In [33]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_punctuations(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,user when a father is dysfunctional and is so...
1,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit i cant use ca...
2,bihday your majesty,bihday your majesty
3,#model i love u take with u all the time in ...,model i love u take with u all the time in u...
4,factsguide: society now #motivation,factsguide society now motivation


## Removal of Stopwords

In [34]:
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

In [35]:
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in STOPWORDS])

In [36]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_stopwords(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,user father dysfunctional selfish drags kids d...
1,@user @user thanks for #lyft credit i can't us...,user user thanks lyft credit cant use cause do...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model love u take u time urð± ðððð...
4,factsguide: society now #motivation,factsguide society motivation


## Removal of Frequent Words

In [37]:
from collections import Counter
word_count = Counter()
for text in df['clean_text']:
    for word in text.split():
        word_count[word] += 1
        
word_count.most_common(10)

[('user', 17473),
 ('love', 2647),
 ('day', 2198),
 ('happy', 1663),
 ('amp', 1582),
 ('im', 1139),
 ('u', 1136),
 ('time', 1110),
 ('life', 1086),
 ('like', 1042)]

In [38]:
FREQUENT_WORDS = set(word for (word, wc) in word_count.most_common(3))
def remove_freq_words(text):
    return " ".join([word for word in text.split() if word not in FREQUENT_WORDS])

In [39]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_freq_words(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model u take u time urð± ðððð ð...
4,factsguide: society now #motivation,factsguide society motivation


## Removal of Rare Words

In [40]:
RARE_WORDS = set(word for (word, wc) in word_count.most_common()[:-10:-1])
RARE_WORDS

{'airwaves',
 'carnt',
 'chisolm',
 'ibizabringitonmallorcaholidayssummer',
 'isz',
 'mantle',
 'shirley',
 'youuuð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8dâ\x9d¤ï¸\x8f',
 'ð\x9f\x99\x8fð\x9f\x8f¼ð\x9f\x8d¹ð\x9f\x98\x8eð\x9f\x8eµ'}

In [41]:
def remove_rare_words(text):
    return " ".join([word for word in text.split() if word not in RARE_WORDS])

In [42]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_rare_words(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model u take u time urð± ðððð ð...
4,factsguide: society now #motivation,factsguide society motivation


## Removal of Special characters

In [43]:
import re
def remove_spl_chars(text):
    text = re.sub('[^a-zA-Z0-9]', ' ', text)
    text = re.sub('\s+', ' ', text)
    return text

In [44]:
df['clean_text'] = df['clean_text'].apply(lambda x: remove_spl_chars(x))
df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model u take u time ur
4,factsguide: society now #motivation,factsguide society motivation


## Stemming

In [45]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def stem_words(text):
    return " ".join([ps.stem(word) for word in text.split()])

In [46]:
df['stemmed_text'] = df['clean_text'].apply(lambda x: stem_words(x))
df.head()

Unnamed: 0,tweet,clean_text,stemmed_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...,father dysfunct selfish drag kid dysfunct run
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...,thank lyft credit cant use caus dont offer whe...
2,bihday your majesty,bihday majesty,bihday majesti
3,#model i love u take with u all the time in ...,model u take u time ur,model u take u time ur
4,factsguide: society now #motivation,factsguide society motivation,factsguid societi motiv


## Lemmatization & POS Tagging

In [47]:
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}

def lemmatize_words(text):
    # find pos tags
    pos_text = pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_text])

In [48]:
wordnet.NOUN

'n'

In [49]:
df['lemmatized_text'] = df['clean_text'].apply(lambda x: lemmatize_words(x))
df.head()

Unnamed: 0,tweet,clean_text,stemmed_text,lemmatized_text
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...,father dysfunct selfish drag kid dysfunct run,father dysfunctional selfish drag kid dysfunct...
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...,thank lyft credit cant use caus dont offer whe...,thanks lyft credit cant use cause dont offer w...
2,bihday your majesty,bihday majesty,bihday majesti,bihday majesty
3,#model i love u take with u all the time in ...,model u take u time ur,model u take u time ur,model u take u time ur
4,factsguide: society now #motivation,factsguide society motivation,factsguid societi motiv,factsguide society motivation


In [50]:
df.sample(frac=1).head(10)

Unnamed: 0,tweet,clean_text,stemmed_text,lemmatized_text
26012,my latest post: the masses â some kinda mons...,latest post masses kinda monster democracy,latest post mass kinda monster democraci,late post mass kinda monster democracy
2331,"when angels forget to fly, when it's 20 below...",angels forget fly 20 july violets red roses bl...,angel forget fli 20 juli violet red rose blue ...,angel forget fly 20 july violet red rose blue ...
13798,@user it is coming.........7 july release dat...,coming7 july release date lmll ibiza techno,coming7 juli releas date lmll ibiza techno,coming7 july release date lmll ibiza techno
1638,indeed fathers day.god bless my dad,indeed fathers daygod bless dad,inde father daygod bless dad,indeed father daygod bless dad
17926,"@user if a ""negro"" is good he is allowed to go...",negro good allowed go heaven servant eternity ...,negro good allow go heaven servant etern accor...,negro good allow go heaven servant eternity ac...
20572,wellybel - na: #xxx #sexy #slut #nasty #porn...,wellybel na xxx sexy slut nasty porn naughty h...,wellybel na xxx sexi slut nasti porn naughti h...,wellybel na xxx sexy slut nasty porn naughty h...
7848,@user so that @user is presenting âcurves...,presenting curves confidence clothesshow 2016 ...,present curv confid clothesshow 2016 get ticket,present curve confidence clothesshow 2016 get ...
1972,lord i hope may result na next week. para next...,lord hope may result na next week para next pr...,lord hope may result na next week para next pr...,lord hope may result na next week para next pr...
17937,a giant version of the #flag of #rebellion #se...,giant version flag rebellion sedition flapping...,giant version flag rebellion sedit flap virgin...,giant version flag rebellion sedition flap vir...
8982,now that i have a job despite no paychecks i h...,job despite paychecks 2 pay food nowcuz tips get,job despit paycheck 2 pay food nowcuz tip get,job despite paycheck 2 pay food nowcuz tip get


## Removal of URLs

In [51]:
text = "https://www.hackersrealm.net is the URL of the channel Hackers Realm"

In [52]:
def remove_url(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)

In [53]:
remove_url(text)

' is the URL of the channel Hackers Realm'

## Removal of HTML Tags

In [54]:
text = "<html><body> <h1>Hackers Realm</h1> <p>This is NLP text preprocessing tutorial</p> </body></html>"

In [55]:
def remove_html_tags(text):
    return re.sub(r'<.*?>', '', text)

In [56]:
remove_html_tags(text)

' Hackers Realm This is NLP text preprocessing tutorial '

## Spelling Correction

In [57]:
#!pip install pyspellchecker

In [58]:
text = 'natur is a beuty'

In [59]:
from spellchecker import SpellChecker
spell = SpellChecker()

def correct_spellings(text):
    corrected_text = []
    misspelled_text = spell.unknown(text.split())
    # print(misspelled_text)
    for word in text.split():
        if word in misspelled_text:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
            
    return " ".join(corrected_text)

In [60]:
correct_spellings(text)

'nature is a beauty'

# Feature Extraction from Text Data

## Bag of Words

A bag-of-words is a representation of text that describes the occurrence of words within a document. It involves two things: A vocabulary of known words. A measure of the presence of known words.

In [61]:
text_data = ['I am interested in NLP', 'This is a good tutorial with good topic', 'Feature extraction is very important topic']

In [62]:
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(stop_words='english')

In [63]:
# fit the data
bow.fit(text_data)

In [64]:
from sklearn.feature_extraction.text import CountVectorizer

# Define the text data
text_data = ['I am interested in NLP', 
             'This is a good tutorial with good topic', 
             'Feature extraction is very important topic']

# Create a CountVectorizer instance
vectorizer = CountVectorizer()

# Fit and transform the documents
X = vectorizer.fit_transform(text_data)

# Get the feature names
feature_names = vectorizer.get_feature_names_out()

print(feature_names)


['am' 'extraction' 'feature' 'good' 'important' 'in' 'interested' 'is'
 'nlp' 'this' 'topic' 'tutorial' 'very' 'with']


In [65]:
# Define the text data
text_data = ['I am interested in NLP', 
             'This is a good tutorial with good topic', 
             'Feature extraction is very important topic']

# Import the necessary libraries
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Create a CountVectorizer instance with English stop words
bow = CountVectorizer(stop_words='english')

# Fit the data (build the vocabulary)
bow.fit(text_data)

# Get the vocabulary list (feature names)
feature_names = bow.get_feature_names_out()

# Transform the text data into a bag-of-words representation
bow_features = bow.transform(text_data)

# Convert to an array and create a DataFrame
bow_features_array = bow_features.toarray()
bow_df = pd.DataFrame(bow_features_array, columns=feature_names)

# Display the DataFrame
print(bow_df)


   extraction  feature  good  important  interested  nlp  topic  tutorial
0           0        0     0          0           1    1      0         0
1           0        0     2          0           0    0      1         1
2           1        1     0          1           0    0      1         0


In [66]:
# get the vocabulary list
feature_names

array(['extraction', 'feature', 'good', 'important', 'interested', 'nlp',
       'topic', 'tutorial'], dtype=object)

In [67]:
bow_features = bow.transform(text_data)
bow_features

<3x8 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [68]:
bow_feature_array = bow_features.toarray()
bow_feature_array

array([[0, 0, 0, 0, 1, 1, 0, 0],
       [0, 0, 2, 0, 0, 0, 1, 1],
       [1, 1, 0, 1, 0, 0, 1, 0]], dtype=int64)

In [69]:
print(feature_names)
for sentence, feature in zip(text_data, bow_feature_array):
    print(sentence)
    print(feature)

['extraction' 'feature' 'good' 'important' 'interested' 'nlp' 'topic'
 'tutorial']
I am interested in NLP
[0 0 0 0 1 1 0 0]
This is a good tutorial with good topic
[0 0 2 0 0 0 1 1]
Feature extraction is very important topic
[1 1 0 1 0 0 1 0]


## TF-IDF (Term Frequency/Inverse Document Frequency)

TF-IDF stands for term frequency-inverse document frequency and it is a measure, used in the fields of information retrieval (IR) and machine learning, that can quantify the importance or relevance of string representations (words, phrases, lemmas, etc)  in a document amongst a collection of documents

In [70]:
text_data = ['I am interested in NLP', 'This is a good tutorial with good topic', 'Feature extraction is very important topic']

In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')

In [72]:
# fit the data
tfidf.fit(text_data)

In [73]:
# get the vocabulary list
tfidf.vocabulary_

{'interested': 4,
 'nlp': 5,
 'good': 2,
 'tutorial': 7,
 'topic': 6,
 'feature': 1,
 'extraction': 0,
 'important': 3}

In [74]:
tfidf_features = tfidf.transform(text_data)
tfidf_features

<3x8 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [75]:
tfidf_feature_array = tfidf_features.toarray()
tfidf_feature_array

array([[0.        , 0.        , 0.        , 0.        , 0.70710678,
        0.70710678, 0.        , 0.        ],
       [0.        , 0.        , 0.84678897, 0.        , 0.        ,
        0.        , 0.32200242, 0.42339448],
       [0.52863461, 0.52863461, 0.        , 0.52863461, 0.        ,
        0.        , 0.40204024, 0.        ]])

In [76]:
for sentence, feature in zip(text_data, tfidf_features):
    print(sentence)
    print(feature)

I am interested in NLP
  (0, 5)	0.7071067811865476
  (0, 4)	0.7071067811865476
This is a good tutorial with good topic
  (0, 7)	0.42339448341195934
  (0, 6)	0.3220024178194947
  (0, 2)	0.8467889668239187
Feature extraction is very important topic
  (0, 6)	0.4020402441612698
  (0, 3)	0.5286346066596935
  (0, 1)	0.5286346066596935
  (0, 0)	0.5286346066596935


## Word2vec

The word2vec algorithm uses a neural network model to learn word associations from a large corpus of text. Once trained, such a model can detect synonymous words or suggest additional words for a partial sentence.

In [77]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

In [78]:
# text data
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [79]:

# Define and train a Word2Vec model
model = Word2Vec(sentences=common_texts, vector_size=100, min_count=1)


In [80]:
# initialize and fit the data
#model = Word2Vec(common_texts, size=100, min_count=1)


from gensim.models import Word2Vec

# Assuming common_texts is a list of tokenized sentences
model = Word2Vec(sentences=common_texts, vector_size=100, min_count=1)


In [81]:
model.wv['graph']

array([-8.6196875e-03,  3.6657380e-03,  5.1898835e-03,  5.7419385e-03,
        7.4669183e-03, -6.1676754e-03,  1.1056137e-03,  6.0472824e-03,
       -2.8400505e-03, -6.1735227e-03, -4.1022300e-04, -8.3689485e-03,
       -5.6000124e-03,  7.1045388e-03,  3.3525396e-03,  7.2256695e-03,
        6.8002474e-03,  7.5307419e-03, -3.7891543e-03, -5.6180597e-04,
        2.3483764e-03, -4.5190323e-03,  8.3887316e-03, -9.8581640e-03,
        6.7646410e-03,  2.9144168e-03, -4.9328315e-03,  4.3981876e-03,
       -1.7395747e-03,  6.7113843e-03,  9.9648498e-03, -4.3624435e-03,
       -5.9933780e-04, -5.6956373e-03,  3.8508223e-03,  2.7866268e-03,
        6.8910765e-03,  6.1010956e-03,  9.5384968e-03,  9.2734173e-03,
        7.8980681e-03, -6.9895042e-03, -9.1558648e-03, -3.5575271e-04,
       -3.0998408e-03,  7.8943167e-03,  5.9385742e-03, -1.5456629e-03,
        1.5109634e-03,  1.7900408e-03,  7.8175711e-03, -9.5101865e-03,
       -2.0553112e-04,  3.4691966e-03, -9.3897223e-04,  8.3817719e-03,
      

In [82]:
model.wv.most_similar('graph')

[('user', 0.06793875247240067),
 ('survey', 0.03364057466387749),
 ('eps', 0.009391162544488907),
 ('human', 0.008315935730934143),
 ('minors', 0.0045030261389911175),
 ('system', -0.010839177295565605),
 ('trees', -0.023671656847000122),
 ('computer', -0.09575343877077103),
 ('time', -0.11410722136497498),
 ('response', -0.11557211726903915)]

## Word Embedding using Glove

GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space

Download link: https://www.kaggle.com/datasets/danielwillgeorge/glove6b100dtxt

In [83]:
import pandas as pd
import string
from nltk.corpus import stopwords
df = pd.read_csv('data/Twitter Sentiments.csv')
# drop the columns
df = df.drop(columns=['id', 'label'], axis=1)

df['clean_text'] = df['tweet'].str.lower()

STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in STOPWORDS])
df['clean_text'] = df['clean_text'].apply(lambda x: remove_stopwords(x))

import re
def remove_spl_chars(text):
    text = re.sub('[^a-zA-Z0-9]', ' ', text)
    text = re.sub('\s+', ' ', text)
    return text
df['clean_text'] = df['clean_text'].apply(lambda x: remove_spl_chars(x))

df.head()

Unnamed: 0,tweet,clean_text
0,@user when a father is dysfunctional and is s...,user father dysfunctional selfish drags kids ...
1,@user @user thanks for #lyft credit i can't us...,user user thanks lyft credit can t use cause ...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model love u take u time ur
4,factsguide: society now #motivation,factsguide society motivation


In [84]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

In [85]:
# tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['clean_text'])

word_index = tokenizer.word_index
vocab_size = len(word_index)
vocab_size

39085

In [86]:
# word_index

In [87]:
max(len(data) for data in df['clean_text'])

131

In [88]:
# padding text data
sequences = tokenizer.texts_to_sequences(df['clean_text'])
padded_seq = pad_sequences(sequences, maxlen=131, padding='post', truncating='post')

In [89]:
padded_seq[0]

array([    1,    28, 15330,  2630,  6365,   184,  7786,   385,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [90]:
# create embedding index
embedding_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs
        

In [91]:
'''
# create embedding index
embedding_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs
        
'''        
import numpy as np

# Define a function to load GloVe word vectors
def load_glove_vectors(file_path):
    embedding_index = {}
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embedding_index[word] = coefs
    return embedding_index

# Load GloVe word vectors
glove_file_path = 'glove.6B.100d.txt'  # Replace with the actual path
embedding_index = load_glove_vectors(glove_file_path)

# Example usage:
word_vector = embedding_index.get('example')  # Get the vector for the word 'example'
print(word_vector)


[-0.12617    0.61724    0.22581    0.39868    0.16111    0.1523
 -0.14715   -0.29447   -0.27348   -0.13753   -0.20898   -0.73436
  0.14144    0.15048    0.09179    0.018613   0.22539    0.15979
 -0.16935    0.42716    0.042284  -0.3477    -0.11413    0.12222
 -0.025027  -0.20805   -0.067264  -0.2956    -0.30807   -0.32903
  0.19059    0.77141   -0.19332   -0.31069    0.26745    0.32231
  0.2065     0.10497    0.49425   -0.38322   -0.12802   -0.069906
 -0.14828    0.085369  -0.18141    0.14688    0.60968   -0.21131
 -0.29148   -0.52773    0.59508    0.017369   0.15342    0.81925
 -0.20643   -2.0378    -0.11884   -0.16826    1.5288     0.15756
 -0.4994     0.39305    0.12672   -0.10968    1.3671    -0.21006
  0.15684    0.0063801  0.43836   -0.18765   -0.29088    0.18619
  0.085402   0.13985    0.40794   -0.14811    0.26702   -0.19142
 -0.6189     0.0091217  0.34971   -0.24079   -0.52476   -0.25071
 -1.5681     0.22101    0.046796  -0.62616   -0.043358  -0.42865
 -0.0057843 -0.22611    0

In [92]:
embedding_index['good']

array([-0.030769 ,  0.11993  ,  0.53909  , -0.43696  , -0.73937  ,
       -0.15345  ,  0.081126 , -0.38559  , -0.68797  , -0.41632  ,
       -0.13183  , -0.24922  ,  0.441    ,  0.085919 ,  0.20871  ,
       -0.063582 ,  0.062228 , -0.051234 , -0.13398  ,  1.1418   ,
        0.036526 ,  0.49029  , -0.24567  , -0.412    ,  0.12349  ,
        0.41336  , -0.48397  , -0.54243  , -0.27787  , -0.26015  ,
       -0.38485  ,  0.78656  ,  0.1023   , -0.20712  ,  0.40751  ,
        0.32026  , -0.51052  ,  0.48362  , -0.0099498, -0.38685  ,
        0.034975 , -0.167    ,  0.4237   , -0.54164  , -0.30323  ,
       -0.36983  ,  0.082836 , -0.52538  , -0.064531 , -1.398    ,
       -0.14873  , -0.35327  , -0.1118   ,  1.0912   ,  0.095864 ,
       -2.8129   ,  0.45238  ,  0.46213  ,  1.6012   , -0.20837  ,
       -0.27377  ,  0.71197  , -1.0754   , -0.046974 ,  0.67479  ,
       -0.065839 ,  0.75824  ,  0.39405  ,  0.15507  , -0.64719  ,
        0.32796  , -0.031748 ,  0.52899  , -0.43886  ,  0.6740

In [93]:
# create embedding matrix
embedding_matrix = np.zeros((vocab_size+1, 100))
for word, i in word_index.items():
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [94]:
embedding_matrix.shape

(39086, 100)

# Named Entity Recognition

In [95]:
 #!pip install -U pip setuptools wheel
# !pip install -U spacy
# !python -m spacy download en_core_web_sm

In [96]:
import spacy
from spacy import displacy

In [97]:
NER = spacy.load('en_core_web_sm')

In [98]:
text = 'Mark Zuckerberg is one of the founders of Facebook, a company from the United States'

In [99]:
ner_text = NER(text)

In [100]:
for word in ner_text.ents:
    print(word.text, word.label_)

Mark Zuckerberg PERSON
one CARDINAL
Facebook ORG
the United States GPE


In [101]:
spacy.explain('GPE')

'Countries, cities, states'

In [102]:
spacy.explain('CARDINAL')

'Numerals that do not fall under another type'

In [103]:
displacy.render(ner_text, style='ent', jupyter=True)