In [1]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords


In [5]:
# Fetch the 'punkt' tokenizer package into the local nltk_data directory.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# Fetch the 'stopwords' corpus into the local nltk_data directory.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:

paragraph = """There are not many better people in the paddock to ask about the next generation of talent as Vasseur’s ART team has helped provide a platform for no less than 25 Formula 1 drivers.

Recently, Mercedes boss Toto Wolff spoke at length about how there is a “stigma” attached to the “rich kids” who make their way up to Formula 1, but Vasseur sees another problem relating to the overall youth development structure, aside from costs."""

### 1. Stemming

In [9]:
# stemmer = PorterStemmer()
# paragraph_lowercase = paragraph.lower() # converting everything to lowercase
# sentances = nltk.sent_tokenize(paragraph_lowercase) 
# stem_words = []
# for word in words:
#   if word not in set(stopwords.words('english')): # set helps to take unique words in the english 
#     stem_words.append(stemmer.stem(word))

##### Another way #####
# Split the paragraph into sentences, then replace each sentence with the
# space-joined stems of its tokens, leaving out English stop words.
sentences = nltk.sent_tokenize(paragraph)
stemmer = PorterStemmer()
# Build the stop-word set once; the original rebuilt it for every single token.
english_stopwords = set(stopwords.words('english'))

for i, sentence in enumerate(sentences):
  words = nltk.word_tokenize(sentence)
  # Compare in lowercase so capitalised stop words (e.g. a sentence-initial
  # "There") are filtered out as well.
  words = [stemmer.stem(word) for word in words if word.lower() not in english_stopwords]
  sentences[i] = ' '.join(words)

### 2. Lemmatization

In [10]:
import nltk
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords

In [11]:
# Fetch the 'wordnet' corpus into the local nltk_data directory.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [12]:
# Let's reuse the same paragraph as in the stemming section.
sentances = nltk.sent_tokenize(paragraph)
lemmetizer = WordNetLemmatizer()
# Build the stop-word set once instead of once per token.
english_stopwords = set(stopwords.words('english'))

# Lemmatization: replace each sentence with the space-joined lemmas of its
# tokens, leaving out English stop words.
# The stop-word test is case-sensitive, so capitalised words such as "There"
# are kept.
for i, sentence in enumerate(sentances):
  words = nltk.word_tokenize(sentence)
  words = [lemmetizer.lemmatize(word) for word in words if word not in english_stopwords]
  sentances[i] = ' '.join(words)

In [13]:
# Print each object with its own statement. Writing `print(a), print(b)` builds
# a tuple of the two return values, which the notebook then echoes as
# (None, None).
print(sentances)
print(lemmetizer)

['There many better people paddock ask next generation talent Vasseur ’ ART team helped provide platform le 25 Formula 1 driver .', 'Recently , Mercedes bos Toto Wolff spoke length “ stigma ” attached “ rich kid ” make way Formula 1 , Vasseur see another problem relating overall youth development structure , aside cost .']
<WordNetLemmatizer>


(None, None)

### 3. Bag of words

In [14]:
# Cleaning the texts
import re  # regular expressions
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

ps = PorterStemmer()
wordnet = WordNetLemmatizer()
sentances = nltk.sent_tokenize(paragraph)
# Build the stop-word set once; rebuilding it for every word is wasted work.
english_stopwords = set(stopwords.words('english'))
corpus = []

for sentence in sentances:
  review = re.sub('[^a-zA-Z]', ' ', sentence)  # replace every character that is not an ASCII letter with a space
  review = review.lower()  # lowercase the sentence
  review = review.split()  # split on whitespace to get the individual words
  # To lemmatize instead of stem, replace `ps.stem` with `wordnet.lemmatize`.
  review = [ps.stem(word) for word in review if word not in english_stopwords]
  review = ' '.join(review)  # join the remaining words back into one string
  corpus.append(review)


# Creating the Bag of Words model: one row per sentence, one column per term.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()

In [15]:
print(corpus)

['mani better peopl paddock ask next gener talent vasseur art team help provid platform less formula driver', 'recent merced boss toto wolff spoke length stigma attach rich kid make way formula vasseur see anoth problem relat overal youth develop structur asid cost']


In [16]:
import numpy as np

In [17]:
# X was produced with .toarray(), so it is already a NumPy array; display it
# directly instead of copying it through np.array().
X

array([[0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
        1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0],
       [1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1]])

### 4. TF (Term Frequency) and IDF (Inverse Document Frequency)

In [18]:
import nltk 
# Make sure the NLTK resources are available for this section. The log below
# reports all three packages as already up-to-date, so when the earlier
# download cells have run these calls fetch nothing new.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
paragraph = """Envision this: there is a technology currently undergoing testing that,
 when released to the public, will become a long-awaited revolution in energy. 
 This new technology promises to be safer and more efficient than anything we have on the market now. 
 It will affect that which we consider mundane — power tools, toys, laptops, smartphones — and that which we consider exceptional — medical devices, 
 spacecraft, and the innovative new vehicle designs needed to wean us off of fossil fuels. 
 We have known about this technology for centuries, yet until now we have only been able to take small steps towards its creation. 
 Billions of dollars are pouring into research and billions more will be made once the technology has been perfected and released.
 This description may sound a lot like that of fusion power.
 Yet it’s actually referring to the upcoming innovations in the realm of battery technology — specifically that of solid-state batteries. 
 And while both fusion power and solid-state batteries have been labeled technologies of the future but never of today, advancements and investments in solid-state materials have increased tremendously over the years. 
 Today not only are there many major companies and credible researchers involved, it seems we may finally start seeing these batteries released in just the next few years.
 What can we expect once this elusive, transformative technology is finally ready for mass production?"""

# Cleaning the texts
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

ps = PorterStemmer()
wordnet = WordNetLemmatizer()
sentences = nltk.sent_tokenize(paragraph)
# Build the stop-word set once; the original rebuilt it for every word.
english_stopwords = set(stopwords.words('english'))
corpus = []

for sentence in sentences:
  review = re.sub('[^a-zA-Z]', ' ', sentence)  # keep ASCII letters only
  review = review.lower()
  review = review.split()
  # Lemmatize every word that is not an English stop word.
  review = [wordnet.lemmatize(word) for word in review if word not in english_stopwords]
  review = ' '.join(review)
  corpus.append(review)


# Creating TF-IDF model: one row per sentence, one column per term.
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer()
X = cv.fit_transform(corpus).toarray()


In [20]:
print(corpus)
print(X)

['envision technology currently undergoing testing released public become long awaited revolution energy', 'new technology promise safer efficient anything market', 'affect consider mundane power tool toy laptop smartphones consider exceptional medical device spacecraft innovative new vehicle design needed wean u fossil fuel', 'known technology century yet able take small step towards creation', 'billion dollar pouring research billion made technology perfected released', 'description may sound lot like fusion power', 'yet actually referring upcoming innovation realm battery technology specifically solid state battery', 'fusion power solid state battery labeled technology future never today advancement investment solid state material increased tremendously year', 'today many major company credible researcher involved seems may finally start seeing battery released next year', 'expect elusive transformative technology finally ready mass production']
[[0.         0.         0.         0.

### 5. Word2Vec

In [21]:
import nltk 

# Make sure the NLTK resources are available for this section. The log below
# reports all three packages as already up-to-date, so when the earlier
# download cells have run these calls fetch nothing new.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [23]:
from gensim.models import Word2Vec
from nltk.corpus import stopwords
import re

paragraph = """Bitcoin is making every investment/asset in history look stupid.
  I own Bitcoin. I got involved early on. People either love me or hate me for this somewhat crazy technological choice. 
  They sometimes write misinformed articles about my belief in the future of money, 
  while simultaneously closing their mind to new ideas. They think I care about Bitcoin opinions. 
  I don’t. My only goal in life is to make people think. 
  Getting people to agree with my point of view is the stuff of mind control. 
  I can’t brainwash you. Sorry.
  People think the price of each coin hitting $100,000 USD is the end game. 
  While the rise in value is nice for us early adopters of Bitcoin, it completely misses the point."""

# Preprocessing the data: blank out bracketed numeric markers such as "[12]",
# lowercase the text, blank out digits, and collapse every run of whitespace
# into a single space.
without_markers = re.sub(r'\[[0-9]*\]', ' ', paragraph)
lowered = re.sub(r'\s+', ' ', without_markers).lower()
text = re.sub(r'\s+', ' ', re.sub(r'\d', ' ', lowered))


In [24]:
# Preparing the dataset: one list of word tokens per sentence, with English
# stop words removed.
# Load the stop words once into a set. The original reloaded the list and
# scanned it linearly for every single token.
english_stopwords = set(stopwords.words('english'))
sentences = [
  [word for word in nltk.word_tokenize(sentence) if word not in english_stopwords]
  for sentence in nltk.sent_tokenize(text)
]

In [25]:
# Train using Word2Vec model
model = Word2Vec(sentences, min_count=1)

# Vocabulary lookup keyed by word. gensim 4 replaced `wv.vocab` with
# `wv.key_to_index` (accessing `vocab` raises there), so prefer the new
# attribute and fall back to `vocab` on gensim 3.x.
words = model.wv.key_to_index if hasattr(model.wv, 'key_to_index') else model.wv.vocab

In [26]:
words

{'$': <gensim.models.keyedvectors.Vocab at 0x7f284fa56310>,
 ',': <gensim.models.keyedvectors.Vocab at 0x7f284fa43810>,
 '.': <gensim.models.keyedvectors.Vocab at 0x7f284fa43d90>,
 'adopters': <gensim.models.keyedvectors.Vocab at 0x7f284fa562d0>,
 'agree': <gensim.models.keyedvectors.Vocab at 0x7f284fa43a50>,
 'articles': <gensim.models.keyedvectors.Vocab at 0x7f284fa43190>,
 'belief': <gensim.models.keyedvectors.Vocab at 0x7f284fa43c50>,
 'bitcoin': <gensim.models.keyedvectors.Vocab at 0x7f284fa43410>,
 'brainwash': <gensim.models.keyedvectors.Vocab at 0x7f284fa56650>,
 'care': <gensim.models.keyedvectors.Vocab at 0x7f284fa43ed0>,
 'choice': <gensim.models.keyedvectors.Vocab at 0x7f284fa43550>,
 'closing': <gensim.models.keyedvectors.Vocab at 0x7f284fa43990>,
 'coin': <gensim.models.keyedvectors.Vocab at 0x7f284fa568d0>,
 'completely': <gensim.models.keyedvectors.Vocab at 0x7f284fa56090>,
 'control': <gensim.models.keyedvectors.Vocab at 0x7f284fa56cd0>,
 'crazy': <gensim.models.keyedv

In [29]:
# Finding word vectors
vector = model.wv['view']
vector

array([ 7.6725602e-04, -1.7696236e-03,  3.9815530e-03,  1.0404565e-03,
       -1.3128838e-04, -1.6689724e-03, -1.5948603e-03, -7.3631236e-04,
        3.8388439e-03,  2.6152490e-04, -2.4074361e-04,  2.6793587e-03,
       -4.5152074e-03,  3.6151374e-03,  1.1038407e-03, -2.7412008e-03,
        6.4331474e-04, -3.3890754e-03,  3.2397637e-03, -1.5141587e-03,
       -3.3721393e-03, -4.2415187e-03,  1.3548544e-03,  1.9184416e-05,
        1.8588571e-03,  5.9485174e-04,  7.4373093e-04, -2.8879836e-03,
        7.2319788e-05, -1.3552633e-03, -1.0380957e-03, -1.5804833e-03,
        4.7145197e-03,  2.5191777e-03, -1.6820604e-03,  1.6405924e-03,
        4.2212917e-03, -1.3754317e-03,  3.8334485e-03,  2.3435978e-03,
        2.3956527e-03,  2.6366513e-03,  2.0661934e-03, -2.1207624e-03,
        1.5358522e-03,  3.2304873e-05,  8.9689792e-04,  5.8031327e-04,
        4.8245424e-03,  4.6290499e-03,  9.9466136e-04, -4.5451424e-03,
       -4.9547297e-03,  9.4313070e-04,  2.8637843e-03,  4.1948929e-03,
      

In [30]:
# Most similar words
similar = model.wv.most_similar('view')
similar

[('love', 0.18416070938110352),
 ('’', 0.1742231845855713),
 ('us', 0.12811179459095),
 ('history', 0.12286540120840073),
 ('closing', 0.09380356222391129),
 ('crazy', 0.0891285091638565),
 ('stuff', 0.08174648880958557),
 ('early', 0.0796428918838501),
 ('coin', 0.07124675810337067),
 ('future', 0.06775816529989243)]

### 6. Word Embedding Techniques using the Embedding Layer in Keras

In [31]:
# Import tensorflow
from tensorflow.keras.preprocessing.text import one_hot

In [32]:
sent = ['It usually rains every day here',
        'It smells very delicious in the kitchen',
        'We generally sing songs all together',
        'We go to a gallery every Sunday',
        'Does he write an email?']

In [33]:
sent

['It usually rains every day here',
 'It smells very delicious in the kitchen',
 'We generally sing songs all together',
 'We go to a gallery every Sunday',
 'Does he write an email?']

In [34]:
# define vocabulary size
voc_size = 1000

**One Hot Representation**

In [36]:
# Encode every sentence as a list of integer word ids, passing voc_size as the
# size of the id space.
# NOTE(review): the Keras docs say one_hot relies on Python's hash(), which may
# not be stable across sessions. Confirm if reproducible ids matter.
onehot_rep = []
for sentence in sent:
  onehot_rep.append(one_hot(sentence, voc_size))
print(onehot_rep)

[[882, 837, 837, 855, 229, 880], [882, 294, 461, 800, 63, 155, 150], [371, 667, 200, 357, 159, 535], [371, 620, 690, 284, 829, 855, 780], [442, 644, 862, 635, 600]]


**Word Embedding Representation**

In [38]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [39]:
import numpy as np

In [40]:
# Bring every encoded sentence to the same length by padding at the front
# ('pre'); the printed result shows zeros as the padding value.
sent_length = 8
embedding_docs = pad_sequences(
  onehot_rep,
  maxlen=sent_length,
  padding='pre',
)
print(embedding_docs)

[[  0   0 882 837 837 855 229 880]
 [  0 882 294 461 800  63 155 150]
 [  0   0 371 667 200 357 159 535]
 [  0 371 620 690 284 829 855 780]
 [  0   0   0 442 644 862 635 600]]


In [41]:
dim=10

In [42]:
# Single-layer model: an Embedding that maps each of the voc_size word ids to
# a dim-sized vector for sequences of sent_length ids.
# Note: this rebinds `model`, which held the Word2Vec model in the previous
# section.
model = Sequential([
  Embedding(voc_size, dim, input_length=sent_length),
])
model.compile(optimizer='adam', loss='mse')

In [43]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 8, 10)             10000     
Total params: 10,000
Trainable params: 10,000
Non-trainable params: 0
_________________________________________________________________


In [44]:
print(model.predict(embedding_docs))

[[[-3.26945186e-02 -1.91906691e-02 -1.02996826e-03 -1.68036222e-02
    3.81129645e-02 -1.44845732e-02 -4.78689335e-02 -1.81671754e-02
   -4.93158028e-03 -8.42235982e-04]
  [-3.26945186e-02 -1.91906691e-02 -1.02996826e-03 -1.68036222e-02
    3.81129645e-02 -1.44845732e-02 -4.78689335e-02 -1.81671754e-02
   -4.93158028e-03 -8.42235982e-04]
  [ 2.80674584e-02 -4.30134684e-03 -3.55893373e-02  4.36229222e-02
   -2.91627403e-02 -4.56208698e-02  2.39551552e-02 -1.82641745e-02
   -2.39156839e-02  2.26258300e-02]
  [ 4.99832630e-03  1.06609240e-02 -4.08761017e-02  4.57189195e-02
    2.13344134e-02  1.04671232e-02 -6.85144216e-04 -4.49803360e-02
    1.58023834e-03  2.81788819e-02]
  [ 4.99832630e-03  1.06609240e-02 -4.08761017e-02  4.57189195e-02
    2.13344134e-02  1.04671232e-02 -6.85144216e-04 -4.49803360e-02
    1.58023834e-03  2.81788819e-02]
  [ 2.49967314e-02  6.15186617e-03 -2.18088552e-03 -3.94382365e-02
    1.08745098e-02 -4.36380282e-02 -7.83944130e-03  9.84154642e-04
    2.80908234e-

In [46]:
embedding_docs[0]

array([  0,   0, 882, 837, 837, 855, 229, 880], dtype=int32)

In [47]:
print(model.predict(embedding_docs)[0])

[[-0.03269452 -0.01919067 -0.00102997 -0.01680362  0.03811296 -0.01448457
  -0.04786893 -0.01816718 -0.00493158 -0.00084224]
 [-0.03269452 -0.01919067 -0.00102997 -0.01680362  0.03811296 -0.01448457
  -0.04786893 -0.01816718 -0.00493158 -0.00084224]
 [ 0.02806746 -0.00430135 -0.03558934  0.04362292 -0.02916274 -0.04562087
   0.02395516 -0.01826417 -0.02391568  0.02262583]
 [ 0.00499833  0.01066092 -0.0408761   0.04571892  0.02133441  0.01046712
  -0.00068514 -0.04498034  0.00158024  0.02817888]
 [ 0.00499833  0.01066092 -0.0408761   0.04571892  0.02133441  0.01046712
  -0.00068514 -0.04498034  0.00158024  0.02817888]
 [ 0.02499673  0.00615187 -0.00218089 -0.03943824  0.01087451 -0.04363803
  -0.00783944  0.00098415  0.02809082 -0.02227134]
 [ 0.00076833 -0.00239173  0.04930682  0.00814048  0.04574451 -0.04352764
   0.03316158  0.04123307  0.04786963 -0.02085506]
 [-0.04765438  0.00360823  0.02750287  0.01657918  0.00733574 -0.02113495
   0.01260174 -0.04451722 -0.00753672 -0.01353322]]