In [11]:
import numpy as np
import pandas as pd
import nltk

from nltk.corpus import stopwords # stopwords corpus
from nltk.stem import PorterStemmer # stemmer

from sklearn.feature_extraction.text import CountVectorizer # for BOW
from sklearn.feature_extraction.text import TfidfVectorizer # for TF-IDF
from gensim.models import Word2Vec # for Word2Vec

In [14]:
import re
snow = nltk.stem.SnowballStemmer('english')

In [12]:
cleanr = re.compile('<.*?>') # remove html tag
s = '<heml><body> .!sss.</body></html>'
s = re.sub(cleanr, ' ', s)
print(s)

   .!sss.  


In [13]:
# remove puncuations
s = re.sub(r'[?|!|\'|"|#]',r'',s)
s = re.sub(r'[.|,|)|(|\|/]',r' ',s)
print(s)

    sss   


In [9]:
# remove stopwords and stemming
words = [snow.stem(word) for word in s.split() if word not in stopwords.words('english')] 

   sss  


In [16]:
# test for join
words = ['hello', 'world']
sentence = (' ').join(words)
print(sentence)

hello world


In [22]:
# encoding BOW
corpus = ['hello world', 'i like pizza', 'the future is not our to see']
count_vect = CountVectorizer(max_features=5000)
bow_data = count_vect.fit_transform(corpus)
print(bow_data)

  (0, 1)	1
  (0, 10)	1
  (1, 3)	1
  (1, 6)	1
  (2, 8)	1
  (2, 0)	1
  (2, 2)	1
  (2, 4)	1
  (2, 5)	1
  (2, 9)	1
  (2, 7)	1


In [23]:
# train with Bi-Gram
count_vect = CountVectorizer(ngram_range=(1,2))
Bigram_data = count_vect.fit_transform(corpus)
print(Bigram_data)

  (0, 2)	1
  (0, 18)	1
  (0, 3)	1
  (1, 6)	1
  (1, 12)	1
  (1, 7)	1
  (2, 14)	1
  (2, 0)	1
  (2, 4)	1
  (2, 8)	1
  (2, 10)	1
  (2, 16)	1
  (2, 13)	1
  (2, 15)	1
  (2, 1)	1
  (2, 5)	1
  (2, 9)	1
  (2, 11)	1
  (2, 17)	1


In [25]:
# train tf-idf
tf_idf = TfidfVectorizer(max_features=5000)
tf_data = tf_idf.fit_transform(corpus)
print(tf_data)

  (0, 10)	0.7071067811865476
  (0, 1)	0.7071067811865476
  (1, 6)	0.7071067811865476
  (1, 3)	0.7071067811865476
  (2, 7)	0.37796447300922725
  (2, 9)	0.37796447300922725
  (2, 5)	0.37796447300922725
  (2, 4)	0.37796447300922725
  (2, 2)	0.37796447300922725
  (2, 0)	0.37796447300922725
  (2, 8)	0.37796447300922725


In [26]:
# word2vec
w2v_data = corpus
splitted = []
for row in w2v_data:
    splitted.append([word for word in row.split()]) # splitting words

In [29]:
print(splitted)

[['hello', 'world'], ['i', 'like', 'pizza'], ['the', 'future', 'is', 'not', 'our', 'to', 'see']]


In [32]:
train_w2v = Word2Vec(splitted, min_count = 1, size = 10, workers =4 ) # default min_count = 5

In [33]:
avg_data = []
for row in splitted:
    vec = np.zeros(10)
    count = 0
    for word in row:
        try:
            vec += train_w2v[word]
            count += 1
        except:
            pass
    avg_data.append(vec/count)

  import sys


In [34]:
print(avg_data)

[array([ 0.0049132 ,  0.00257161,  0.00578847, -0.0263087 ,  0.00094173,
        0.00911294, -0.03673651,  0.03076983,  0.01136076,  0.00464733]), array([-0.04138814, -0.00521938,  0.01968945, -0.00489506, -0.00946177,
       -0.01072774, -0.03500661,  0.00763235, -0.02062522,  0.03836296]), array([ 0.00302578,  0.00668292, -0.00654146, -0.00297376,  0.01710409,
        0.00249151, -0.01167992,  0.00896452,  0.0009047 ,  0.00048868])]


                    V = ( t(W1)*w2v(W1) + t(W2)*w2v(W2) +.....+t(Wn)*w2v(Wn))/(t(W1)+t(W2)+....+t(Wn))

In [44]:
# TF-IDF word2vec
tf_w_data = corpus
tf_idf = TfidfVectorizer(max_features=5000)
tf_idf_data = tf_idf.fit_transform(tf_w_data)
print(tf_idf.get_feature_names())

['future', 'hello', 'is', 'like', 'not', 'our', 'pizza', 'see', 'the', 'to', 'world']


In [45]:
tf_w_data = []
tf_idf_data  = tf_idf_data.toarray()
vocab = tf_idf.get_feature_names()
for i, row in zip(range(len(splitted)),splitted):
    vec = [0] * 10
    
    tf_idf_sum = 0
    for val in tf_idf_data[i]:
        if val != 0:
            tf_idf_sum += val
            try:
                vec += (val * train_w2v[vocab[i]])
            except:
                pass
    
    vec = (float)(1 / tf_idf_sum) * vec
    tf_w_data.append(vec)

    
print(tf_w_data)
        

[array([ 0.04207142, -0.00975782, -0.01198703,  0.03148258,  0.01852668,
       -0.0347459 , -0.00663106,  0.0412101 , -0.04673776,  0.0484617 ]), array([ 0.04185987,  0.0347254 ,  0.04314079, -0.02122546,  0.01860863,
        0.03253822, -0.02990762,  0.04165432, -0.01679715, -0.00473316]), array([-0.00444083,  0.03311056,  0.00554849, -0.02299906,  0.04853451,
        0.02387154, -0.00649266, -0.03070207, -0.0243659 , -0.04736215])]


  if sys.path[0] == '':
