# NLP - Sentiment Analysis

In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Disable Warnings
import warnings
warnings.filterwarnings('ignore')

# NLP functionalities and libraries
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Classification evaluation
from sklearn.metrics import (confusion_matrix, accuracy_score, precision_score, 
                             recall_score, f1_score)

# Word Cloud
from wordcloud import WordCloud, STOPWORDS

In [2]:
# Importing the dataset
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', 
                      quoting = 3)

In [3]:
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][0])
review

'Wow    Loved this place '

In [5]:
review = review.lower()
review = review.split()
review

['wow', 'loved', 'this', 'place']

In [6]:
stopword_list=stopwords.words('english')

In [7]:
stopword_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [6]:
ps = PorterStemmer()
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
review

['wow', 'love', 'place']

In [12]:
review = ' '.join(review)
review

'wow love place'

In [9]:
# Cleaning the texts
corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])    #Remove Punctuations
    review = review.lower()                                    #lower case coversion
    review = review.split()                                    #split the words basis of space 
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))] #steming of word and check for stopword
    review = ' '.join(review)           # Agin join the word
    corpus.append(review)

In [10]:
corpus[:5]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price']

## Bag of Words Model

In [11]:
# Creating the Bag of Words model with Count Vectors
cv = CountVectorizer(max_features = 500)

In [12]:
#vocab = cv.vocabulary_
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values

In [13]:
vocab_cv = cv.vocabulary_

In [14]:
len(vocab_cv)

500

In [16]:
X.shape

(1000, 500)

In [17]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [18]:
# Logistic Reegression
from sklearn.linear_model import LogisticRegression
lr_cv = LogisticRegression()
lr_cv.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [19]:
cv_coeffs = lr_cv.coef_
cv_coeffs.shape

(1, 500)

In [20]:
# Predicting the Test set results
y_pred = lr_cv.predict(X_test)

In [21]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,roc_auc_score,roc_curve
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)
roc = roc_auc_score(y_test, y_pred)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.68      0.79      0.73        97
          1       0.77      0.64      0.70       103

avg / total       0.72      0.71      0.71       200



In [22]:
cm

array([[77, 20],
       [37, 66]])

In [23]:
ac

0.715

In [24]:
# Creating the Bag of Words model Tf-Idf
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features = 500)
X = tfidf.fit_transform(corpus).toarray()
vocab_tf = tfidf.vocabulary_
y = dataset.iloc[:, 1].values

In [25]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [26]:
# Fitting Logistic Regression to the Training set
from sklearn.linear_model import LogisticRegression
lr_tf = LogisticRegression()
lr_tf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [27]:
tf_coeffs = lr_tf.coef_

In [28]:
# Predicting the Test set results
y_pred2 = lr_tf.predict(X_test)

In [29]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,roc_auc_score,roc_curve
cm2 = confusion_matrix(y_test, y_pred2)
ac2 = accuracy_score(y_test, y_pred2)
roc2 = roc_auc_score(y_test, y_pred2)
print(classification_report(y_test, y_pred2))

             precision    recall  f1-score   support

          0       0.70      0.84      0.76        97
          1       0.81      0.66      0.73       103

avg / total       0.76      0.74      0.74       200



In [30]:
cm2

array([[81, 16],
       [35, 68]])

In [31]:
ac2

0.745

In [37]:
def get_feat_dict(feat_list):
    feat_dict={}
    idx=0
    for item in feat_list:
        feat_dict[item] = idx
        idx = idx+1
    return feat_dict    

In [38]:
cv_feat_dict = get_feat_dict(cv.get_feature_names())

In [39]:
tf_feat_dict = get_feat_dict(tfidf.get_feature_names())

In [40]:
cv_feat_dict

{'absolut': 0,
 'actual': 1,
 'ago': 2,
 'also': 3,
 'although': 4,
 'alway': 5,
 'amaz': 6,
 'ambianc': 7,
 'ambienc': 8,
 'amount': 9,
 'anoth': 10,
 'anytim': 11,
 'anyway': 12,
 'appet': 13,
 'area': 14,
 'around': 15,
 'arriv': 16,
 'ask': 17,
 'ate': 18,
 'atmospher': 19,
 'attent': 20,
 'authent': 21,
 'averag': 22,
 'avoid': 23,
 'aw': 24,
 'away': 25,
 'awesom': 26,
 'back': 27,
 'bacon': 28,
 'bad': 29,
 'bar': 30,
 'bare': 31,
 'bartend': 32,
 'bathroom': 33,
 'bay': 34,
 'bean': 35,
 'beat': 36,
 'beauti': 37,
 'beef': 38,
 'beer': 39,
 'believ': 40,
 'belli': 41,
 'best': 42,
 'better': 43,
 'big': 44,
 'bill': 45,
 'biscuit': 46,
 'bit': 47,
 'bite': 48,
 'bland': 49,
 'boot': 50,
 'bother': 51,
 'boy': 52,
 'boyfriend': 53,
 'bread': 54,
 'break': 55,
 'breakfast': 56,
 'brick': 57,
 'bring': 58,
 'brought': 59,
 'brunch': 60,
 'buck': 61,
 'buffet': 62,
 'bug': 63,
 'build': 64,
 'burger': 65,
 'busi': 66,
 'butter': 67,
 'bye': 68,
 'cafe': 69,
 'call': 70,
 'came': 71

In [41]:
tf_feat_dict

{'absolut': 0,
 'actual': 1,
 'ago': 2,
 'also': 3,
 'although': 4,
 'alway': 5,
 'amaz': 6,
 'ambianc': 7,
 'ambienc': 8,
 'amount': 9,
 'anoth': 10,
 'anytim': 11,
 'anyway': 12,
 'appet': 13,
 'area': 14,
 'around': 15,
 'arriv': 16,
 'ask': 17,
 'ate': 18,
 'atmospher': 19,
 'attent': 20,
 'authent': 21,
 'averag': 22,
 'avoid': 23,
 'aw': 24,
 'away': 25,
 'awesom': 26,
 'back': 27,
 'bacon': 28,
 'bad': 29,
 'bar': 30,
 'bare': 31,
 'bartend': 32,
 'bathroom': 33,
 'bay': 34,
 'bean': 35,
 'beat': 36,
 'beauti': 37,
 'beef': 38,
 'beer': 39,
 'believ': 40,
 'belli': 41,
 'best': 42,
 'better': 43,
 'big': 44,
 'bill': 45,
 'biscuit': 46,
 'bit': 47,
 'bite': 48,
 'bland': 49,
 'boot': 50,
 'bother': 51,
 'boy': 52,
 'boyfriend': 53,
 'bread': 54,
 'break': 55,
 'breakfast': 56,
 'brick': 57,
 'bring': 58,
 'brought': 59,
 'brunch': 60,
 'buck': 61,
 'buffet': 62,
 'bug': 63,
 'build': 64,
 'burger': 65,
 'busi': 66,
 'butter': 67,
 'bye': 68,
 'cafe': 69,
 'call': 70,
 'came': 71

## Sentiment Analysis

In [42]:
# Contextual Sentiments for Count Vectors
sentiList_cv = []
threshold = 0.5
for word, index in vocab_cv.items():
    weight = cv_coeffs[0][cv_feat_dict[word]]
    if weight > threshold or weight < -threshold:
        sentiList_cv.append((word, weight))

In [43]:
len(sentiList_cv)

146

In [44]:
sentiList_cv

[('wow', 0.618388781667119),
 ('love', 1.7035792004158434),
 ('good', 1.3784250206997748),
 ('nasti', -0.6839629888121632),
 ('recommend', 0.546906673209992),
 ('select', 0.7197242793497878),
 ('menu', 0.5551111520858948),
 ('great', 2.828060416635246),
 ('could', 0.6614087800695668),
 ('would', -0.9543768566819758),
 ('overpr', -0.9639185120759433),
 ('disgust', -0.6203396215498528),
 ('sure', -0.58226925627261),
 ('slow', -1.158824815100142),
 ('let', -0.9262157000018496),
 ('amaz', 1.7224460147619924),
 ('also', 0.5294902282554369),
 ('beauti', 0.729308776140174),
 ('never', -0.7554426671239963),
 ('friendli', 1.5889533800916191),
 ('hour', -0.5702547757830319),
 ('tabl', -0.5165514476901426),
 ('total', -0.5180455777539302),
 ('worst', -1.3869718125441934),
 ('sashimi', -0.5266535779225993),
 ('burger', -0.5008161715491848),
 ('beer', 0.5623014562071114),
 ('look', -1.0971356691960803),
 ('elsewher', -0.5719089440680409),
 ('poor', -1.1323474769033328),
 ('everi', 0.676958886530527

In [45]:
# Contextual Sentiments for TfIdf Vectors
sentiList_tf = []
threshold = 0.5
for word, index in vocab_tf.items():
    weight = tf_coeffs[0][tf_feat_dict[word]]
    if weight > threshold or weight < -threshold:
        sentiList_tf.append((word, weight))

In [46]:
len(sentiList_tf)

129

In [47]:
sentiList_tf

[('wow', 0.5510644035480017),
 ('love', 2.102604222837774),
 ('place', 0.8517245948539903),
 ('good', 1.9047087012272155),
 ('nasti', -0.5668258030678287),
 ('stop', 0.5487022935843582),
 ('select', 0.9332443088646626),
 ('menu', 0.7890070412532237),
 ('great', 3.4211566599135534),
 ('like', -0.5851552802339097),
 ('could', 0.5569165512955999),
 ('would', -1.2302098848691367),
 ('overpr', -0.9637325747382763),
 ('tri', 0.5499426141691383),
 ('disgust', -0.6229307174175439),
 ('slow', -0.986460142638835),
 ('let', -0.6176362475315781),
 ('amaz', 1.907951975096041),
 ('also', 0.7551545431703611),
 ('beauti', 0.6598931079313897),
 ('never', -0.7479683197599607),
 ('friendli', 1.690656886074996),
 ('staff', 0.5215503797301149),
 ('restaur', 0.5316949464666503),
 ('total', -0.5832968589382863),
 ('worst', -1.3619465212016542),
 ('sashimi', -0.5515855082231219),
 ('beer', 0.6917498174292362),
 ('look', -0.8622412339100478),
 ('poor', -1.0393414167200157),
 ('everi', 0.6488824332253675),
 ('f

In [85]:
[word[0] for word in sentiList_tf]

['wow',
 'love',
 'place',
 'good',
 'nasti',
 'stop',
 'select',
 'menu',
 'great',
 'like',
 'could',
 'would',
 'overpr',
 'tri',
 'disgust',
 'slow',
 'let',
 'amaz',
 'also',
 'beauti',
 'never',
 'friendli',
 'staff',
 'restaur',
 'total',
 'worst',
 'sashimi',
 'beer',
 'look',
 'poor',
 'everi',
 'first',
 'delight',
 'suck',
 'tender',
 'establish',
 'hard',
 'gross',
 'eat',
 'sick',
 'dessert',
 'bad',
 'order',
 'insid',
 'nice',
 'enjoy',
 'wonder',
 'imagin',
 'much',
 'tasteless',
 'think',
 'minut',
 'delici',
 'definit',
 'alway',
 'got',
 'way',
 'realli',
 'meh',
 'noth',
 'sweet',
 'buffet',
 'wast',
 'wait',
 'old',
 'bland',
 'meat',
 'everyth',
 'disappoint',
 'best',
 'breakfast',
 'arriv',
 'fantast',
 'town',
 'ambienc',
 'busi',
 'spici',
 'check',
 'atmospher',
 'bit',
 'know',
 'manag',
 'spot',
 'lack',
 'reason',
 'ambianc',
 'return',
 'mediocr',
 'excel',
 'rude',
 'serious',
 'extrem',
 'done',
 'stale',
 'unfortun',
 'impress',
 'avoid',
 'hand',
 'ba

In [48]:
len(set([word[0] for word in sentiList_tf]).intersection(set([word[0] for word in sentiList_cv])))

114

In [49]:
polarized_dict = get_feat_dict(list(set([word[0] for word in sentiList_tf]).intersection(set([word[0] for word in sentiList_cv]))))

In [50]:
# Creating the Bag of Words model with Count Vectors on polarized words
cv_polarized = CountVectorizer(vocabulary = polarized_dict)

In [51]:
X = cv_polarized.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values

In [52]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [53]:
# Fitting Logistic Regression to the Training set
from sklearn.linear_model import LogisticRegression
lr_cv_polarized = LogisticRegression()
lr_cv_polarized.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [54]:
# Predicting the Test set results
y_pred_cv_polarized = lr_cv_polarized.predict(X_test)

In [55]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,roc_auc_score,roc_curve
cm2 = confusion_matrix(y_test, y_pred_cv_polarized)
ac2 = accuracy_score(y_test, y_pred_cv_polarized)
roc2 = roc_auc_score(y_test, y_pred_cv_polarized)
print(classification_report(y_test, y_pred_cv_polarized))

             precision    recall  f1-score   support

          0       0.66      0.87      0.75        97
          1       0.82      0.57      0.67       103

avg / total       0.74      0.71      0.71       200



In [56]:
ac2

0.715

In [57]:
cm2

array([[84, 13],
       [44, 59]])

## Document Classification and Embeddings

**Word Embedding is a representation of text where words that have the same meaning have a similar representation. In other words it represents words in a coordinate system where related words, based on a corpus of relationships, are placed closer together. In the deep learning frameworks such as TensorFlow, Keras, this part is usually handled by an embedding layer which stores a lookup table to map the words represented by numeric indexes to their dense vector representations.**

In [1]:
import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
# Others
import re
import nltk
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

from sklearn.manifold import TSNE

from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup as soup
from nltk.stem.snowball import SnowballStemmer

In [2]:
# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model, Model
from keras.layers import Flatten, Dropout, Activation, Input, Dense, concatenate
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
df = pd.read_csv('bbc-text.csv')
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [4]:
df['category'].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

In [5]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['category'] = le.fit_transform(df['category'])

In [7]:
df.head()

Unnamed: 0,category,text
0,4,tv future in the hands of viewers with home th...
1,0,worldcom boss left books alone former worldc...
2,3,tigers wary of farrell gamble leicester say ...
3,3,yeading face newcastle in fa cup premiership s...
4,1,ocean s twelve raids box office ocean s twelve...


In [8]:
le.classes_

array(['business', 'entertainment', 'politics', 'sport', 'tech'],
      dtype=object)

In [9]:
dummy_y = pd.get_dummies(df['category']).values
dummy_y[:10]

array([[0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0]], dtype=uint8)

In [10]:
dummy_y.shape

(2225, 5)

In [11]:
def clean_text(text):
    
    ## Remove puncuation
    text = text.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    text = text.lower().split()
    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops and len(w) >= 3]
    
    text = " ".join(text)
    ## Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    ## Stemming
    text = text.split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)
    return text

In [12]:
# apply the above function to df['text']
df['text'] = df['text'].map(lambda x: clean_text(x))

In [13]:
df.head()

Unnamed: 0,category,text
0,4,futur hand viewer home theatr system plasma hi...
1,0,worldcom boss left book alon former worldcom b...
2,3,tiger wari farrel gambl leicest say rush make ...
3,3,yead face newcastl cup premiership side newcas...
4,1,ocean twelv raid box offic ocean twelv crime c...


In [14]:
df['text'][0]

'futur hand viewer home theatr system plasma high - definit tvs digit video record move live room way peopl watch radic differ five year time accord expert panel gather annual consum electron show las vega discuss new technolog impact one favourit pastim lead trend programm content deliv viewer via home network cabl satellit telecom compani broadband servic provid front room portabl devic one talk - about technolog ces digit person video record dvr pvr set - top box like tivo sky + system allow peopl record store play paus forward wind programm want essenti technolog allow much personalis tv also built - in high - definit set big busi japan slower take europ lack high - definit program peopl forward wind advert also forget abid network channel schedul put togeth a - la - cart entertain network cabl satellit compani worri mean term advertis revenu well brand ident viewer loyalti channel although lead technolog moment also concern rais europ particular grow uptak servic like sky + happen

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['text'].values, dummy_y, test_size = 0.25, random_state = 101)

In [16]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
vocabulary_size = len(tokenizer.word_index) + 1
vocabulary_size

18158

In [18]:
tokenizer.word_index

{'said': 1,
 'year': 2,
 'would': 3,
 'also': 4,
 'peopl': 5,
 'new': 6,
 'one': 7,
 'game': 8,
 'time': 9,
 'use': 10,
 'say': 11,
 'could': 12,
 'make': 13,
 'last': 14,
 'first': 15,
 'govern': 16,
 'two': 17,
 'like': 18,
 'play': 19,
 'world': 20,
 'film': 21,
 'take': 22,
 'get': 23,
 'work': 24,
 'compani': 25,
 'back': 26,
 'show': 27,
 'firm': 28,
 'best': 29,
 'music': 30,
 'want': 31,
 'win': 32,
 'month': 33,
 'number': 34,
 'told': 35,
 'report': 36,
 'plan': 37,
 'set': 38,
 'servic': 39,
 'come': 40,
 'made': 41,
 'way': 42,
 'countri': 43,
 'market': 44,
 'player': 45,
 'ad': 46,
 'includ': 47,
 '000': 48,
 'need': 49,
 'mani': 50,
 'parti': 51,
 'three': 52,
 'week': 53,
 'elect': 54,
 'next': 55,
 'well': 56,
 'look': 57,
 'labour': 58,
 'minist': 59,
 'bbc': 60,
 'expect': 61,
 'nation': 62,
 'call': 63,
 'sale': 64,
 'day': 65,
 '1': 66,
 'home': 67,
 'think': 68,
 'second': 69,
 'good': 70,
 'technolog': 71,
 'see': 72,
 'help': 73,
 'million': 74,
 'award': 75,
 '

In [19]:
trainsequences = tokenizer.texts_to_sequences(X_train)
print(trainsequences)

[[11864, 1458, 13, 32, 79, 1255, 1458, 81, 170, 189, 124, 66, 263, 233, 438, 2182, 429, 4432, 6086, 1337, 2294, 1072, 502, 104, 34, 7, 55, 19, 1580, 620, 773, 81, 2183, 213, 417, 516, 20, 34, 7, 355, 6087, 5094, 233, 263, 124, 66, 124, 230, 2616, 2037, 664, 5534, 535, 1060, 11865, 2617, 6757, 124, 66, 263, 124, 263, 230, 915, 789, 355, 3731, 4134, 539, 7741, 6758, 124, 66, 124, 230, 1458, 41, 826, 79, 2, 4135, 4432, 727, 178, 233, 230, 1367, 2525, 6759, 32, 55, 17, 8, 69, 32, 136, 178, 2182, 203, 79, 2, 57, 1458, 35, 247, 2, 250, 429, 631, 19, 5095, 207, 727, 56, 6088, 430, 9166, 2038, 631, 226, 9167, 56, 475, 23, 84, 208, 355, 2618, 6760, 444, 70, 481, 7742, 56, 735, 2, 1458, 4, 707, 3168, 26, 160, 646, 888, 302, 2457, 80, 2, 1136, 6761, 31, 961, 53, 754, 1, 1458, 79, 665, 232, 232, 3405, 213, 98, 946, 17, 53, 68, 148, 620, 517, 1773, 15, 38, 88, 513, 81, 332, 5094, 520, 8, 23, 10, 2122, 1, 1176, 98, 4731, 7743, 9, 12, 727, 262, 827, 178, 70, 309, 65, 101, 19, 262, 55, 178, 2037, 1520

In [15]:
X_train.shape

(1668,)

In [20]:
len(trainsequences)

1668

In [21]:
trainsequences[0]

[11864,
 1458,
 13,
 32,
 79,
 1255,
 1458,
 81,
 170,
 189,
 124,
 66,
 263,
 233,
 438,
 2182,
 429,
 4432,
 6086,
 1337,
 2294,
 1072,
 502,
 104,
 34,
 7,
 55,
 19,
 1580,
 620,
 773,
 81,
 2183,
 213,
 417,
 516,
 20,
 34,
 7,
 355,
 6087,
 5094,
 233,
 263,
 124,
 66,
 124,
 230,
 2616,
 2037,
 664,
 5534,
 535,
 1060,
 11865,
 2617,
 6757,
 124,
 66,
 263,
 124,
 263,
 230,
 915,
 789,
 355,
 3731,
 4134,
 539,
 7741,
 6758,
 124,
 66,
 124,
 230,
 1458,
 41,
 826,
 79,
 2,
 4135,
 4432,
 727,
 178,
 233,
 230,
 1367,
 2525,
 6759,
 32,
 55,
 17,
 8,
 69,
 32,
 136,
 178,
 2182,
 203,
 79,
 2,
 57,
 1458,
 35,
 247,
 2,
 250,
 429,
 631,
 19,
 5095,
 207,
 727,
 56,
 6088,
 430,
 9166,
 2038,
 631,
 226,
 9167,
 56,
 475,
 23,
 84,
 208,
 355,
 2618,
 6760,
 444,
 70,
 481,
 7742,
 56,
 735,
 2,
 1458,
 4,
 707,
 3168,
 26,
 160,
 646,
 888,
 302,
 2457,
 80,
 2,
 1136,
 6761,
 31,
 961,
 53,
 754,
 1,
 1458,
 79,
 665,
 232,
 232,
 3405,
 213,
 98,
 946,
 17,
 53,
 68,
 148,
 6

In [22]:
MAXLEN = 250

In [23]:
trainseqs = pad_sequences(trainsequences, maxlen=MAXLEN, padding='post')
print(trainseqs)

[[11864  1458    13 ...     0     0     0]
 [  809    23  9168 ...     0     0     0]
 [  269    76  2774 ...     0     0     0]
 ...
 [  226  3199   573 ...     0     0     0]
 [ 5925 18146  2836 ...     0     0     0]
 [ 2949   306  1193 ...  1624   184   150]]


In [24]:
trainseqs.shape

(1668, 250)

In [25]:
testsequences = tokenizer.texts_to_sequences(X_test)
testseqs = pad_sequences(testsequences, maxlen=MAXLEN, padding='post')

In [26]:
print(testseqs)

[[ 529  252  275 ...    0    0    0]
 [ 141 2463  167 ...  108  525    1]
 [ 957 1324 4982 ...    0    0    0]
 ...
 [  14  273  861 ... 1358 1046  476]
 [ 874  536 1339 ...    0    0    0]
 [5604  323  277 ...  116 2714  488]]


In [27]:
trainseqs.shape

(1668, 250)

In [28]:
y_test.shape

(557, 5)

In [31]:
embedding_size = 32

In [32]:
op_units = df['category'].nunique()

In [33]:
# define the model
model = Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=MAXLEN))
model.add(Flatten())
model.add(Dense(op_units, activation='softmax'))

In [34]:
# compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [35]:
# summarize the model
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 250, 32)           581056    
_________________________________________________________________
flatten_2 (Flatten)          (None, 8000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 40005     
Total params: 621,061
Trainable params: 621,061
Non-trainable params: 0
_________________________________________________________________
None


In [36]:
# fit the model
model.fit(trainseqs, 
          y_train, 
          epochs=5,
          validation_data=(testseqs,y_test),
          verbose=2)

Train on 1668 samples, validate on 557 samples
Epoch 1/5
 - 2s - loss: 1.5057 - acc: 0.3465 - val_loss: 1.4109 - val_acc: 0.4434
Epoch 2/5
 - 1s - loss: 0.9776 - acc: 0.8225 - val_loss: 0.7916 - val_acc: 0.8348
Epoch 3/5
 - 1s - loss: 0.3059 - acc: 0.9790 - val_loss: 0.3183 - val_acc: 0.9605
Epoch 4/5
 - 1s - loss: 0.0872 - acc: 0.9988 - val_loss: 0.2052 - val_acc: 0.9731
Epoch 5/5
 - 1s - loss: 0.0387 - acc: 1.0000 - val_loss: 0.1624 - val_acc: 0.9731


<keras.callbacks.History at 0x7f6cb02e75c0>

In [37]:
# evaluate the model
loss, accuracy = model.evaluate(testseqs, y_test, verbose=2)
print('Loss: %f' % (loss))
print('Accuracy: %f' % (accuracy*100))

Loss: 0.162369
Accuracy: 97.307002


In [44]:
model.layers[0].get_weights()[0].shape

(18158, 32)

In [45]:
# Extract weights from the Embedding Layers
embeddings = model.layers[0].get_weights()[0]

# `embeddings` has a shape of (num_vocab, embedding_dim) 

# `word_to_index` is a mapping (i.e. dict) from words to 
# their index
words_embeddings = {w:embeddings[idx] for w, idx in tokenizer.word_index.items()}

In [46]:
words_embeddings['tiger']

array([ 0.05851227,  0.00641839, -0.02863956, -0.01390276, -0.01745366,
       -0.02456674, -0.0247659 , -0.04967704,  0.01430894,  0.00141364,
       -0.03670051,  0.0137229 , -0.0260178 , -0.05774996, -0.03162028,
       -0.01499818,  0.05789851,  0.04830869, -0.02433261,  0.0626341 ,
        0.05774857,  0.02380782,  0.01222595,  0.04541155,  0.00917102,
        0.03889833, -0.00480174,  0.05583175,  0.00934568,  0.03900613,
       -0.03565377,  0.0149274 ], dtype=float32)

## Pre-trained Embeddings

### Gensim Word2Vec Embeddings

In [47]:
news_corpus = df['text'].values.tolist()

In [47]:
len(df['text'])

2225

In [52]:
news_corpus = []
for i in range(len(df['text'])):
    news = df['text'][i].lower()
    news = news.split()
    news_corpus.append(news)

In [53]:
len(news_corpus)

2225

In [54]:
news_corpus[0]

['futur',
 'hand',
 'viewer',
 'home',
 'theatr',
 'system',
 'plasma',
 'high',
 '-',
 'definit',
 'tvs',
 'digit',
 'video',
 'record',
 'move',
 'live',
 'room',
 'way',
 'peopl',
 'watch',
 'radic',
 'differ',
 'five',
 'year',
 'time',
 'accord',
 'expert',
 'panel',
 'gather',
 'annual',
 'consum',
 'electron',
 'show',
 'las',
 'vega',
 'discuss',
 'new',
 'technolog',
 'impact',
 'one',
 'favourit',
 'pastim',
 'lead',
 'trend',
 'programm',
 'content',
 'deliv',
 'viewer',
 'via',
 'home',
 'network',
 'cabl',
 'satellit',
 'telecom',
 'compani',
 'broadband',
 'servic',
 'provid',
 'front',
 'room',
 'portabl',
 'devic',
 'one',
 'talk',
 '-',
 'about',
 'technolog',
 'ces',
 'digit',
 'person',
 'video',
 'record',
 'dvr',
 'pvr',
 'set',
 '-',
 'top',
 'box',
 'like',
 'tivo',
 'sky',
 '+',
 'system',
 'allow',
 'peopl',
 'record',
 'store',
 'play',
 'paus',
 'forward',
 'wind',
 'programm',
 'want',
 'essenti',
 'technolog',
 'allow',
 'much',
 'personalis',
 'tv',
 'also

In [55]:
# Word2Vec Data processing
from gensim.models import Word2Vec
model_w2v = Word2Vec(sentences=news_corpus, size=embedding_size, 
                window=5, workers=4, min_count=50)

In [56]:
model_w2v.wv.most_similar('futur')

[('worri', 0.9038824439048767),
 ('focus', 0.9015379548072815),
 ('result', 0.8911356925964355),
 ('potenti', 0.8904075622558594),
 ('possibl', 0.8901209831237793),
 ('option', 0.8888853192329407),
 ('problem', 0.8865810632705688),
 ('import', 0.8836463689804077),
 ('integr', 0.8806822299957275),
 ('longer', 0.8795821666717529)]

In [57]:
words = list(model_w2v.wv.vocab)
print("Vocabulary size: %d"% len(words))

Vocabulary size: 1853


In [58]:
filename = 'news_w2v.txt'
model_w2v.wv.save_word2vec_format(filename, binary=False)

### Use pre-trained w2v as Embeddings

In [59]:
import os

embeddings_index_w2v = {}
f = open(os.path.join('','news_w2v.txt'), encoding='utf-8')

for line in f:
    values = line.split()
    word = values[0]
    coeffs = np.asarray(values[1:])
    embeddings_index_w2v[word] = coeffs
f.close()

In [60]:
embeddings_index_w2v['said']

array(['0.6817679', '0.21362534', '0.790396', '-1.4787937', '-0.33805665',
       '-0.7519076', '0.9581482', '-0.577555', '-1.7025225',
       '-0.59295416', '-0.673398', '0.37108353', '-0.011482347',
       '-1.413212', '0.81753874', '-1.1052825', '0.8093292', '0.3042068',
       '-0.6469847', '-0.15914541', '-0.9383532', '0.8101514',
       '-0.26946765', '-1.6034023', '-0.1350425', '-0.2916982',
       '-0.8473212', '-1.2387842', '0.7106425', '0.41588354',
       '-0.28073162', '1.6814944'], dtype='<U12')

In [61]:
# Vectorize
tokenizer = Tokenizer()
tokenizer.fit_on_texts(news_corpus)
seqs = tokenizer.texts_to_sequences(news_corpus)

In [62]:
# Pad sequences
word_index = tokenizer.word_index
print("Found %d unique tokens"% len(word_index))

news_pad = pad_sequences(seqs, maxlen = 50)
news_pad.shape

Found 20660 unique tokens


(2225, 50)

In [63]:
num_words = len(word_index) + 1
embedding_matrix_w2v = np.zeros((num_words, embedding_size))

for word, i in word_index.items():
    if i > num_words:
        continue
    embedding_vector = embeddings_index_w2v.get(word)
    if embedding_vector is not None:
        # words not found in embeddings_index_w2v will be all zeros
        embedding_matrix_w2v[i] = embedding_vector
print(num_words)    

20661


In [64]:
# define the model
model_pt_w2v = Sequential()
embedding_layer_pt_w2v = Embedding(num_words, 
                            embedding_size,
                            embeddings_initializer = Constant(embedding_matrix_w2v),
                            input_length=MAXLEN,
                            trainable=False)
model_pt_w2v.add(embedding_layer_pt_w2v)
model_pt_w2v.add(Flatten())
model_pt_w2v.add(Dense(op_units, activation='softmax'))

In [65]:
# compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [66]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 250, 32)           581056    
_________________________________________________________________
flatten_2 (Flatten)          (None, 8000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 40005     
Total params: 621,061
Trainable params: 621,061
Non-trainable params: 0
_________________________________________________________________
None


In [67]:
# fit the model
model.fit(trainseqs, 
          y_train, 
          batch_size=128, 
          epochs=50, 
          validation_data=(testseqs, y_test), 
          verbose=2)

Train on 1668 samples, validate on 557 samples
Epoch 1/50
 - 1s - loss: 0.0193 - acc: 1.0000 - val_loss: 0.1183 - val_acc: 0.9713
Epoch 2/50
 - 0s - loss: 0.0064 - acc: 1.0000 - val_loss: 0.1010 - val_acc: 0.9749
Epoch 3/50
 - 0s - loss: 0.0031 - acc: 1.0000 - val_loss: 0.0929 - val_acc: 0.9767
Epoch 4/50
 - 0s - loss: 0.0019 - acc: 1.0000 - val_loss: 0.0887 - val_acc: 0.9767
Epoch 5/50
 - 0s - loss: 0.0013 - acc: 1.0000 - val_loss: 0.0869 - val_acc: 0.9767
Epoch 6/50
 - 0s - loss: 0.0010 - acc: 1.0000 - val_loss: 0.0852 - val_acc: 0.9767
Epoch 7/50
 - 0s - loss: 8.1479e-04 - acc: 1.0000 - val_loss: 0.0836 - val_acc: 0.9767
Epoch 8/50
 - 0s - loss: 6.6700e-04 - acc: 1.0000 - val_loss: 0.0827 - val_acc: 0.9767
Epoch 9/50
 - 0s - loss: 5.5949e-04 - acc: 1.0000 - val_loss: 0.0822 - val_acc: 0.9749
Epoch 10/50
 - 0s - loss: 4.7714e-04 - acc: 1.0000 - val_loss: 0.0819 - val_acc: 0.9731
Epoch 11/50
 - 0s - loss: 4.1296e-04 - acc: 1.0000 - val_loss: 0.0811 - val_acc: 0.9731
Epoch 12/50
 - 0s 

<keras.callbacks.History at 0x7f6c5f7f0cf8>