# NLP

In [2]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Disable Warnings
import warnings
warnings.filterwarnings('ignore')

# NLP functionalities and libraries
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Classification evaluation
from sklearn.metrics import (confusion_matrix, accuracy_score, precision_score, 
                             recall_score, f1_score)

# Word Cloud
#from wordcloud import WordCloud, STOPWORDS

#### Importing the dataset

In [3]:

dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', 
                      quoting = 3)

In [4]:
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [5]:
review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][0])
review

'Wow    Loved this place '

In [6]:
review = review.lower()
review = review.split()
review

['wow', 'loved', 'this', 'place']

In [9]:
#nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sachin.gupta\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [10]:
stopword_list=stopwords.words('english')

In [6]:
ps = PorterStemmer()
review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
review

['wow', 'love', 'place']

In [7]:
review = ' '.join(review)
review

'wow love place'

#### Cleaning the texts

In [11]:
corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [13]:
corpus[:5]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price']

## Bag of Words Model

In [14]:
# Creating the Bag of Words model with Count Vectors
cv = CountVectorizer(max_features = 800)

In [15]:
#vocab = cv.vocabulary_
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values

In [16]:
vocab_cv = cv.vocabulary_

In [17]:
vocab_cv

{'wow': 792,
 'love': 406,
 'place': 505,
 'crust': 163,
 'good': 310,
 'tasti': 712,
 'textur': 718,
 'nasti': 449,
 'stop': 681,
 'late': 384,
 'may': 420,
 'rick': 555,
 'steve': 674,
 'recommend': 544,
 'select': 592,
 'menu': 429,
 'great': 313,
 'price': 521,
 'get': 303,
 'want': 768,
 'damn': 168,
 'pho': 499,
 'tast': 710,
 'fresh': 292,
 'potato': 517,
 'like': 394,
 'rubber': 569,
 'could': 152,
 'tell': 714,
 'made': 411,
 'time': 728,
 'kept': 375,
 'fri': 293,
 'touch': 737,
 'servic': 596,
 'would': 791,
 'go': 307,
 'back': 44,
 'cashier': 110,
 'care': 109,
 'ever': 248,
 'say': 583,
 'still': 676,
 'end': 239,
 'overpr': 478,
 'tri': 741,
 'chicken': 120,
 'mmmm': 437,
 'disgust': 212,
 'pretti': 520,
 'sure': 699,
 'human': 354,
 'hair': 324,
 'sign': 605,
 'highli': 339,
 'waitress': 765,
 'littl': 397,
 'slow': 617,
 'worth': 790,
 'let': 391,
 'vega': 756,
 'food': 290,
 'amaz': 9,
 'also': 6,
 'cute': 167,
 'beauti': 59,
 'right': 557,
 'red': 545,
 'cake': 102,


In [21]:
len(vocab_cv)

800

In [22]:
X.shape

(1000, 800)

In [23]:
X[0,:]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [25]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [26]:
# Logistic Reegression
from sklearn.linear_model import LogisticRegression
lr_cv = LogisticRegression()
lr_cv.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [62]:
cv_coeffs = lr_cv.coef_
cv_coeffs.shape

(1, 800)

In [63]:
cv_coeffs[0]

array([-5.94470429e-02, -5.61094221e-02, -5.62442838e-02,  3.22852055e-01,
       -2.61735079e-02, -2.51271324e-01,  4.54712392e-01, -2.92879665e-01,
        4.33586657e-01,  1.76047785e+00,  5.32576886e-01,  4.95837017e-01,
        1.93805910e-01, -3.81831961e-01, -6.74344696e-02,  5.49583345e-01,
       -7.17251921e-01, -2.63598823e-01, -9.49874146e-02,  3.07743962e-01,
        3.87902303e-02, -6.79677621e-02, -6.72201541e-01, -3.41166383e-02,
       -2.04446018e-01,  5.17157985e-01, -2.71785282e-02,  4.51705987e-01,
       -2.70901648e-02, -4.30586463e-01, -4.00190385e-01,  3.41270485e-01,
       -3.06341504e-01,  0.00000000e+00,  3.31668124e-01, -7.54949995e-01,
       -2.02449073e-01, -9.14649206e-01, -5.96800997e-01, -3.95487379e-01,
        1.38342080e+00,  0.00000000e+00, -1.37700508e-02,  4.48970369e-01,
       -2.86663537e-01,  9.76214414e-01, -1.41038345e+00,  2.53709855e-02,
       -1.97457784e-01,  4.61244558e-01, -3.91924840e-01,  1.89494057e-01,
       -3.14817562e-01,  

In [27]:
# Predicting the Test set results
y_pred = lr_cv.predict(X_test)

In [28]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,roc_auc_score,roc_curve
cm = confusion_matrix(y_test, y_pred)
ac = accuracy_score(y_test, y_pred)
roc = roc_auc_score(y_test, y_pred)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.78      0.72        97
           1       0.76      0.64      0.69       103

    accuracy                           0.71       200
   macro avg       0.72      0.71      0.71       200
weighted avg       0.72      0.71      0.71       200



In [29]:
cm

array([[76, 21],
       [37, 66]], dtype=int64)

In [30]:
ac

0.71

In [31]:
# Creating the Bag of Words model Tf-Idf
tfidf = TfidfVectorizer(max_features = 800)
X = tfidf.fit_transform(corpus).toarray()
vocab_tf = tfidf.vocabulary_
y = dataset.iloc[:, 1].values

In [32]:
X[0]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [33]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [34]:
# Fitting Logistic Regression to the Training set
from sklearn.linear_model import LogisticRegression
lr_tf = LogisticRegression()
lr_tf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [35]:
tf_coeffs = lr_tf.coef_

In [36]:
# Predicting the Test set results
y_pred2 = lr_tf.predict(X_test)

In [37]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,roc_auc_score,roc_curve
cm2 = confusion_matrix(y_test, y_pred2)
ac2 = accuracy_score(y_test, y_pred2)
roc2 = roc_auc_score(y_test, y_pred2)
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.69      0.86      0.76        97
           1       0.82      0.64      0.72       103

    accuracy                           0.74       200
   macro avg       0.76      0.75      0.74       200
weighted avg       0.76      0.74      0.74       200



In [38]:
cm2

array([[83, 14],
       [37, 66]], dtype=int64)

In [39]:
ac2

0.745

In [40]:
common_vocab = list(set(list(vocab_cv.keys())).intersection(set(list(vocab_tf.keys()))))

In [41]:
len(common_vocab)

800

In [45]:
def get_feat_dict(feat_list):
    feat_dict={}
    idx=0
    for item in feat_list:
        feat_dict[item] = idx
        idx = idx+1
    return feat_dict    

In [46]:
cv_feat_dict = get_feat_dict(cv.get_feature_names())

In [47]:
tf_feat_dict = get_feat_dict(tfidf.get_feature_names())

In [48]:
vocab_cv

{'wow': 792,
 'love': 406,
 'place': 505,
 'crust': 163,
 'good': 310,
 'tasti': 712,
 'textur': 718,
 'nasti': 449,
 'stop': 681,
 'late': 384,
 'may': 420,
 'rick': 555,
 'steve': 674,
 'recommend': 544,
 'select': 592,
 'menu': 429,
 'great': 313,
 'price': 521,
 'get': 303,
 'want': 768,
 'damn': 168,
 'pho': 499,
 'tast': 710,
 'fresh': 292,
 'potato': 517,
 'like': 394,
 'rubber': 569,
 'could': 152,
 'tell': 714,
 'made': 411,
 'time': 728,
 'kept': 375,
 'fri': 293,
 'touch': 737,
 'servic': 596,
 'would': 791,
 'go': 307,
 'back': 44,
 'cashier': 110,
 'care': 109,
 'ever': 248,
 'say': 583,
 'still': 676,
 'end': 239,
 'overpr': 478,
 'tri': 741,
 'chicken': 120,
 'mmmm': 437,
 'disgust': 212,
 'pretti': 520,
 'sure': 699,
 'human': 354,
 'hair': 324,
 'sign': 605,
 'highli': 339,
 'waitress': 765,
 'littl': 397,
 'slow': 617,
 'worth': 790,
 'let': 391,
 'vega': 756,
 'food': 290,
 'amaz': 9,
 'also': 6,
 'cute': 167,
 'beauti': 59,
 'right': 557,
 'red': 545,
 'cake': 102,


In [49]:
cv_feat_dict

{'absolut': 0,
 'acknowledg': 1,
 'actual': 2,
 'ad': 3,
 'ago': 4,
 'almost': 5,
 'also': 6,
 'although': 7,
 'alway': 8,
 'amaz': 9,
 'ambianc': 10,
 'ambienc': 11,
 'amount': 12,
 'anoth': 13,
 'anyon': 14,
 'anyth': 15,
 'anytim': 16,
 'anyway': 17,
 'apolog': 18,
 'appet': 19,
 'area': 20,
 'around': 21,
 'arriv': 22,
 'articl': 23,
 'ask': 24,
 'assur': 25,
 'ate': 26,
 'atmospher': 27,
 'atroci': 28,
 'attach': 29,
 'attack': 30,
 'attent': 31,
 'attitud': 32,
 'auju': 33,
 'authent': 34,
 'averag': 35,
 'avocado': 36,
 'avoid': 37,
 'aw': 38,
 'away': 39,
 'awesom': 40,
 'awkward': 41,
 'babi': 42,
 'bachi': 43,
 'back': 44,
 'bacon': 45,
 'bad': 46,
 'bagel': 47,
 'bakeri': 48,
 'bar': 49,
 'bare': 50,
 'bartend': 51,
 'basic': 52,
 'batch': 53,
 'bathroom': 54,
 'batter': 55,
 'bay': 56,
 'bean': 57,
 'beat': 58,
 'beauti': 59,
 'becom': 60,
 'beef': 61,
 'beer': 62,
 'behind': 63,
 'believ': 64,
 'belli': 65,
 'best': 66,
 'better': 67,
 'beyond': 68,
 'big': 69,
 'bill': 70

In [50]:
tf_feat_dict

{'absolut': 0,
 'acknowledg': 1,
 'actual': 2,
 'ad': 3,
 'ago': 4,
 'almost': 5,
 'also': 6,
 'although': 7,
 'alway': 8,
 'amaz': 9,
 'ambianc': 10,
 'ambienc': 11,
 'amount': 12,
 'anoth': 13,
 'anyon': 14,
 'anyth': 15,
 'anytim': 16,
 'anyway': 17,
 'apolog': 18,
 'appet': 19,
 'area': 20,
 'around': 21,
 'arriv': 22,
 'articl': 23,
 'ask': 24,
 'assur': 25,
 'ate': 26,
 'atmospher': 27,
 'atroci': 28,
 'attach': 29,
 'attack': 30,
 'attent': 31,
 'attitud': 32,
 'auju': 33,
 'authent': 34,
 'averag': 35,
 'avocado': 36,
 'avoid': 37,
 'aw': 38,
 'away': 39,
 'awesom': 40,
 'awkward': 41,
 'babi': 42,
 'bachi': 43,
 'back': 44,
 'bacon': 45,
 'bad': 46,
 'bagel': 47,
 'bakeri': 48,
 'bar': 49,
 'bare': 50,
 'bartend': 51,
 'basic': 52,
 'batch': 53,
 'bathroom': 54,
 'batter': 55,
 'bay': 56,
 'bean': 57,
 'beat': 58,
 'beauti': 59,
 'becom': 60,
 'beef': 61,
 'beer': 62,
 'behind': 63,
 'believ': 64,
 'belli': 65,
 'best': 66,
 'better': 67,
 'beyond': 68,
 'big': 69,
 'bill': 70

## Sentiment Analysis

In [64]:
cv_coeffs[0]

array([-5.94470429e-02, -5.61094221e-02, -5.62442838e-02,  3.22852055e-01,
       -2.61735079e-02, -2.51271324e-01,  4.54712392e-01, -2.92879665e-01,
        4.33586657e-01,  1.76047785e+00,  5.32576886e-01,  4.95837017e-01,
        1.93805910e-01, -3.81831961e-01, -6.74344696e-02,  5.49583345e-01,
       -7.17251921e-01, -2.63598823e-01, -9.49874146e-02,  3.07743962e-01,
        3.87902303e-02, -6.79677621e-02, -6.72201541e-01, -3.41166383e-02,
       -2.04446018e-01,  5.17157985e-01, -2.71785282e-02,  4.51705987e-01,
       -2.70901648e-02, -4.30586463e-01, -4.00190385e-01,  3.41270485e-01,
       -3.06341504e-01,  0.00000000e+00,  3.31668124e-01, -7.54949995e-01,
       -2.02449073e-01, -9.14649206e-01, -5.96800997e-01, -3.95487379e-01,
        1.38342080e+00,  0.00000000e+00, -1.37700508e-02,  4.48970369e-01,
       -2.86663537e-01,  9.76214414e-01, -1.41038345e+00,  2.53709855e-02,
       -1.97457784e-01,  4.61244558e-01, -3.91924840e-01,  1.89494057e-01,
       -3.14817562e-01,  

In [65]:
# Contextual Sentiments for Count Vectors
sentiList_cv = []
threshold = 0.5
for word, index in vocab_cv.items():
    weight = cv_coeffs[0][cv_feat_dict[word]]
    if weight > threshold or weight < -threshold:
        sentiList_cv.append((word, weight))

In [66]:
len(sentiList_cv)

157

In [67]:
sentiList_cv

[('wow', 0.548481263346505),
 ('love', 1.649664391934063),
 ('good', 1.389451232572293),
 ('nasti', -0.6582501337984253),
 ('recommend', 0.6630012059207112),
 ('select', 0.7283474600937245),
 ('menu', 0.5629671793238409),
 ('great', 2.8120077400425836),
 ('could', 0.6172725656004414),
 ('would', -1.0362964684053497),
 ('overpr', -0.9033141145344573),
 ('disgust', -0.611752241961287),
 ('sure', -0.5336779203406519),
 ('sign', -0.6997996242775838),
 ('slow', -1.146326677015529),
 ('let', -0.9015830186791481),
 ('amaz', 1.7604778455393633),
 ('cute', 0.6630558992412364),
 ('beauti', 0.6367057539222921),
 ('never', -0.8051046309918073),
 ('friendli', 1.5726861147950468),
 ('hour', -0.5642198904695445),
 ('tabl', -0.507090505638842),
 ('total', -0.5170433880784036),
 ('worst', -1.3262905138653105),
 ('burger', -0.5221528762545875),
 ('beer', 0.5512099375985268),
 ('favor', -0.563700604722426),
 ('look', -1.0505633664267473),
 ('elsewher', -0.51495275046101),
 ('inexpens', 0.5291378965008193

In [68]:
# Contextual Sentiments for TfIdf Vectors
sentiList_tf = []
threshold = 0.5
for word, index in vocab_tf.items():
    weight = tf_coeffs[0][tf_feat_dict[word]]
    if weight > threshold or weight < -threshold:
        sentiList_tf.append((word, weight))

In [69]:
len(sentiList_tf)

127

In [70]:
sentiList_tf

[('love', 2.054451159709322),
 ('place', 0.7864700638202643),
 ('good', 1.8965369518517652),
 ('nasti', -0.5579547487663885),
 ('stop', 0.5286926756209503),
 ('select', 0.9315573667778193),
 ('menu', 0.7889585563194407),
 ('great', 3.436207553670508),
 ('like', -0.6035021577277421),
 ('would', -1.2835494793459397),
 ('back', -0.5881386703554209),
 ('overpr', -0.9257982403194037),
 ('tri', 0.5561829338287412),
 ('disgust', -0.6214273246093673),
 ('sign', -0.5519318142953481),
 ('slow', -1.005776474348022),
 ('let', -0.603473232805371),
 ('amaz', 1.927315738369261),
 ('also', 0.6014541240826266),
 ('cute', 0.5322550978097917),
 ('beauti', 0.5774274599582191),
 ('never', -1.0024956570759322),
 ('friendli', 1.6404496421140058),
 ('total', -0.5751667841425459),
 ('worst', -1.3421737499597404),
 ('beer', 0.684181788916334),
 ('look', -0.8372815786738633),
 ('poor', -1.0426791169241307),
 ('everi', 0.629601565875173),
 ('first', 0.9134931566392146),
 ('delight', 0.6605677904947117),
 ('suck',

In [71]:
[word[0] for word in sentiList_tf]

['love',
 'place',
 'good',
 'nasti',
 'stop',
 'select',
 'menu',
 'great',
 'like',
 'would',
 'back',
 'overpr',
 'tri',
 'disgust',
 'sign',
 'slow',
 'let',
 'amaz',
 'also',
 'cute',
 'beauti',
 'never',
 'friendli',
 'total',
 'worst',
 'beer',
 'look',
 'poor',
 'everi',
 'first',
 'delight',
 'suck',
 'tender',
 'establish',
 'hard',
 'gross',
 'eat',
 'sick',
 'dessert',
 'bad',
 'insid',
 'nice',
 'enjoy',
 'wonder',
 'imagin',
 'much',
 'tasteless',
 'think',
 'minut',
 'delici',
 'definit',
 'alway',
 'got',
 'way',
 'realli',
 'meh',
 'noth',
 'sweet',
 'buffet',
 'wast',
 'wait',
 'old',
 'bland',
 'meat',
 'die',
 'everyth',
 'disappoint',
 'best',
 'breakfast',
 'arriv',
 'fantast',
 'town',
 'ambienc',
 'busi',
 'spici',
 'check',
 'bit',
 'know',
 'manag',
 'spot',
 'lack',
 'ambianc',
 'return',
 'mediocr',
 'excel',
 'rude',
 'serious',
 'extrem',
 'done',
 'stale',
 'unfortun',
 'impress',
 'avoid',
 'hand',
 'bacon',
 'sad',
 'zero',
 'probabl',
 'ice',
 'bread',

In [72]:
len(set([word[0] for word in sentiList_tf]).intersection(set([word[0] for word in sentiList_cv])))

114

In [73]:
polarized_dict = get_feat_dict(list(set([word[0] for word in sentiList_tf]).intersection(set([word[0] for word in sentiList_cv]))))

In [74]:
len(polarized_dict)

114

In [75]:
polarized_dict

{'meat': 0,
 'happi': 1,
 'anytim': 2,
 'suck': 3,
 'avoid': 4,
 'locat': 5,
 'busi': 6,
 'breakfast': 7,
 'nicest': 8,
 'disgust': 9,
 'amaz': 10,
 'great': 11,
 'ice': 12,
 'either': 13,
 'sick': 14,
 'zero': 15,
 'sign': 16,
 'stale': 17,
 'look': 18,
 'buffet': 19,
 'eat': 20,
 'defin': 21,
 'enjoy': 22,
 'love': 23,
 'select': 24,
 'poor': 25,
 'rude': 26,
 'aw': 27,
 'cute': 28,
 'bad': 29,
 'town': 30,
 'ambianc': 31,
 'good': 32,
 'minut': 33,
 'check': 34,
 'omg': 35,
 'wonder': 36,
 'disappoint': 37,
 'bit': 38,
 'delight': 39,
 'menu': 40,
 'nasti': 41,
 'much': 42,
 'money': 43,
 'noth': 44,
 'averag': 45,
 'extrem': 46,
 'long': 47,
 'got': 48,
 'lack': 49,
 'way': 50,
 'fun': 51,
 'best': 52,
 'beauti': 53,
 'tasteless': 54,
 'unfortun': 55,
 'think': 56,
 'excel': 57,
 'return': 58,
 'rate': 59,
 'hard': 60,
 'dirti': 61,
 'tender': 62,
 'spot': 63,
 'plu': 64,
 'dessert': 65,
 'done': 66,
 'slow': 67,
 'beer': 68,
 'gross': 69,
 'imagin': 70,
 'insult': 71,
 'fantast': 

In [76]:
# Creating the Bag of Words model with Count Vectors on polarized words
cv_polarized = CountVectorizer(vocabulary = polarized_dict)

In [77]:
X = cv_polarized.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values

In [78]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [79]:
# Fitting Logistic Regression to the Training set
from sklearn.linear_model import LogisticRegression
lr_cv_polarized = LogisticRegression()
lr_cv_polarized.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [80]:
# Predicting the Test set results
y_pred_cv_polarized = lr_cv_polarized.predict(X_test)

In [81]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,roc_auc_score,roc_curve
cm2 = confusion_matrix(y_test, y_pred_cv_polarized)
ac2 = accuracy_score(y_test, y_pred_cv_polarized)
roc2 = roc_auc_score(y_test, y_pred_cv_polarized)
print(classification_report(y_test, y_pred_cv_polarized))

              precision    recall  f1-score   support

           0       0.66      0.90      0.76        97
           1       0.85      0.56      0.68       103

    accuracy                           0.73       200
   macro avg       0.76      0.73      0.72       200
weighted avg       0.76      0.72      0.72       200



In [82]:
ac2

0.725

In [83]:
cm2

array([[87, 10],
       [45, 58]], dtype=int64)

## Document Classification and Embeddings

**Word Embedding is a representation of text where words that have the same meaning have a similar representation. In other words it represents words in a coordinate system where related words, based on a corpus of relationships, are placed closer together. In the deep learning frameworks such as TensorFlow, Keras, this part is usually handled by an embedding layer which stores a lookup table to map the words represented by numeric indexes to their dense vector representations.**

In [3]:
!pip install tensorflow

Collecting tensorflow
  Using cached https://files.pythonhosted.org/packages/34/d5/ce8c17971067c0184c9045112b755be5461d5ce5253ef65a367e1298d7c5/tensorflow-2.1.0-cp37-cp37m-win_amd64.whl
Collecting gast==0.2.2 (from tensorflow)
Collecting absl-py>=0.7.0 (from tensorflow)
Collecting tensorboard<2.2.0,>=2.1.0 (from tensorflow)
  Using cached https://files.pythonhosted.org/packages/d9/41/bbf49b61370e4f4d245d4c6051dfb6db80cec672605c91b1652ac8cc3d38/tensorboard-2.1.1-py3-none-any.whl
Collecting termcolor>=1.1.0 (from tensorflow)
Collecting grpcio>=1.8.6 (from tensorflow)
  Using cached https://files.pythonhosted.org/packages/a7/6d/99aba8db04bf58193ed157dfe7e848494b93dd8aa3f6a4d1edfef318779c/grpcio-1.27.2-cp37-cp37m-win_amd64.whl
Collecting markdown>=2.6.8 (from tensorboard<2.2.0,>=2.1.0->tensorflow)
  Using cached https://files.pythonhosted.org/packages/ab/c4/ba46d44855e6eb1770a12edace5a165a0c6de13349f592b9036257f3c3d3/Markdown-3.2.1-py2.py3-none-any.whl
Collecting google-auth-oauthlib<0.5,>

In [1]:
import gensim
from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
# Others
import re
import nltk
import string
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

from sklearn.manifold import TSNE

from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup as soup
from nltk.stem.snowball import SnowballStemmer

In [2]:
# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model, Model
from keras.utils import plot_model
from keras.layers import Flatten, Dropout, Activation, Input, Dense, concatenate
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

Using TensorFlow backend.
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow_core\python\pywrap_tensorflow.py", line 58, in <module>
    from tensorflow.python.pywrap_tensorflow_internal import *
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow_core\python\pywrap_tensorflow_internal.py", line 28, in <module>
    _pywrap_tensorflow_internal = swig_import_helper()
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow_core\python\pywrap_tensorflow_internal.py", line 24, in swig_import_helper
    _mod = imp.load_module('_pywrap_tensorflow_internal', fp, pathname, description)
  File "C:\ProgramData\Anaconda3\lib\imp.py", line 242, in load_module
    return load_dynamic(name, filename, file)
  File "C:\ProgramData\Anaconda3\lib\imp.py", line 342, in load_dynamic
    return _load(spec)
ImportError: DLL load failed: The specified module could not be found.

During handling of the above exception, another exception occurred:

Traceback (most recent call

  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow_core\python\pywrap_tensorflow_internal.py", line 28, in <module>
    _pywrap_tensorflow_internal = swig_import_helper()
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow_core\python\pywrap_tensorflow_internal.py", line 24, in swig_import_helper
    _mod = imp.load_module('_pywrap_tensorflow_internal', fp, pathname, description)
  File "C:\ProgramData\Anaconda3\lib\imp.py", line 242, in load_module
    return load_dynamic(name, filename, file)
  File "C:\ProgramData\Anaconda3\lib\imp.py", line 342, in load_dynamic
    return _load(spec)
ImportError: DLL load failed: The specified module could not be found.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3325, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-2-ba1988a0e00b>", line

TypeError: can only concatenate str (not "list") to str

In [3]:
df = pd.read_csv('bbc-text.csv')
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [4]:
df['category'].value_counts()

sport            511
business         510
politics         417
tech             401
entertainment    386
Name: category, dtype: int64

In [5]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['category'] = le.fit_transform(df['category'])

In [6]:
df.head()

Unnamed: 0,category,text
0,4,tv future in the hands of viewers with home th...
1,0,worldcom boss left books alone former worldc...
2,3,tigers wary of farrell gamble leicester say ...
3,3,yeading face newcastle in fa cup premiership s...
4,1,ocean s twelve raids box office ocean s twelve...


In [11]:
le.classes_

array(['business', 'entertainment', 'politics', 'sport', 'tech'],
      dtype=object)

In [7]:
dummy_y = pd.get_dummies(df['category']).values
dummy_y[:10]

array([[0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 1, 0, 0, 0]], dtype=uint8)

In [8]:
dummy_y.shape

(2225, 5)

In [9]:
def clean_text(text):
    
    ## Remove puncuation
    text = text.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    text = text.lower().split()
    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops and len(w) >= 3]
    
    text = " ".join(text)
    ## Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    ## Stemming
    text = text.split()
    stemmer = SnowballStemmer('english')
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)
    return text

In [10]:
# apply the above function to df['text']
df['text'] = df['text'].map(lambda x: clean_text(x))

In [12]:
df.head()

Unnamed: 0,category,text
0,4,futur hand viewer home theatr system plasma hi...
1,0,worldcom boss left book alon former worldcom b...
2,3,tiger wari farrel gambl leicest say rush make ...
3,3,yead face newcastl cup premiership side newcas...
4,1,ocean twelv raid box offic ocean twelv crime c...


In [13]:
df['text'][0]

'futur hand viewer home theatr system plasma high - definit tvs digit video record move live room way peopl watch radic differ five year time accord expert panel gather annual consum electron show las vega discuss new technolog impact one favourit pastim lead trend programm content deliv viewer via home network cabl satellit telecom compani broadband servic provid front room portabl devic one talk - about technolog ces digit person video record dvr pvr set - top box like tivo sky + system allow peopl record store play paus forward wind programm want essenti technolog allow much personalis tv also built - in high - definit set big busi japan slower take europ lack high - definit program peopl forward wind advert also forget abid network channel schedul put togeth a - la - cart entertain network cabl satellit compani worri mean term advertis revenu well brand ident viewer loyalti channel although lead technolog moment also concern rais europ particular grow uptak servic like sky + happen

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['text'].values, dummy_y, test_size = 0.25, random_state = 101)

In [15]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
vocabulary_size = len(tokenizer.word_index) + 1
vocabulary_size

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3325, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-15-7ab7cb886988>", line 1, in <module>
    tokenizer = Tokenizer()
NameError: name 'Tokenizer' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2039, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'NameError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow_core\python\pywrap_tensorflow.py", line 58, in <module>
    from tensorflow.python.pywrap_tensorflow_internal import *
  File "C:\ProgramData\Anaconda3\lib\site-packages\ten

NameError: name 'Tokenizer' is not defined

In [16]:
tokenizer.word_index

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3325, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-16-f9979903851c>", line 1, in <module>
    tokenizer.word_index
NameError: name 'tokenizer' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2039, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'NameError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow_core\python\pywrap_tensorflow.py", line 58, in <module>
    from tensorflow.python.pywrap_tensorflow_internal import *
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensor

NameError: name 'tokenizer' is not defined

In [None]:
trainsequences = tokenizer.texts_to_sequences(X_train)
print(trainsequences)

In [None]:
X_train.shape

In [None]:
len(trainsequences)

In [None]:
trainsequences[0]

In [None]:
MAXLEN = 250

In [None]:
trainseqs = pad_sequences(trainsequences, maxlen=MAXLEN, padding='post')
print(trainseqs)

In [110]:
trainseqs.shape

(1668, 250)

In [111]:
testsequences = tokenizer.texts_to_sequences(X_test)
testseqs = pad_sequences(testsequences, maxlen=MAXLEN, padding='post')

In [112]:
print(testseqs)

[[ 529  252  275 ...    0    0    0]
 [ 141 2463  167 ...  108  525    1]
 [ 957 1324 4982 ...    0    0    0]
 ...
 [  14  273  861 ... 1358 1046  476]
 [ 874  536 1339 ...    0    0    0]
 [5604  323  277 ...  116 2714  488]]


In [27]:
trainseqs.shape

(1668, 250)

In [28]:
y_test.shape

(557, 5)

In [114]:
embedding_size = 32

In [115]:
op_units = df['category'].nunique()

In [116]:
# define the model
model = Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=MAXLEN))
model.add(Flatten())
model.add(Dense(op_units, activation='softmax'))

In [117]:
# compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [118]:
# summarize the model
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 250, 32)           581056    
_________________________________________________________________
flatten_1 (Flatten)          (None, 8000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 40005     
Total params: 621,061
Trainable params: 621,061
Non-trainable params: 0
_________________________________________________________________
None


In [120]:
plot_model(model, to_file='Embedding.png')

In [121]:
# fit the model
model.fit(trainseqs, 
          y_train, 
          epochs=5,
          validation_data=(testseqs,y_test),
          verbose=2)

Train on 1668 samples, validate on 557 samples
Epoch 1/5
 - 3s - loss: 1.4989 - acc: 0.3753 - val_loss: 1.3889 - val_acc: 0.4865
Epoch 2/5
 - 1s - loss: 0.9327 - acc: 0.8369 - val_loss: 0.7150 - val_acc: 0.8761
Epoch 3/5
 - 1s - loss: 0.2707 - acc: 0.9832 - val_loss: 0.2899 - val_acc: 0.9677
Epoch 4/5
 - 1s - loss: 0.0800 - acc: 0.9988 - val_loss: 0.1891 - val_acc: 0.9713
Epoch 5/5
 - 1s - loss: 0.0362 - acc: 1.0000 - val_loss: 0.1518 - val_acc: 0.9767


<keras.callbacks.History at 0x7f10911bff28>

In [122]:
# evaluate the model
loss, accuracy = model.evaluate(testseqs, y_test, verbose=2)
print('Loss: %f' % (loss))
print('Accuracy: %f' % (accuracy*100))

Loss: 0.151848
Accuracy: 97.666068


In [129]:
model.layers[0].get_weights()[0].shape

(18158, 32)

In [130]:
# Extract weights from the Embedding Layers
embeddings = model.layers[0].get_weights()[0]

# `embeddings` has a shape of (num_vocab, embedding_dim) 

# `word_to_index` is a mapping (i.e. dict) from words to 
# their index
words_embeddings = {w:embeddings[idx] for w, idx in tokenizer.word_index.items()}

In [46]:
words_embeddings['tiger']

array([ 0.05851227,  0.00641839, -0.02863956, -0.01390276, -0.01745366,
       -0.02456674, -0.0247659 , -0.04967704,  0.01430894,  0.00141364,
       -0.03670051,  0.0137229 , -0.0260178 , -0.05774996, -0.03162028,
       -0.01499818,  0.05789851,  0.04830869, -0.02433261,  0.0626341 ,
        0.05774857,  0.02380782,  0.01222595,  0.04541155,  0.00917102,
        0.03889833, -0.00480174,  0.05583175,  0.00934568,  0.03900613,
       -0.03565377,  0.0149274 ], dtype=float32)

## Pre-trained Embeddings

### Gensim Word2Vec Embeddings

In [135]:
news_corpus = df['text'].values.tolist()

In [136]:
len(df['text'])

2225

In [137]:
news_corpus = []
for i in range(len(df['text'])):
    news = df['text'][i].lower()
    news = news.split()
    news_corpus.append(news)

In [138]:
len(news_corpus)

2225

In [139]:
news_corpus[0]

['futur',
 'hand',
 'viewer',
 'home',
 'theatr',
 'system',
 'plasma',
 'high',
 '-',
 'definit',
 'tvs',
 'digit',
 'video',
 'record',
 'move',
 'live',
 'room',
 'way',
 'peopl',
 'watch',
 'radic',
 'differ',
 'five',
 'year',
 'time',
 'accord',
 'expert',
 'panel',
 'gather',
 'annual',
 'consum',
 'electron',
 'show',
 'las',
 'vega',
 'discuss',
 'new',
 'technolog',
 'impact',
 'one',
 'favourit',
 'pastim',
 'lead',
 'trend',
 'programm',
 'content',
 'deliv',
 'viewer',
 'via',
 'home',
 'network',
 'cabl',
 'satellit',
 'telecom',
 'compani',
 'broadband',
 'servic',
 'provid',
 'front',
 'room',
 'portabl',
 'devic',
 'one',
 'talk',
 '-',
 'about',
 'technolog',
 'ces',
 'digit',
 'person',
 'video',
 'record',
 'dvr',
 'pvr',
 'set',
 '-',
 'top',
 'box',
 'like',
 'tivo',
 'sky',
 '+',
 'system',
 'allow',
 'peopl',
 'record',
 'store',
 'play',
 'paus',
 'forward',
 'wind',
 'programm',
 'want',
 'essenti',
 'technolog',
 'allow',
 'much',
 'personalis',
 'tv',
 'also

In [4]:
# Word2Vec Data processing
from gensim.models import Word2Vec
model_w2v = Word2Vec(sentences=news_corpus, size=embedding_size, 
                window=5, workers=4, min_count=50)

NameError: name 'news_corpus' is not defined

In [142]:
model_w2v.wv.most_similar('futur')

[('problem', 0.8933914303779602),
 ('worri', 0.8912731409072876),
 ('possibl', 0.8881564140319824),
 ('particular', 0.8845880031585693),
 ('awar', 0.8747023940086365),
 ('potenti', 0.8711709976196289),
 ('integr', 0.8678703308105469),
 ('consid', 0.8651309609413147),
 ('longer', 0.8611962795257568),
 ('focus', 0.8564385175704956)]

In [143]:
words = list(model_w2v.wv.vocab)
print("Vocabulary size: %d"% len(words))

Vocabulary size: 1853


In [144]:
filename = 'news_w2v.txt'
model_w2v.wv.save_word2vec_format(filename, binary=False)

### Use pre-trained w2v as Embeddings

In [145]:
import os

embeddings_index_w2v = {}
f = open(os.path.join('','news_w2v.txt'), encoding='utf-8')

for line in f:
    values = line.split()
    word = values[0]
    coeffs = np.asarray(values[1:])
    embeddings_index_w2v[word] = coeffs
f.close()

In [147]:
len(embeddings_index_w2v['futur'])

32

In [148]:
# Vectorize
tokenizer = Tokenizer()
tokenizer.fit_on_texts(news_corpus)
seqs = tokenizer.texts_to_sequences(news_corpus)

In [149]:
# Pad sequences
word_index = tokenizer.word_index
print("Found %d unique tokens"% len(word_index))

news_pad = pad_sequences(seqs, maxlen = 50)
news_pad.shape

Found 20660 unique tokens


(2225, 50)

In [150]:
num_words = len(word_index) + 1
embedding_matrix_w2v = np.zeros((num_words, embedding_size))

for word, i in word_index.items():
    if i > num_words:
        continue
    embedding_vector = embeddings_index_w2v.get(word)
    if embedding_vector is not None:
        # words not found in embeddings_index_w2v will be all zeros
        embedding_matrix_w2v[i] = embedding_vector
print(num_words)    

20661


In [159]:
# define the model
model_pt_w2v = Sequential()
embedding_layer_pt_w2v = Embedding(num_words, 
                            embedding_size,
                            embeddings_initializer = Constant(embedding_matrix_w2v),
                            input_length=MAXLEN,
                            trainable=False)
model_pt_w2v.add(embedding_layer_pt_w2v)
model_pt_w2v.add(Flatten())
model_pt_w2v.add(Dense(op_units, activation='softmax'))

In [160]:
# compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [161]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 250, 32)           581056    
_________________________________________________________________
flatten_1 (Flatten)          (None, 8000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 40005     
Total params: 621,061
Trainable params: 621,061
Non-trainable params: 0
_________________________________________________________________
None


In [162]:
# fit the model
model.fit(trainseqs, 
          y_train, 
          batch_size=128, 
          epochs=50, 
          validation_data=(testseqs, y_test), 
          verbose=2)

Train on 1668 samples, validate on 557 samples
Epoch 1/50
 - 1s - loss: 0.0192 - acc: 1.0000 - val_loss: 0.1153 - val_acc: 0.9785
Epoch 2/50
 - 1s - loss: 0.0067 - acc: 1.0000 - val_loss: 0.0993 - val_acc: 0.9803
Epoch 3/50
 - 0s - loss: 0.0032 - acc: 1.0000 - val_loss: 0.0911 - val_acc: 0.9749
Epoch 4/50
 - 0s - loss: 0.0020 - acc: 1.0000 - val_loss: 0.0878 - val_acc: 0.9767
Epoch 5/50
 - 0s - loss: 0.0014 - acc: 1.0000 - val_loss: 0.0853 - val_acc: 0.9785
Epoch 6/50
 - 1s - loss: 0.0010 - acc: 1.0000 - val_loss: 0.0842 - val_acc: 0.9785
Epoch 7/50
 - 1s - loss: 8.4065e-04 - acc: 1.0000 - val_loss: 0.0833 - val_acc: 0.9785
Epoch 8/50
 - 1s - loss: 6.9079e-04 - acc: 1.0000 - val_loss: 0.0827 - val_acc: 0.9785
Epoch 9/50
 - 0s - loss: 5.8509e-04 - acc: 1.0000 - val_loss: 0.0822 - val_acc: 0.9785
Epoch 10/50
 - 1s - loss: 5.0056e-04 - acc: 1.0000 - val_loss: 0.0818 - val_acc: 0.9785
Epoch 11/50
 - 0s - loss: 4.3485e-04 - acc: 1.0000 - val_loss: 0.0812 - val_acc: 0.9785
Epoch 12/50
 - 1s 

<keras.callbacks.History at 0x7f1049183908>

# Keras Hyper Tunnning

In [1]:
import tensorflow as tf
import kerastuner as kt
print(tf.__version__)
print(kt.__version__)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow_core\python\pywrap_tensorflow.py", line 58, in <module>
    from tensorflow.python.pywrap_tensorflow_internal import *
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow_core\python\pywrap_tensorflow_internal.py", line 28, in <module>
    _pywrap_tensorflow_internal = swig_import_helper()
  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow_core\python\pywrap_tensorflow_internal.py", line 24, in swig_import_helper
    _mod = imp.load_module('_pywrap_tensorflow_internal', fp, pathname, description)
  File "C:\ProgramData\Anaconda3\lib\imp.py", line 242, in load_module
    return load_dynamic(name, filename, file)
  File "C:\ProgramData\Anaconda3\lib\imp.py", line 342, in load_dynamic
    return _load(spec)
ImportError: DLL load failed: The specified module could not be found.

During handling of the above exception, another exception occurred:

Traceback (most recent call

TypeError: can only concatenate str (not "list") to str