In [380]:
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.datasets import load_files
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import KeyedVectors
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences 
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SimpleRNN
from keras.layers import Dense, Dropout, Activation
from keras.layers import Conv1D, MaxPooling1D
from keras.utils.np_utils import to_categorical
from keras.models import Model
from keras.engine import Input

# NLTK experiments

In [2]:
text = 'NLP is a subfield of computer science and artificial intelligence. It is concerned with interactions between computers and human (natural) languages. It is used to apply machine learning algorithms to text and speech.'

In [3]:
print(text)

NLP is a subfield of computer science and artificial intelligence. It is concerned with interactions between computers and human (natural) languages. It is used to apply machine learning algorithms to text and speech.


In [4]:
#Sentence tokenization. Splitting the paragraph into sentences
sentences = sent_tokenize(text)
print(sentences)

['NLP is a subfield of computer science and artificial intelligence.', 'It is concerned with interactions between computers and human (natural) languages.', 'It is used to apply machine learning algorithms to text and speech.']


In [5]:
#Word tokenization. Splitting the sentences into words
words = word_tokenize(text)
print(words)

['NLP', 'is', 'a', 'subfield', 'of', 'computer', 'science', 'and', 'artificial', 'intelligence', '.', 'It', 'is', 'concerned', 'with', 'interactions', 'between', 'computers', 'and', 'human', '(', 'natural', ')', 'languages', '.', 'It', 'is', 'used', 'to', 'apply', 'machine', 'learning', 'algorithms', 'to', 'text', 'and', 'speech', '.']


In [6]:
#Sentence wise word tokenization 
tokenized_words = []

for i in range(len(sentences)):
    tokenized_words.append(word_tokenize(sentences[i]))

print(tokenized_words)

[['NLP', 'is', 'a', 'subfield', 'of', 'computer', 'science', 'and', 'artificial', 'intelligence', '.'], ['It', 'is', 'concerned', 'with', 'interactions', 'between', 'computers', 'and', 'human', '(', 'natural', ')', 'languages', '.'], ['It', 'is', 'used', 'to', 'apply', 'machine', 'learning', 'algorithms', 'to', 'text', 'and', 'speech', '.']]


In [7]:
#Stemming
stemmer = PorterStemmer()
stemmed_sent = []

for i in range(len(sentences)):
    words = word_tokenize(sentences[i])
    words = [stemmer.stem(word) for word in words]
    stemmed_sent.append(words) 

print(stemmed_sent)

[['nlp', 'is', 'a', 'subfield', 'of', 'comput', 'scienc', 'and', 'artifici', 'intellig', '.'], ['It', 'is', 'concern', 'with', 'interact', 'between', 'comput', 'and', 'human', '(', 'natur', ')', 'languag', '.'], ['It', 'is', 'use', 'to', 'appli', 'machin', 'learn', 'algorithm', 'to', 'text', 'and', 'speech', '.']]


In [8]:
#Lemmatization
lemmatizer = WordNetLemmatizer()
lemma_sent = []

for i in range(len(sentences)):
    words = word_tokenize(sentences[i])
    words = [lemmatizer.lemmatize(word) for word in words]
    lemma_sent.append(words)
    
print(lemma_sent)

[['NLP', 'is', 'a', 'subfield', 'of', 'computer', 'science', 'and', 'artificial', 'intelligence', '.'], ['It', 'is', 'concerned', 'with', 'interaction', 'between', 'computer', 'and', 'human', '(', 'natural', ')', 'language', '.'], ['It', 'is', 'used', 'to', 'apply', 'machine', 'learning', 'algorithm', 'to', 'text', 'and', 'speech', '.']]


In [9]:
#Print stop words in English
stops = stopwords.words("english")
print(stops)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'no

In [10]:
#Removing stop words
stop_words_removed = []

for i in range(len(sentences)):
    words = word_tokenize(sentences[i])
    words = [word for word in words if word not in stopwords.words('english')]
    stop_words_removed.append(words)
    
print(stop_words_removed)

[['NLP', 'subfield', 'computer', 'science', 'artificial', 'intelligence', '.'], ['It', 'concerned', 'interactions', 'computers', 'human', '(', 'natural', ')', 'languages', '.'], ['It', 'used', 'apply', 'machine', 'learning', 'algorithms', 'text', 'speech', '.']]


# Text classification
## Sentiment analysis

In [89]:
#Reading the data
data = pd.read_csv('sa_data.csv')

In [90]:
data.head()

Unnamed: 0,text,sentiment
0,RT @NancyLeeGrahn: How did everyone feel about...,Neutral
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,Positive
2,RT @TJMShow: No mention of Tamir Rice and the ...,Neutral
3,RT @RobGeorge: That Carly Fiorina is trending ...,Positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,Positive


In [246]:
#Preprocessing - removing unwanted characters, tokenization, stop-word removal
def clean_data(tweet):
    x = re.sub("[^a-zA-Z0-9]", " ",tweet) 
    x = x.lower().split()                             
    stops = set(stopwords.words("english"))                  
    words = [w for w in x if not w in stops] 
    return( " ".join(words)) 

In [247]:
data['text'] = data['text'].apply(lambda x: clean_data(x))
print(data['text'])

0        nancyleegrahn everyone feel climate change que...
1        scottwalker catch full gopdebate last night sc...
2        tjmshow mention tamir rice gopdebate held clev...
3        robgeorge carly fiorina trending hours debate ...
4        danscavino gopdebate w realdonaldtrump deliver...
5        gregabbott tx tedcruz first day rescind every ...
6        warriorwoman91 liked happy heard going moderat...
7        going msnbc live thomasaroberts around 2 pm et...
8        deer headlights lizzwinstead ben carson may br...
9        nancyosborne180 last night debate proved gopde...
10       jgreendc realdonaldtrump fairness billclinton ...
11       waynedupreeshow woke tweet gopdebate best line...
12       reading family comments great gopdebate http c...
13       arcticfox2016 allenwestrepub dear jebbush gopd...
14       pattonoswalt loved scott walker mark harmon ro...
15       hey chrischristie exploiting tragedy 9 11 poli...
16       carolcnn donaldtrump fire comments women peter.

In [248]:
for i in range(len(data)):
    data['text'][i] = data['text'][i].replace('rt ','')

print(data['text'])

0        nancyleegrahn everyone feel climate change que...
1        scottwalker catch full gopdebate last night sc...
2        tjmshow mention tamir rice gopdebate held clev...
3        robgeorge carly fiorina trending hours debate ...
4        danscavino gopdebate w realdonaldtrump deliver...
5        gregabbott tx tedcruz first day rescind every ...
6        warriorwoman91 liked happy heard going moderat...
7        going msnbc live thomasaroberts around 2 pm et...
8        deer headlights lizzwinstead ben carson may br...
9        nancyosborne180 last night debate proved gopde...
10       jgreendc realdonaldtrump fairness billclinton ...
11       waynedupreeshow woke tweet gopdebate best line...
12       reading family comments great gopdebate http c...
13       arcticfox2016 allenwestrepub dear jebbush gopd...
14       pattonoswalt loved scott walker mark harmon ro...
15       hey chrischristie exploiting tragedy 9 11 poli...
16       carolcnn donaldtrump fire comments women peter.

In [249]:
#Converting the dataframe into list
reviews = data['text'].tolist()
sentiment = data['sentiment'].tolist()

In [250]:
corpus = []
for i in range(len(reviews)):
    corpus.append(word_tokenize(reviews[i]))

In [379]:
corpus

[['nancyleegrahn',
  'everyone',
  'feel',
  'climate',
  'change',
  'question',
  'last',
  'night',
  'exactly',
  'gopdebate'],
 ['scottwalker',
  'catch',
  'full',
  'gopdebate',
  'last',
  'night',
  'scott',
  'best',
  'lines',
  '90',
  'seconds',
  'walker16',
  'http',
  'co',
  'zsff'],
 ['tjmshow',
  'mention',
  'tamir',
  'rice',
  'gopdebate',
  'held',
  'cleveland',
  'wow'],
 ['robgeorge',
  'carly',
  'fiorina',
  'trending',
  'hours',
  'debate',
  'men',
  'completed',
  'gopdebate',
  'says'],
 ['danscavino',
  'gopdebate',
  'w',
  'realdonaldtrump',
  'delivered',
  'highest',
  'ratings',
  'history',
  'presidential',
  'debates',
  'trump2016',
  'http',
  'co'],
 ['gregabbott',
  'tx',
  'tedcruz',
  'first',
  'day',
  'rescind',
  'every',
  'illegal',
  'executive',
  'action',
  'taken',
  'barack',
  'obama',
  'gopdebate',
  'foxnews'],
 ['warriorwoman91',
  'liked',
  'happy',
  'heard',
  'going',
  'moderator',
  'anymore',
  'gopdebate',
  'meg

### Feature extraction using word2vec

In [381]:
#Creating word embedding for the words. Embedding dimension = 100
model = Word2Vec(corpus, size=100, window=5, min_count=1)

In [252]:
#Voabulary list
model.wv.vocab

{'hear': <gensim.models.keyedvectors.Vocab at 0x7f31959757f0>,
 'immigration': <gensim.models.keyedvectors.Vocab at 0x7f3195975c88>,
 '14th': <gensim.models.keyedvectors.Vocab at 0x7f3195975550>,
 'tcpalm': <gensim.models.keyedvectors.Vocab at 0x7f31950a3be0>,
 'newlow': <gensim.models.keyedvectors.Vocab at 0x7f3195975ba8>,
 'dinner': <gensim.models.keyedvectors.Vocab at 0x7f319523e780>,
 'socialmedia': <gensim.models.keyedvectors.Vocab at 0x7f319523e978>,
 'teeth': <gensim.models.keyedvectors.Vocab at 0x7f319523e240>,
 'heels': <gensim.models.keyedvectors.Vocab at 0x7f319523e9b0>,
 'annemarieweers': <gensim.models.keyedvectors.Vocab at 0x7f319523eb38>,
 'newsnation': <gensim.models.keyedvectors.Vocab at 0x7f3195241080>,
 'please': <gensim.models.keyedvectors.Vocab at 0x7f3195241208>,
 'elianabenador': <gensim.models.keyedvectors.Vocab at 0x7f319524af28>,
 'messenger': <gensim.models.keyedvectors.Vocab at 0x7f31952412e8>,
 'sfrnc': <gensim.models.keyedvectors.Vocab at 0x7f3195241320>,


In [253]:
#Word embedding of the word "With"
model.wv["level"]

array([-0.01682623,  0.01904772,  0.09865578, -0.00305726,  0.01429111,
        0.02463139,  0.02655845, -0.03346057, -0.02406447,  0.00259166,
        0.01031249,  0.0039704 ,  0.02156182, -0.05577968, -0.02400477,
        0.0123924 , -0.00462562,  0.03068833,  0.03928994,  0.05355615,
        0.01525774,  0.06060887, -0.06233357,  0.00382755,  0.05642894,
       -0.02394897, -0.07360403, -0.00662334,  0.00678974, -0.05055292,
       -0.02043871,  0.00939632,  0.03730861,  0.02511507,  0.06233795,
        0.06402127,  0.01263601, -0.02919376,  0.06216326,  0.04464545,
        0.02089566,  0.02613247, -0.02415999,  0.03567228, -0.05724327,
        0.00347368,  0.00499592, -0.03593763,  0.06694987, -0.04172412,
        0.01733824,  0.00712488, -0.01386341, -0.03779038,  0.04002213,
        0.0044485 ,  0.04796574,  0.02166961, -0.03609928,  0.03276599,
       -0.02174158, -0.03666539,  0.01866267,  0.02628101,  0.02463651,
        0.05440414,  0.0123358 ,  0.03987351,  0.00765493, -0.01

In [305]:
#Simialr words of music in the corpus
model.wv.most_similar("level")

[('brain', 0.9965094327926636),
 ('scott', 0.99632728099823),
 ('brothers', 0.9959276914596558),
 ('rather', 0.9958280324935913),
 ('idiot', 0.9958206415176392),
 ('willing', 0.9957659244537354),
 ('b', 0.9957613945007324),
 ('twist', 0.9957426190376282),
 ('hates', 0.9957401156425476),
 ('wanted', 0.9956766366958618)]

In [307]:
model.wv.most_similar(positive=['king','woman'], negative= ['man'], topn=5)

[('doubts', 0.9784618616104126),
 ('one', 0.9778905510902405),
 ('girls', 0.9778863191604614),
 ('pants', 0.9751665592193604),
 ('hits', 0.9745514988899231)]

In [308]:
model.wv.doesnt_match(['woman','man','queen','movie'])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'man'

In [255]:
#Creating the input data
X = np.zeros((len(corpus),100)) #Initializing the X matrix with zeros
for i in range(len(corpus)):
    emb = [model.wv[w] for w in corpus[i]] #Create a list of word embeddings of the words in each sentence
    X[i] = np.mean(emb, axis=0) #Take the mean of the word embeddings of the words in a sentence because length of the sentences varies and the dimension of the features will increase with the increase in the number of words in the sentence

In [256]:
le = preprocessing.LabelEncoder()
Y = le.fit_transform(sentiment)

In [257]:
print(X.shape,Y.shape)

(13871, 100) (13871,)


In [258]:
#Splitting the data into train data and test data
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(11096, 100) (9709,)
(2775, 100) (4162,)


In [259]:
#Printing the size of the train data, train label, test data and test label
print("Shape train data = ",np.shape(X_train))
print("Shape of train label = ",np.shape(y_train))
print("Shape of test data = ",np.shape(X_test))
print("Shape of test label = ",np.shape(y_test))

Shape train data =  (11096, 100)
Shape of train label =  (11096,)
Shape of test data =  (2775, 100)
Shape of test label =  (2775,)


In [260]:
#Classification using SVM
clf = GaussianNB() #Initializing the classifier
clf.fit(X_train, y_train) #Train the classifier using X_train and y_train

GaussianNB(priors=None, var_smoothing=1e-09)

In [261]:
#Predict the class label for the test data using the trained model
y_pred = clf.predict(X_test)

In [262]:
#Generating the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[ 414 1094  214]
 [  55  475   82]
 [  38  301  102]]


In [263]:
#Computation of the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy = ','%.2f'%(accuracy*100))

Accuracy =  35.71


In [264]:
#K-fold cross-validation
score = cross_val_score(clf, X, y, cv = 5)

In [265]:
#Performance of the model evaluated using accuracy score. k-fold cross-validation gives k different accuracy score. 
#Hence the accuracy of prediction by the model is taken as the mean of the accuracy score +/- standard deviation of the score
print('Accuracy = ','%.2f'%(np.mean(score)*100),"+/-",'%.4f'%(np.std(score)))

Accuracy =  31.07 +/- 0.0651


### Feature extraction using Doc2vec

In [266]:
#Creating sentence embedding with Doc2vec
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(corpus)]
model = Doc2Vec(documents, size = 100, window=5, min_count=1)

In [267]:
#Create the input matrix using doc2vec embeddings
X = []

for i in range(len(corpus)):
    X.append(model.infer_vector(corpus[i]))

In [268]:
#Splitting the data into train data and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [269]:
#Printing the size of the train data, train label, test data and test label
print("Shape train data = ",np.shape(X_train))
print("Shape of train label = ",np.shape(y_train))
print("Shape of test data = ",np.shape(X_test))
print("Shape of test label = ",np.shape(y_test))

Shape train data =  (11096, 100)
Shape of train label =  (11096,)
Shape of test data =  (2775, 100)
Shape of test label =  (2775,)


In [270]:
#Classification using SVM
clf = GaussianNB() #Initializing the classifier
clf.fit(X_train, y_train) #Train the classifier using X_train and y_train

GaussianNB(priors=None, var_smoothing=1e-09)

In [271]:
#Predict the class label for the test data using the trained model
y_pred = clf.predict(X_test)

In [272]:
#Generating the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[689 473 542]
 [172 255 206]
 [119 113 206]]


In [273]:
#Computation of the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy = ','%.2f'%(accuracy*100))

Accuracy =  41.44


In [274]:
#K-fold cross-validation
score = cross_val_score(clf, X, sentiment, cv = 5)

In [275]:
#Performance of the model evaluated using accuracy score. k-fold cross-validation gives k different accuracy score. 
#Hence the accuracy of prediction by the model is taken as the mean of the accuracy score +/- standard deviation of the score
print('Accuracy = ','%.2f'%(np.mean(score)*100),"+/-",'%.4f'%(np.std(score)))

Accuracy =  36.34 +/- 0.0328


### Feature extraction using pretrained models

In [38]:
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
model.wv.most_similar(positive=['king','woman'], negative= ['man'], topn=1)

## Text classification using deep learning algorithms

In [276]:
data = pd.read_csv('sa_data.csv')

In [277]:
data.head()

Unnamed: 0,text,sentiment
0,RT @NancyLeeGrahn: How did everyone feel about...,Neutral
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,Positive
2,RT @TJMShow: No mention of Tamir Rice and the ...,Neutral
3,RT @RobGeorge: That Carly Fiorina is trending ...,Positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,Positive


In [278]:
#Preprocessing - removing unwanted characters, tokenization, stop-word removal
def clean_data(tweet):
    x = re.sub("[^a-zA-Z0-9]", " ",tweet) 
    x = x.lower().split()                             
    stops = set(stopwords.words("english"))                  
    words = [w for w in x if not w in stops] 
    return( " ".join(words)) 

In [279]:
data['text'] = data['text'].apply(lambda x: clean_data(x))

In [280]:
for i in range(len(data)):
    data['text'][i] = data['text'][i].replace('rt ','')

In [282]:
#Converting the dataframe into list
reviews = data['text'].tolist()
sentiment = data['sentiment'].tolist()

corpus = []
for i in range(len(reviews)):
    corpus.append(word_tokenize(reviews[i]))

In [283]:
#Tokenization
max_features = 1200 #number of words to keep. 1200 is the number of unique words in the corpus.
tokenizer = Tokenizer(nb_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)



In [284]:
#Unique words and and their count
tokenizer.word_counts

OrderedDict([('nancyleegrahn', 6),
             ('everyone', 109),
             ('feel', 62),
             ('climate', 72),
             ('change', 84),
             ('question', 351),
             ('last', 879),
             ('night', 903),
             ('exactly', 22),
             ('gopdebate', 9048),
             ('scottwalker', 94),
             ('catch', 9),
             ('full', 45),
             ('scott', 101),
             ('best', 185),
             ('lines', 14),
             ('90', 12),
             ('seconds', 16),
             ('walker16', 29),
             ('http', 3018),
             ('co', 3633),
             ('zsff', 1),
             ('tjmshow', 1),
             ('mention', 57),
             ('tamir', 2),
             ('rice', 2),
             ('held', 10),
             ('cleveland', 37),
             ('wow', 55),
             ('robgeorge', 1),
             ('carly', 120),
             ('fiorina', 144),
             ('trending', 2),
             ('hours', 34),
       

In [285]:
#Dictionary index of a word
tokenizer.word_index["moderators"]

185

In [286]:
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X, padding = 'post') #Zero padding at the end of the sequence

In [287]:
Y = to_categorical(Y)
print(Y)

[[0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 ...
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]]


In [288]:
#Splitting the data into train data and test data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

### RNN, LSTM, Stacked-RNN, Stacked-LSTM, Bidirectional-RNN, Bidirectional-LSTM

In [289]:
embed_dim = 500
hidden_layer = 100

model = Sequential()
model.add(Embedding(max_features, embed_dim,input_length = X.shape[1], dropout=0.2))
model.add(SimpleRNN(hidden_layer))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 24, 500)           600000    
_________________________________________________________________
simple_rnn_6 (SimpleRNN)     (None, 100)               60100     
_________________________________________________________________
dense_7 (Dense)              (None, 3)                 303       
Total params: 660,403
Trainable params: 660,403
Non-trainable params: 0
_________________________________________________________________
None


  """


In [290]:
batch_size = 32
model.fit(X_train, y_train, epochs = 10, batch_size=batch_size)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f319502d860>

In [291]:
score = model.evaluate(X_test, y_test, verbose = 1, batch_size = batch_size)
print("Accuracy: %.2f" % (score[1]*100))

Accuracy: 62.81


In [292]:
test = data['text'][0]
test = clean_data(test)
test = test.replace('rt ','')
test = [test]
test = tokenizer.texts_to_sequences(test)
test = pad_sequences(test, maxlen=24, padding = 'post')

In [293]:
class_label = model.predict_classes(test)
print(le.inverse_transform(class_label))

['Negative']


### CNN-RNN and CNN-LSTM

In [294]:
# Convolution
kernel_size = 5
filters = 64
pool_size = 4

In [299]:
model = Sequential()
model.add(Embedding(max_features, embed_dim,input_length = X.shape[1], dropout=0.2))
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
model.add(MaxPooling1D(pool_size=pool_size))
model.add(LSTM(hidden_layer))
model.add(Dense(3))
model.add(Activation('sigmoid'))

  


In [300]:
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

In [301]:
batch_size = 32
model.fit(X_train, y_train, epochs = 10, batch_size=batch_size)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f3194fa2908>

In [365]:
score = model.evaluate(X_test, y_test, verbose = 1, batch_size = batch_size)
print("Accuracy: %.2f" % (score[1]*100))

Accuracy: 60.76


### Using word2vec in Keras 

In [309]:
#Reading the data
data = pd.read_csv('sa_data.csv')

In [310]:
def clean_data(tweet):
    x = re.sub("[^a-zA-Z0-9]", " ",tweet) 
    x = x.lower().split()                             
    stops = set(stopwords.words("english"))                  
    words = [w for w in x if not w in stops] 
    return( " ".join(words)) 

In [311]:
data['text'] = data['text'].apply(lambda x: clean_data(x))

for i in range(len(data)):
    data['text'][i] = data['text'][i].replace('rt ','')

In [312]:
reviews = data['text'].tolist()
sentiment = data['sentiment'].tolist()

corpus = []
for i in range(len(reviews)):
    corpus.append(word_tokenize(reviews[i]))

In [317]:
wv_model = Word2Vec(corpus, size=100, window=5, min_count=1)
w2v_model_wv = wv_model.wv

In [356]:
#Tokenization
max_features = 1200 #number of words to keep. 1200 is the number of unique words in the corpus.
tokenizer = Tokenizer(nb_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X, padding = 'post') #Zero padding at the end of the sequence
word_index = tokenizer.word_index



In [358]:
maxlen = len(X[0])

In [359]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
print('Shape of X_train and X_test data:', X_train.shape,X_test.shape)
print('Shape of y_train and y_test:', y_train.shape,y_test.shape)

Shape of X_train and X_test data: (11096, 24) (2775, 24)
Shape of y_train and y_test: (11096, 3) (2775, 3)


In [337]:
nb_words = len(wv_model.wv.vocab)
embedding_dimension = 100

embedding_matrix = np.zeros((nb_words, embedding_dimension))
for word, i in word_index.items():
    if word in wv_model.wv.vocab:
        embedding_matrix[i] = wv_model.wv[word]

In [373]:
wv_layer = Embedding(nb_words,embedding_dimension,weights=[embedding_matrix], input_length=maxlen,
                     trainable=True)
input_tweet = Input(shape=(maxlen,))
embedded_sequences = wv_layer(input_tweet)
x = SimpleRNN(64, return_sequences=False)(embedded_sequences)
preds = Dense(3, activation='sigmoid')(x)

In [374]:
model = Model(inputs=[input_tweet], outputs=preds)

In [375]:
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

In [376]:
model.fit([X_train], y_train, validation_split=0.1,epochs=10, batch_size=256, shuffle=True)

Train on 9986 samples, validate on 1110 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f31945d6cc0>

In [377]:
score = model.evaluate(X_test, y_test, verbose = 2, batch_size = batch_size)
print("Accuracy: %.2f" % (score[1]*100))

Accuracy: 61.05
