In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.datasets import load_files
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import KeyedVectors
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences 
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SimpleRNN
from keras.layers import Dense, Dropout, Activation
from keras.layers import Conv1D, MaxPooling1D
from keras.utils.np_utils import to_categorical
from keras.models import Model
from tensorflow.keras.layers import InputSpec

# NLTK experiments

In [4]:
text = 'NLP is a subfield of computer science and artificial intelligence. It is concerned with interactions between computers and human (natural) languages. It is used to apply machine learning algorithms to text and speech.'

In [5]:
print(text)

NLP is a subfield of computer science and artificial intelligence. It is concerned with interactions between computers and human (natural) languages. It is used to apply machine learning algorithms to text and speech.


In [6]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
#Sentence tokenization. Splitting the paragraph into sentences
sentences = sent_tokenize(text)
print(sentences)

['NLP is a subfield of computer science and artificial intelligence.', 'It is concerned with interactions between computers and human (natural) languages.', 'It is used to apply machine learning algorithms to text and speech.']


In [8]:
#Word tokenization. Splitting the sentences into words
words = word_tokenize(text)
print(words)

['NLP', 'is', 'a', 'subfield', 'of', 'computer', 'science', 'and', 'artificial', 'intelligence', '.', 'It', 'is', 'concerned', 'with', 'interactions', 'between', 'computers', 'and', 'human', '(', 'natural', ')', 'languages', '.', 'It', 'is', 'used', 'to', 'apply', 'machine', 'learning', 'algorithms', 'to', 'text', 'and', 'speech', '.']


In [9]:
#Sentence wise word tokenization 
tokenized_words = []

for i in range(len(sentences)):
    tokenized_words.append(word_tokenize(sentences[i]))

print(tokenized_words)

[['NLP', 'is', 'a', 'subfield', 'of', 'computer', 'science', 'and', 'artificial', 'intelligence', '.'], ['It', 'is', 'concerned', 'with', 'interactions', 'between', 'computers', 'and', 'human', '(', 'natural', ')', 'languages', '.'], ['It', 'is', 'used', 'to', 'apply', 'machine', 'learning', 'algorithms', 'to', 'text', 'and', 'speech', '.']]


In [10]:
#Stemming
stemmer = PorterStemmer()
stemmed_sent = []

for i in range(len(sentences)):
    words = word_tokenize(sentences[i])
    words = [stemmer.stem(word) for word in words]
    stemmed_sent.append(words) 

print(stemmed_sent)

[['nlp', 'is', 'a', 'subfield', 'of', 'comput', 'scienc', 'and', 'artifici', 'intellig', '.'], ['It', 'is', 'concern', 'with', 'interact', 'between', 'comput', 'and', 'human', '(', 'natur', ')', 'languag', '.'], ['It', 'is', 'use', 'to', 'appli', 'machin', 'learn', 'algorithm', 'to', 'text', 'and', 'speech', '.']]


In [11]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [12]:
#Lemmatization
lemmatizer = WordNetLemmatizer()
lemma_sent = []

for i in range(len(sentences)):
    words = word_tokenize(sentences[i])
    words = [lemmatizer.lemmatize(word) for word in words]
    lemma_sent.append(words)
    
print(lemma_sent)

[['NLP', 'is', 'a', 'subfield', 'of', 'computer', 'science', 'and', 'artificial', 'intelligence', '.'], ['It', 'is', 'concerned', 'with', 'interaction', 'between', 'computer', 'and', 'human', '(', 'natural', ')', 'language', '.'], ['It', 'is', 'used', 'to', 'apply', 'machine', 'learning', 'algorithm', 'to', 'text', 'and', 'speech', '.']]


In [13]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:
#Print stop words in English
stops = stopwords.words("english")
print(stops)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [15]:
#Removing stop words
stop_words_removed = []

for i in range(len(sentences)):
    words = word_tokenize(sentences[i])
    words = [word for word in words if word not in stopwords.words('english')]
    stop_words_removed.append(words)
    
print(stop_words_removed)

[['NLP', 'subfield', 'computer', 'science', 'artificial', 'intelligence', '.'], ['It', 'concerned', 'interactions', 'computers', 'human', '(', 'natural', ')', 'languages', '.'], ['It', 'used', 'apply', 'machine', 'learning', 'algorithms', 'text', 'speech', '.']]


# Text classification
## Emotion classification


In [17]:
#Reading the data
data = pd.read_csv('/content/drive/MyDrive/codemixed.csv')

In [18]:
data.head()

Unnamed: 0,text,label
0,Amar sosur barir iftari toh tai jnhei .,Neutral
1,"Sir , Boos 2 Hit Movie Hobe . EID ar sera Movie",Joy
2,@ RanaSarkar @ KGunedited @ idevadhikari @ su...,Anger
3,Bristy Chuye Tahsan Moutushi Etota Valobashi ...,Joy
4,jeet30 Dada tumi kolkatai back korecho naki a...,Fear


In [19]:
#Preprocessing - removing unwanted characters, tokenization, stop-word removal
def clean_data(tweet):
    x = re.sub("[^a-zA-Z0-9]", " ",tweet) 
    x = x.lower().split()                             
    stops = set(stopwords.words("english"))                  
    words = [w for w in x if not w in stops] 
    return( " ".join(words)) 

In [20]:
data['text'] = data['text'].apply(lambda x: clean_data(x))
print(data['text'])

0                  amar sosur barir iftari toh tai jnhei
1            sir boos 2 hit movie hobe eid ar sera movie
2      ranasarkar kgunedited idevadhikari subhashrees...
3      bristy chuye tahsan moutushi etota valobashi a...
4      jeet30 dada tumi kolkatai back korecho naki ak...
                             ...                        
395    amropalii parshki neha amropalii ame na nach t...
396    yourkoel cockpit er shoot er kichu chobi dekhl...
397    jodio awkwardly dekhano film ota exlude korleo...
398    idevadhikari indraroy jeet30 da apnake 23 tari...
399    arnabbj sohini 6 aarey wah ki kore shunbo bolo...
Name: text, Length: 400, dtype: object


In [21]:
for i in range(len(data)):
    data['text'][i] = data['text'][i].replace('rt ','')

print(data['text'])

0                  amar sosur barir iftari toh tai jnhei
1            sir boos 2 hit movie hobe eid ar sera movie
2      ranasarkar kgunedited idevadhikari subhashrees...
3      bristy chuye tahsan moutushi etota valobashi a...
4      jeet30 dada tumi kolkatai back korecho naki ak...
                             ...                        
395    amropalii parshki neha amropalii ame na nach t...
396    yourkoel cockpit er shoot er kichu chobi dekhl...
397    jodio awkwardly dekhano film ota exlude korleo...
398    idevadhikari indraroy jeet30 da apnake 23 tari...
399    arnabbj sohini 6 aarey wah ki kore shunbo bolo...
Name: text, Length: 400, dtype: object


In [22]:
#Converting the dataframe into list
reviews = data['text'].tolist()
label = data['label'].tolist()

In [23]:
corpus = []
for i in range(len(reviews)):
    corpus.append(word_tokenize(reviews[i]))

In [24]:
corpus

[['amar', 'sosur', 'barir', 'iftari', 'toh', 'tai', 'jnhei'],
 ['sir', 'boos', '2', 'hit', 'movie', 'hobe', 'eid', 'ar', 'sera', 'movie'],
 ['ranasarkar',
  'kgunedited',
  'idevadhikari',
  'subhashreesotwe',
  'aroyfloyd',
  'dev',
  'neyni',
  'mane',
  'apnie',
  'movie',
  'ta',
  'atke',
  'rekhesen'],
 ['bristy',
  'chuye',
  'tahsan',
  'moutushi',
  'etota',
  'valobashi',
  'apurbo',
  'ishika',
  'khan',
  'bang',
  '2017',
  'https',
  'co',
  'jalp0rv8oz',
  'via',
  'youtub'],
 ['jeet30',
  'dada',
  'tumi',
  'kolkatai',
  'back',
  'korecho',
  'naki',
  'akhno',
  'b10',
  'bd',
  'ta',
  'achoo',
  'khub',
  'bhoi',
  'lagche',
  'plz',
  'bd',
  'takhe',
  'kichu',
  'pic',
  'update',
  'share',
  'koro',
  'plzz'],
 ['ranasarkar',
  'kgunedited',
  'idevadhikari',
  'subhashreesotwe',
  'aroyfloyd',
  'ek',
  'nijer',
  'parishromik',
  'nebe',
  'na',
  'bl6e',
  'plus',
  'apn',
  'https',
  'co',
  'ihf71qsbiu'],
 ['ticket', 'links', 'https', 'co', 'v6hgzjtpet',

### Feature extraction using word2vec

In [25]:
#Creating word embedding for the words. Embedding dimension = 100
model = Word2Vec(corpus, size=100, window=5, min_count=1)

In [26]:
#Voabulary list
model.wv.vocab

{'amar': <gensim.models.keyedvectors.Vocab at 0x7f98addf2f50>,
 'sosur': <gensim.models.keyedvectors.Vocab at 0x7f98addf2d10>,
 'barir': <gensim.models.keyedvectors.Vocab at 0x7f98addf2b90>,
 'iftari': <gensim.models.keyedvectors.Vocab at 0x7f98addf2ad0>,
 'toh': <gensim.models.keyedvectors.Vocab at 0x7f98addf2bd0>,
 'tai': <gensim.models.keyedvectors.Vocab at 0x7f98addf2cd0>,
 'jnhei': <gensim.models.keyedvectors.Vocab at 0x7f98addf2990>,
 'sir': <gensim.models.keyedvectors.Vocab at 0x7f98addf2fd0>,
 'boos': <gensim.models.keyedvectors.Vocab at 0x7f98addf2c50>,
 '2': <gensim.models.keyedvectors.Vocab at 0x7f98addf2c10>,
 'hit': <gensim.models.keyedvectors.Vocab at 0x7f98addf2e50>,
 'movie': <gensim.models.keyedvectors.Vocab at 0x7f98addf2f10>,
 'hobe': <gensim.models.keyedvectors.Vocab at 0x7f98ade79690>,
 'eid': <gensim.models.keyedvectors.Vocab at 0x7f98ade79f90>,
 'ar': <gensim.models.keyedvectors.Vocab at 0x7f98ade79dd0>,
 'sera': <gensim.models.keyedvectors.Vocab at 0x7f98ade79f5

In [27]:
#Word embedding of the word "bhalo"
model.wv["bhalo"]

array([-0.00377821,  0.00398831, -0.00034675,  0.004418  ,  0.00024207,
        0.0009351 , -0.00158245,  0.00243745,  0.00385543, -0.00196573,
       -0.00249629, -0.00082093,  0.00026986,  0.00253721, -0.0027127 ,
       -0.00167855,  0.00102565,  0.0031331 ,  0.00252158, -0.00406109,
       -0.00403888, -0.00241389,  0.00085218, -0.00486611, -0.00144275,
       -0.00387883, -0.00144721,  0.00015382, -0.00392323,  0.00296857,
        0.00445768, -0.00467528, -0.00218391,  0.00273682, -0.00012506,
       -0.00494177, -0.00227735, -0.00163192,  0.0009609 ,  0.00292502,
        0.00393708, -0.00546125, -0.00418367,  0.00177047,  0.00452792,
       -0.00010029,  0.00274256, -0.00405453,  0.00395881, -0.00397524,
        0.00315231,  0.0028222 ,  0.00347807,  0.00269855,  0.0014558 ,
        0.00152451,  0.00562543, -0.00442545,  0.00140901,  0.00033029,
        0.0003311 , -0.00446152,  0.00322552,  0.00234826,  0.0025297 ,
        0.00352295, -0.00434191,  0.00017922,  0.00510348, -0.00

In [28]:
#Simialr words of music in the corpus
model.wv.most_similar("bhalo")

[('matir', 0.31512895226478577),
 ('parini', 0.2989867329597473),
 ('gese', 0.29835328459739685),
 ('graphic', 0.2936605215072632),
 ('rated', 0.2899705767631531),
 ('huh', 0.28864219784736633),
 ('ammu', 0.28734999895095825),
 ('jate', 0.2767528295516968),
 ('jacci', 0.26113414764404297),
 ('firbe', 0.2596926689147949)]

In [29]:
model.wv.most_similar(positive=['film'], negative= ['kharap'], topn=5)

[('thako', 0.3644176125526428),
 ('mushfiqur15', 0.3283873200416565),
 ('bujhlam', 0.3169930875301361),
 ('hlw', 0.30735498666763306),
 ('moment', 0.3021945357322693)]

In [30]:
model.wv.doesnt_match(['opoman','koshto','mon','movie'])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'movie'

In [37]:
#Creating the input data
X = np.zeros((len(corpus),100)) #Initializing the X matrix with zeros
for i in range(len(corpus)):
    emb = [model.wv[w] for w in corpus[i]] #Create a list of word embeddings of the words in each sentence
    X[i] = np.mean(emb, axis=0) #Take the mean of the word embeddings of the words in a sentence because length of the sentences varies and the dimension of the features will increase with the increase in the number of words in the sentence

In [39]:
le = preprocessing.LabelEncoder()
Y = le.fit_transform(label)

In [43]:
print(X.shape, Y.shape)

(400, 100) (400,)


In [47]:
#Splitting the data into train data and test data 
X_train,y_train, X_test, y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42)
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(320, 100) (80, 100)
(320,) (80,)


In [None]:
#Printing the size of the train data, train label, test data and test label
print("Shape train data = ",np.shape(X_train))
print("Shape of train label = ",np.shape(y_train))
print("Shape of test data = ",np.shape(X_test))
print("Shape of test label = ",np.shape(y_test))

Shape train data =  (320, 100)
Shape of train label =  (80, 100)
Shape of test data =  (320,)
Shape of test label =  (80,)


In [None]:
#Classification using SVM
clf = GaussianNB() #Initializing the classifier
clf.fit(X_train, y_train) #Train the classifier using X_train and y_train

GaussianNB(priors=None, var_smoothing=1e-09)

In [None]:
#Predict the class label for the test data using the trained model
y_pred = clf.predict(X_test)

In [None]:
#Generating the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[ 1  0  0  3  1  1  0]
 [ 2  3  0  7  2  1  0]
 [ 0  0  0  1  0  0  0]
 [ 0  5  0 16  6  4  1]
 [ 0  2  0  6  0  1  1]
 [ 0  1  0  7  2  0  1]
 [ 0  1  0  3  0  1  0]]


In [None]:
#Computation of the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy = ','%.2f'%(accuracy*100))

Accuracy =  25.00


In [None]:
#K-fold cross-validation
score = cross_val_score(clf, X, Y, cv = 5)



In [None]:
#Performance of the model evaluated using accuracy score. k-fold cross-validation gives k different accuracy score. 
#Hence the accuracy of prediction by the model is taken as the mean of the accuracy score +/- standard deviation of the score
print('Accuracy = ','%.2f'%(np.mean(score)*100),"+/-",'%.4f'%(np.std(score)))

Accuracy =  24.75 +/- 0.0215


### Feature extraction using Doc2vec

In [None]:
#Creating sentence embedding with Doc2vec
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(corpus)]
model = Doc2Vec(documents, size = 100, window=5, min_count=1)



In [None]:
#Create the input matrix using doc2vec embeddings
X = []

for i in range(len(corpus)):
    X.append(model.infer_vector(corpus[i]))

In [None]:
#Splitting the data into train data and test data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [None]:
#Printing the size of the train data, train label, test data and test label
print("Shape train data = ",np.shape(X_train))
print("Shape of train label = ",np.shape(y_train))
print("Shape of test data = ",np.shape(X_test))
print("Shape of test label = ",np.shape(y_test))

Shape train data =  (320, 100)
Shape of train label =  (320,)
Shape of test data =  (80, 100)
Shape of test label =  (80,)


In [None]:
#Classification using SVM
clf = GaussianNB() #Initializing the classifier
clf.fit(X_train, y_train) #Train the classifier using X_train and y_train

GaussianNB(priors=None, var_smoothing=1e-09)

In [None]:
#Predict the class label for the test data using the trained model
y_pred = clf.predict(X_test)

In [None]:
#Generating the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[ 1  0  0  3  1  1  0]
 [ 2  3  0  7  2  1  0]
 [ 0  0  0  1  0  0  0]
 [ 0  5  0 16  6  4  1]
 [ 0  2  0  6  0  1  1]
 [ 0  1  0  7  2  0  1]
 [ 0  1  0  3  0  1  0]]


In [None]:
#Computation of the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy = ','%.2f'%(accuracy*100))

Accuracy =  25.00


In [None]:
#K-fold cross-validation
score = cross_val_score(clf, X, label, cv = 5)



In [None]:
#Performance of the model evaluated using accuracy score. k-fold cross-validation gives k different accuracy score. 
#Hence the accuracy of prediction by the model is taken as the mean of the accuracy score +/- standard deviation of the score
print('Accuracy = ','%.2f'%(np.mean(score)*100),"+/-",'%.4f'%(np.std(score)))

Accuracy =  24.75 +/- 0.0215


### Feature extraction using pretrained models


In [None]:
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

FileNotFoundError: ignored

In [None]:
model.wv.most_similar(positive=['king','woman'], negative= ['man'], topn=1)

## Text classification using deep learning algorithms

In [None]:
data = pd.read_csv('/content/drive/MyDrive/codemixed.csv')

In [None]:
data.head()

Unnamed: 0,text,label
0,Amar sosur barir iftari toh tai jnhei .,Neutral
1,"Sir , Boos 2 Hit Movie Hobe . EID ar sera Movie",Joy
2,@ RanaSarkar @ KGunedited @ idevadhikari @ su...,Anger
3,Bristy Chuye Tahsan Moutushi Etota Valobashi ...,Joy
4,jeet30 Dada tumi kolkatai back korecho naki a...,Fear


In [None]:
#Preprocessing - removing unwanted characters, tokenization, stop-word removal
def clean_data(tweet):
    x = re.sub("[^a-zA-Z0-9]", " ",tweet) 
    x = x.lower().split()                             
    stops = set(stopwords.words("english"))                  
    words = [w for w in x if not w in stops] 
    return( " ".join(words)) 

In [None]:
data['text'] = data['text'].apply(lambda x: clean_data(x))

In [None]:
for i in range(len(data)):
    data['text'][i] = data['text'][i].replace('rt ','')

In [None]:
#Converting the dataframe into list
reviews = data['text'].tolist()
label = data['label'].tolist()

corpus = []
for i in range(len(reviews)):
    corpus.append(word_tokenize(reviews[i]))

In [None]:
#Tokenization
max_features = 1200 #number of words to keep. 1200 is the number of unique words in the corpus.
tokenizer = Tokenizer(nb_words=max_features, split=' ')
tokenizer.fit_on_texts(data['text'].values)



In [None]:
#Unique words and and their count
tokenizer.word_counts

OrderedDict([('amar', 28),
             ('sosur', 1),
             ('barir', 1),
             ('iftari', 1),
             ('toh', 14),
             ('tai', 7),
             ('jnhei', 1),
             ('sir', 4),
             ('boos', 1),
             ('2', 7),
             ('hit', 3),
             ('movie', 22),
             ('hobe', 28),
             ('eid', 16),
             ('ar', 28),
             ('sera', 1),
             ('ranasarkar', 14),
             ('kgunedited', 9),
             ('idevadhikari', 26),
             ('subhashreesotwe', 13),
             ('aroyfloyd', 7),
             ('dev', 9),
             ('neyni', 1),
             ('mane', 5),
             ('apnie', 1),
             ('ta', 53),
             ('atke', 1),
             ('rekhesen', 1),
             ('bristy', 1),
             ('chuye', 1),
             ('tahsan', 1),
             ('moutushi', 1),
             ('etota', 2),
             ('valobashi', 1),
             ('apurbo', 1),
             ('ishika', 1),


In [None]:
#Dictionary index of a word
tokenizer.word_index["fresh"]

525

In [None]:
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X, padding = 'post') #Zero padding at the end of the sequence

In [None]:
Y = to_categorical(Y)
print(Y)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]]


In [None]:
#Splitting the data into train data and test data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

### RNN, LSTM, Stacked-RNN, Stacked-LSTM, Bidirectional-RNN, Bidirectional-LSTM

In [None]:
embed_dim = 500
hidden_layer = 100

model = Sequential()
model.add(Embedding(max_features, embed_dim,input_length = X.shape[1]))
model.add(Dropout(0.2))
model.add(SimpleRNN(hidden_layer))
model.add(Dense(3,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 26, 500)           600000    
                                                                 
 dropout (Dropout)           (None, 26, 500)           0         
                                                                 
 simple_rnn (SimpleRNN)      (None, 100)               60100     
                                                                 
 dense (Dense)               (None, 3)                 303       
                                                                 
Total params: 660,403
Trainable params: 660,403
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
from tensorflow.keras.utils import to_categorical

# make the model and load the training dataset.

y_train = to_categorical(y_train)

# call the fit method.

In [None]:
y_train.shape


(320, 8, 2)

In [None]:
X_train.shape

TensorShape([320, 26])

In [None]:
batch_size = 32
model.fit(X_train, y_train, epochs = 10, batch_size= batch_size)

Epoch 1/10


ValueError: ignored

In [None]:
score = model.evaluate(X_test, y_test, verbose = 1, batch_size = batch_size)
print("Accuracy: %.2f" % (score[1]*100))

In [None]:
test = data['text'][0]
test = clean_data(test)
test = test.replace('rt ','')
test = [test]
test = tokenizer.texts_to_sequences(test)
test = pad_sequences(test, maxlen=24, padding = 'post')

In [None]:
class_label = model.predict_classes(test)
print(le.inverse_transform(class_label))

### CNN-RNN and CNN-LSTM

In [None]:
# Convolution
kernel_size = 5
filters = 64
pool_size = 4

In [None]:
model = Sequential()
model.add(Embedding(max_features, embed_dim,input_length = X.shape[1], dropout=0.2))
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
model.add(MaxPooling1D(pool_size=pool_size))
model.add(LSTM(hidden_layer))
model.add(Dense(3))
model.add(Activation('sigmoid'))

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

In [None]:
batch_size = 32
model.fit(X_train, y_train, epochs = 10, batch_size=batch_size)

In [None]:
score = model.evaluate(X_test, y_test, verbose = 1, batch_size = batch_size)
print("Accuracy: %.2f" % (score[1]*100))

### Using word2vec in Keras 

In [None]:
#Reading the data
data = pd.read_csv('sa_data.csv')

In [None]:
def clean_data(tweet):
    x = re.sub("[^a-zA-Z0-9]", " ",tweet) 
    x = x.lower().split()                             
    stops = set(stopwords.words("english"))                  
    words = [w for w in x if not w in stops] 
    return( " ".join(words)) 

In [None]:
data['text'] = data['text'].apply(lambda x: clean_data(x))

for i in range(len(data)):
    data['text'][i] = data['text'][i].replace('rt ','')

In [None]:
reviews = data['text'].tolist()
sentiment = data['sentiment'].tolist()

corpus = []
for i in range(len(reviews)):
    corpus.append(word_tokenize(reviews[i]))

In [None]:
wv_model = Word2Vec(corpus, size=100, window=5, min_count=1)
w2v_model_wv = wv_model.wv

In [None]:
#Tokenization
max_features = 1200 #number of words to keep. 1200 is the number of unique words in the corpus.
tokenizer = Tokenizer(nb_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X, padding = 'post') #Zero padding at the end of the sequence
word_index = tokenizer.word_index

In [None]:
maxlen = len(X[0])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
print('Shape of X_train and X_test data:', X_train.shape,X_test.shape)
print('Shape of y_train and y_test:', y_train.shape,y_test.shape)

In [None]:
nb_words = len(wv_model.wv.vocab)
embedding_dimension = 100

embedding_matrix = np.zeros((nb_words, embedding_dimension))
for word, i in word_index.items():
    if word in wv_model.wv.vocab:
        embedding_matrix[i] = wv_model.wv[word]

In [None]:
wv_layer = Embedding(nb_words,embedding_dimension,weights=[embedding_matrix], input_length=maxlen,
                     trainable=True)
input_tweet = Input(shape=(maxlen,))
embedded_sequences = wv_layer(input_tweet)
x = SimpleRNN(64, return_sequences=False)(embedded_sequences)
preds = Dense(3, activation='sigmoid')(x)

In [None]:
model = Model(inputs=[input_tweet], outputs=preds)

In [None]:
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

In [None]:
model.fit([X_train], y_train, validation_split=0.1,epochs=10, batch_size=256, shuffle=True)

In [None]:
score = model.evaluate(X_test, y_test, verbose = 2, batch_size = batch_size)
print("Accuracy: %.2f" % (score[1]*100))