In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import xticks
from nltk.corpus import stopwords
import nltk
import re
from nltk.stem import WordNetLemmatizer
import string
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import defaultdict
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
import tensorflow as tf
from sklearn.metrics import f1_score
from wordcloud import WordCloud,STOPWORDS
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from keras.preprocessing.sequence import pad_sequences
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense,Flatten,Embedding,Activation,Dropout
from keras.layers import Conv1D,MaxPooling1D,GlobalMaxPooling1D,LSTM
from keras.layers import Bidirectional

In [None]:
# Read only the first 20,000 rows of the CSV (nrows=20000) into `train`.
train= pd.read_csv('../input/bitcoin-tweets-14m/cleanprep.csv',nrows=20000)
# Show (n_rows, n_columns) of the loaded frame.
train.shape

In [None]:
# Name the five columns, then keep only the tweet text and its sentiment label.
# NOTE: this cell is not idempotent -- re-running it after the drop fails
# because `train` then has two columns, not five.
train.columns = ['date', 'name', 'text', 'sentiment', 'polarity']
train = train.drop(columns=['date', 'name', 'polarity'])
train.head()

In [None]:
# Keep NLTK's English stopword list in a variable
# (requires the NLTK "stopwords" corpus to be available locally).
stop = list(stopwords.words("english"))
print(stop)

In [None]:
# Keep every character of string.punctuation as a list of one-character strings.
punctuation = list(string.punctuation)
print(punctuation)

In [None]:
# Shared WordNet lemmatizer instance; preprocess() uses it to reduce words to their lemma.
lemma = WordNetLemmatizer()

In [None]:
# Combined list of stopwords and punctuation characters; preprocess() drops
# every token found in it.
sw_pun = stop + punctuation

In [None]:
# function to preprocess the messages
def preprocess(tweet):
    """Normalise one raw tweet into a space-separated string of lemmas.

    Steps, in order: remove t.co short links, replace every non-word
    character with a space, lowercase, drop tokens found in the module-level
    ``sw_pun`` list, lemmatise the remaining tokens as verbs with the
    module-level ``lemma`` object, and keep only lemmas longer than three
    characters.

    Args:
        tweet: Raw tweet text. Any non-string value (for example the NaN that
            pandas uses for an empty cell) is treated as an empty tweet
            instead of raising inside ``re.sub``.

    Returns:
        str: The kept lemmas, each followed by one space (so a non-empty
        result ends with a trailing space), or "" when nothing is kept.
    """
    if not isinstance(tweet, str):
        return ""

    # Raw strings avoid the invalid "\w" escape of a plain literal, and the
    # dot in "t.co" is escaped so it only matches a literal dot.
    tweet = re.sub(r"https?://t\.co/[A-Za-z0-9]+", "", tweet)  # removing urls
    # Replace special characters embedded in words (for example #earthquake).
    tweet = re.sub(r"[^\w]", " ", tweet)
    # Digits are kept: a digit-stripping step was left disabled at this point.

    kept = []
    for word in tweet.lower().split():
        if word in sw_pun:  # removing stopwords & punctuations
            continue
        word = lemma.lemmatize(word, pos='v')  # converting to lemma
        if len(word) > 3:  # only words longer than 3 characters are kept
            kept.append(word)

    # Same output format as before: every kept word is followed by a space.
    return "".join(word + " " for word in kept)

In [None]:
# Run preprocess() over every tweet of the training frame.
train['text'] = train['text'].apply(preprocess)

In [None]:
# function to remove emojis
def remove_emoji(text):
    """Return ``text`` with every run of characters from the emoji class removed.

    NOTE(review): the last range (U+24C2..U+1F251) is very wide and also
    matches non-emoji characters such as CJK ideographs -- confirm that
    stripping those is intended.
    """
    emoji_class = ("["
                   u"\U0001F600-\U0001F64F"  # emoticons
                   u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                   u"\U0001F680-\U0001F6FF"  # transport & map symbols
                   u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                   u"\U00002702-\U000027B0"
                   u"\U000024C2-\U0001F251"
                   "]+")
    return re.sub(emoji_class, '', text, flags=re.UNICODE)

In [None]:
# Strip emoji-class characters from every (already preprocessed) tweet.
train['text'] = train['text'].apply(remove_emoji)

# Vocabulary creation
Let's create our own vocabulary.

In [None]:
# function to create vocab
from collections import Counter
def create_vocab(df):
    """Count how often each whitespace-separated token occurs in ``df['text']``.

    Args:
        df: DataFrame with a ``text`` column of strings.

    Returns:
        collections.Counter: maps each token to its number of occurrences
        over all rows.
    """
    vocab = Counter()
    # Iterate over the column values instead of looking rows up by the labels
    # 0..n-1, so frames whose index is not a fresh RangeIndex (for example
    # after filtering or sampling) no longer raise KeyError.
    for text in df['text']:
        vocab.update(text.split())
    return vocab

In [None]:
# `master` is the training frame with a fresh 0..n-1 index (no other frame is
# concatenated here).
master=train.reset_index(drop=True)

# call vocabulary creation function on master dataset
vocab = create_vocab(master)

# number of distinct words in the vocabulary
len(vocab)

In [None]:
# Show the 50 most frequent words with their counts.
vocab.most_common(50)


Let's keep only the words that appear more than once in the corpus.


In [None]:
# Final vocabulary: every word that occurs at least `min_occur` times.
min_occur = 2
final_vocab = [word for word, count in vocab.items() if count >= min_occur]

In [None]:
# Number of words in the final vocabulary; reused later as the tokenizer's
# num_words and as the Embedding input dimension.
vocab_size = len(final_vocab)
vocab_size

Now let's apply this vocabulary to the training dataset: for each tweet we keep only the words that appear in the final vocabulary.

In [None]:
# function to filter the dataset, keep only words which are present in the vocab
def filter(tweet):
    """Keep only the words of ``tweet`` that belong to ``final_vocab``.

    Returns the kept words in their original order, each followed by one
    space; "" when no word is kept.

    NOTE(review): this name shadows the built-in ``filter``; it is left
    unchanged because other cells call it by this name.
    """
    kept = [word for word in tweet.split() if word in final_vocab]
    return "".join(word + ' ' for word in kept)

In [None]:
# Drop out-of-vocabulary words from every tweet, then show ten random rows.
train['text'] = train['text'].apply(filter)
train.sample(10)

# Data Preprocessing

In [None]:
from keras.preprocessing.text import Tokenizer
# fit a tokenizer
def create_tokenizer(lines, num_words=None):
    """Fit a Keras ``Tokenizer`` on ``lines`` and return it.

    Args:
        lines: Iterable of texts used to build the word index.
        num_words: Value forwarded to ``Tokenizer(num_words=...)``. When
            omitted, the module-level ``vocab_size`` is used, which is what
            this function always did before the parameter existed.

    Returns:
        The fitted ``Tokenizer``.
    """
    if num_words is None:
        num_words = vocab_size
    # NOTE(review): per the Keras documentation only the `num_words - 1` most
    # frequent words are kept when texts are converted -- confirm that losing
    # the least frequent vocabulary word is acceptable.
    tokenizer = Tokenizer(num_words=num_words)
    # Build the word index from the given texts.
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [None]:
# create and apply tokenizer on the training dataset
tokenizer = create_tokenizer(train.text)

# word -> integer index mapping built by fit_on_texts
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Nous allons maintenant appliquer la méthode `texts_to_matrix()` pour convertir le texte en vecteurs.

La méthode `texts_to_matrix()` du Tokenizer peut être utilisée pour créer un vecteur par document fourni en entrée. La longueur des vecteurs est égale à la valeur `num_words` passée au Tokenizer, c'est-à-dire la taille du vocabulaire (6025 ici).


* ‘binary‘: Si chaque mot est présent ou non dans le document. C'est la valeur par défaut.
* ‘count‘: Le nombre de chaque mot dans le document.
* ‘tfidf‘: Le score TF-IDF (Term Frequency–Inverse Document Frequency) de chaque mot.
* ‘freq‘: La fréquence de chaque mot dans le document.

In [None]:
# Convert every text into one vector using the tokenizer's 'freq' mode.
train_text = tokenizer.texts_to_matrix(train.text, mode = 'freq')

# Model Building & Evaluation

## Neural Network

Nous allons créer un réseau de neurones artificiels. Les prédictions de sentiment sont évaluées avec le score F1, qui n'est pas affiché par défaut après chaque époque ; il faut donc créer une fonction pour obtenir ce score.

In [None]:
# Hold out 20% of the vectorised tweets as a test set (fixed seed 42).
# NOTE(review): the models below end in a sigmoid trained with
# binary_crossentropy -- presumably `sentiment` is a 0/1 label; verify.
X_train, X_test, y_train, y_test = train_test_split(train_text, train.sentiment, test_size = 0.2, random_state = 42)

In [None]:
def define_model(n_words):
    """Build, compile and summarise the dense bag-of-words classifier.

    Architecture: Dense(1024) -> Dense(512) -> Dense(256), each ReLU layer
    followed by Dropout(0.3), then a single sigmoid output unit. The model is
    compiled with binary cross-entropy, Adam and the accuracy metric, and its
    summary is printed before it is returned.

    Args:
        n_words: Length of each input vector (number of features).

    Returns:
        The compiled ``Sequential`` model.
    """
    # First hidden layer carries the input shape.
    stack = [Dense(1024, input_shape=(n_words,), activation='relu'), Dropout(0.3)]
    for units in (512, 256):
        stack.append(Dense(units, activation='relu'))
        stack.append(Dropout(0.3))
    stack.append(Dense(1, activation='sigmoid'))

    model = Sequential(stack)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

In [None]:
# (n_samples, n_features) of the training matrix; the second value is the
# input width passed to define_model() below.
X_train.shape

In [None]:
# Early stopping on the training 'accuracy' (patience 10) plus a checkpoint
# that keeps only the best model by 'val_loss' in ./NN.h5.
# NOTE(review): EarlyStopping watches a training metric while the checkpoint
# watches a validation metric -- confirm this asymmetry is intended.
callbacks_list = [EarlyStopping(monitor='accuracy',patience=10,),
                  ModelCheckpoint(filepath='./NN.h5',monitor='val_loss',save_best_only=True)]

In [None]:
# create the model; the input width is the number of columns of X_train
n_words = X_train.shape[1]
model = define_model(n_words)

In [None]:
# Train for up to 20 epochs; 20% of X_train is held out by Keras for validation.
history = model.fit(X_train,y_train,epochs=20,
                    verbose=2,
                    callbacks=callbacks_list,
                    validation_split=0.2)

In [None]:
# Per-epoch curves recorded by the preceding fit() call.
curves = history.history
acc, val_acc = curves['accuracy'], curves['val_accuracy']
loss, val_loss = curves['loss'], curves['val_loss']

epochs = range(1, len(acc) + 1)

# One figure per metric: training curve as dots, validation curve as a line.
panels = [
    (acc, val_acc, 'Training acc', 'Validation acc', 'Training and validation accuracy'),
    (loss, val_loss, 'Training loss', 'Validation loss', 'Training and validation loss'),
]
for position, (train_curve, val_curve, train_label, val_label, heading) in enumerate(panels):
    if position > 0:
        plt.figure()  # the first panel draws on the implicit current figure
    plt.plot(epochs, train_curve, 'bo', label=train_label)
    plt.plot(epochs, val_curve, 'b', label=val_label)
    plt.title(heading)
    plt.legend()
plt.show()

In [None]:
import keras

# Load the best checkpoint written by ModelCheckpoint.
# The model was compiled with the built-in 'accuracy' metric only, so no
# custom_objects mapping is needed; the `dependencies` name that used to be
# passed here is not defined in the visible notebook and raised NameError.
loaded_model_NN = keras.models.load_model('./NN.h5')

# Prediction on the held-out split (X_test is already vectorised).
# Sequential.predict_classes was removed in TensorFlow 2.6; thresholding the
# sigmoid output at 0.5 gives the same class labels on every version.
y_pred = (loaded_model_NN.predict(X_test) > 0.5).astype("int32")

# Predictions on the test dataset

In [None]:
# NOTE: everything below is a bare string literal, i.e. disabled code that is
# never executed (the cell only echoes the string).
# NOTE(review): if it is re-enabled, `result` is written out as an empty
# DataFrame and `predict_classes` is used -- both need revisiting.
'''

test=pd.read_csv('../input/tweets/live_tweet.csv')

test_id = test.tweet
test.drop(["id","date","name"],1,inplace = True)

#apply tokenizer on the test dataset
test_set = tokenizer.texts_to_matrix(test.text, mode = 'freq')

# make predictions on the test dataset
y_test_pred = loaded_model_NN.predict_classes(test_set)


result = pd.DataFrame()
result.to_csv('prediction_NN.csv',index=False)
'''

# Model utilisant les Word Embeddings

Un autre moyen populaire et puissant d'associer un vecteur à un mot est l'utilisation de vecteurs de mots denses, également appelés `word embeddings`. 

La couche Embedding est un dictionnaire qui mappe des indices entiers (qui représentent des mots spécifiques) à des vecteurs denses. Il prend des entiers en entrée, il recherche ces entiers dans un dictionnaire interne et il renvoie les vecteurs associés. Il s’agit en fait d’une recherche dans le dictionnaire.

Alors que les vecteurs obtenus par encodage one-hot sont binaires, clairsemés (principalement constitués de zéros) et de très haute dimension (même dimensionnalité que le nombre de mots dans le vocabulaire), les embeddings de mots sont des vecteurs à virgule flottante de faible dimension (c'est-à-dire des vecteurs denses, par opposition aux vecteurs clairsemés).

Contrairement aux vecteurs de mots obtenus via un encodage one-hot, les embeddings de mots sont appris à partir de données. Il est courant de voir des word embeddings de dimensions 256 , 512 ou 1 024 lorsqu'il s'agit de vocabulaires très volumineux.

D'autre part, l'encodage one-hot conduit à des vecteurs dont la dimension est égale à la taille du vocabulaire (6 025 ici, et souvent 20 000 ou plus). Ainsi, les word embeddings regroupent plus d'informations dans beaucoup moins de dimensions.

In [None]:
from keras.layers import Embedding
# The Embedding layer takes at least two arguments: the number of tokens and
# the dimension of the embeddings (here, 64).
# NOTE(review): `embedding_layer` is not referenced again in the visible notebook.
embedding_layer = Embedding(vocab_size, 64)

In [None]:
# Number of words to consider as features
max_features = vocab_size

# Length every tweet is cut or padded to (in words) by pad_sequences below
maxlen = 100

In [None]:
# create and fit a tokenizer on the training dataset
tokenizer = create_tokenizer(train.text)

In [None]:
from keras import preprocessing
# Convert every text into a sequence of integer word indices.
sequences = tokenizer.texts_to_sequences(train.text)
# Show three sample tweets next to their integer encoding.
for i in [20,300,43]:
    # The placeholder must be '%s': the previous '% a' was parsed as an
    # ascii() conversion with a space flag, so the word "a" vanished from the
    # message and the text was printed as an escaped list repr.
    sample_text = tokenizer.sequences_to_texts([sequences[i]])[0]
    print('La phrase %s a été transcrite en ' % sample_text, sequences[i])

In [None]:
# Turns the lists of integers into a 2D integer tensor of shape (samples, maxlen), padding shorter sequences with 0s
# NOTE(review): with pad_sequences' default settings, longer sequences are
# presumably truncated from the front (last `maxlen` tokens kept) -- confirm.
train_text = preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)

In [None]:
# Same 80/20 split (seed 42) as before, now on the padded integer sequences;
# this rebinds X_train / X_test / y_train / y_test for all models below.
X_train, X_test, y_train, y_test = train_test_split(train_text, train.sentiment, test_size = 0.2, random_state = 42)

### Neural Network with Embedding Layer

In [None]:
# Embedding -> Flatten -> single sigmoid unit.
model = Sequential([
    # input_length is given so that the embedded inputs can be flattened
    # afterwards; the activations have shape (samples, maxlen, 8).
    Embedding(vocab_size, 8, input_length=maxlen),
    # Flatten the 3D tensor of embeddings into a 2D tensor of shape
    # (samples, maxlen * 8).
    Flatten(),
    # Dense layer for classification
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Early stopping on the training 'accuracy' (patience 1) plus a checkpoint
# that keeps only the best model by 'val_loss' in ./embd.h5.
# NOTE(review): EarlyStopping watches a training metric while the checkpoint
# watches a validation metric -- confirm this asymmetry is intended.
callbacks_list = [EarlyStopping(monitor='accuracy',patience=1,),
                  ModelCheckpoint(filepath='./embd.h5',monitor='val_loss',
                                  save_best_only=True)]

In [None]:
# train the model (up to 10 epochs, batches of 32, 20% held out for validation)
# NOTE(review): X_train is cast to float32 here while the later fit() calls
# pass it unchanged -- confirm the cast is needed.
history = model.fit(np.asarray(X_train).astype(np.float32),y_train,
                    epochs=10,batch_size=32,
                    callbacks=callbacks_list,validation_split=0.2)

In [None]:
# check model performance: per-epoch curves recorded by the preceding fit() call
curves = history.history
acc, val_acc = curves['accuracy'], curves['val_accuracy']
loss, val_loss = curves['loss'], curves['val_loss']

epochs = range(1, len(acc) + 1)

# One figure per metric: training curve as dots, validation curve as a line.
panels = [
    (acc, val_acc, 'Training acc', 'Validation acc', 'Training and validation accuracy'),
    (loss, val_loss, 'Training loss', 'Validation loss', 'Training and validation loss'),
]
for position, (train_curve, val_curve, train_label, val_label, heading) in enumerate(panels):
    if position > 0:
        plt.figure()  # the first panel draws on the implicit current figure
    plt.plot(epochs, train_curve, 'bo', label=train_label)
    plt.plot(epochs, val_curve, 'b', label=val_label)
    plt.title(heading)
    plt.legend()
plt.show()

In [None]:
# Load the best checkpoint written by ModelCheckpoint.
# No custom_objects mapping is needed because the model uses only the built-in
# 'accuracy' metric; the `dependencies` name that used to be passed here is
# not defined in the visible notebook and raised NameError.
loaded_model_embd = keras.models.load_model('./embd.h5')

Nous ne gardons qu'au plus 100 mots par tweet (`maxlen = 100`). Mais notez que le simple aplatissement des séquences d'embeddings, suivi de l'entraînement d'une seule couche dense par-dessus, conduit à un modèle qui traite chaque mot de la séquence d'entrée séparément, sans prendre en compte les relations entre les mots ni la structure des phrases (par exemple, ce modèle traiterait probablement « ce film est une bombe » et « ce film est la bombe » comme étant toutes deux des critiques négatives).

Il est préférable d'ajouter des couches récurrentes ou des couches de convolution 1D au-dessus des séquences d'embeddings pour apprendre des caractéristiques qui prennent en compte chaque séquence dans son ensemble.

## SIMPLE RNN

In [None]:
from keras.layers import Embedding, SimpleRNN
# Embedding -> one SimpleRNN layer -> single sigmoid unit.
model = Sequential([
    Embedding(vocab_size, 32),
    SimpleRNN(32),
    Dense(1, activation='sigmoid'),
])
model.summary()

In [None]:
# Early stopping on the training 'accuracy' (patience 1) plus a checkpoint
# that keeps only the best model by 'val_loss' in ./SRNN.h5.
# NOTE(review): EarlyStopping watches a training metric while the checkpoint
# watches a validation metric -- confirm this asymmetry is intended.
callbacks_list = [EarlyStopping(monitor='accuracy',patience=1,),
                  ModelCheckpoint(filepath='./SRNN.h5',monitor='val_loss',save_best_only=True)]

In [None]:
# Compile with RMSprop / binary cross-entropy and track accuracy.
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# Train for up to 20 epochs; 20% of X_train is held out for validation.
history = model.fit(X_train, y_train,
                    epochs=20,batch_size=32,
                    callbacks=callbacks_list,
                    validation_split=0.2)

In [None]:
# check model performance: per-epoch curves recorded by the preceding fit() call
curves = history.history
acc, val_acc = curves['accuracy'], curves['val_accuracy']
loss, val_loss = curves['loss'], curves['val_loss']

epochs = range(1, len(acc) + 1)

# One figure per metric: training curve as dots, validation curve as a line.
panels = [
    (acc, val_acc, 'Training acc', 'Validation acc', 'Training and validation accuracy'),
    (loss, val_loss, 'Training loss', 'Validation loss', 'Training and validation loss'),
]
for position, (train_curve, val_curve, train_label, val_label, heading) in enumerate(panels):
    if position > 0:
        plt.figure()  # the first panel draws on the implicit current figure
    plt.plot(epochs, train_curve, 'bo', label=train_label)
    plt.plot(epochs, val_curve, 'b', label=val_label)
    plt.title(heading)
    plt.legend()
plt.show()

In [None]:

# Load the best checkpoint written by ModelCheckpoint.
# No custom_objects mapping is needed because the model uses only the built-in
# 'accuracy' metric; the `dependencies` name that used to be passed here is
# not defined in the visible notebook and raised NameError.
loaded_model_SRNN = keras.models.load_model('./SRNN.h5')
# Sequential.predict_classes was removed in TensorFlow 2.6; thresholding the
# sigmoid output at 0.5 gives the same class labels on every version.
y_pred = (loaded_model_SRNN.predict(X_test) > 0.5).astype("int32")

## Stack multiple SimpleRNN layers

In [None]:
# NOTE(review): this Embedding is sized with max_words (20000) while the other
# models use vocab_size / max_features -- confirm the difference is intended.
max_words = 20000

from keras.layers import Embedding, SimpleRNN
# Embedding -> three sequence-returning SimpleRNN layers -> a final SimpleRNN
# -> single sigmoid unit.
model = Sequential(
    [Embedding(max_words, 32)]
    + [SimpleRNN(32, return_sequences=True) for _ in range(3)]
    + [SimpleRNN(32), Dense(1, activation='sigmoid')]
)
model.summary()

In [None]:
# Early stopping on the training 'accuracy' (patience 1) plus a checkpoint
# that keeps only the best model by 'val_loss' in ./STRNN.h5.
# NOTE(review): EarlyStopping watches a training metric while the checkpoint
# watches a validation metric -- confirm this asymmetry is intended.
callbacks_list = [EarlyStopping(monitor='accuracy',patience=1,),
                  ModelCheckpoint(filepath='./STRNN.h5',monitor='val_loss',save_best_only=True)]

In [None]:
# Compile with RMSprop / binary cross-entropy and track accuracy.
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# Train for up to 20 epochs; 20% of X_train is held out for validation.
history = model.fit(X_train, y_train,
                    epochs=20,batch_size=32,
                    callbacks=callbacks_list,
                    validation_split=0.2)

In [None]:
# check model performance: per-epoch curves recorded by the preceding fit() call
curves = history.history
acc, val_acc = curves['accuracy'], curves['val_accuracy']
loss, val_loss = curves['loss'], curves['val_loss']

epochs = range(1, len(acc) + 1)

# One figure per metric: training curve as dots, validation curve as a line.
panels = [
    (acc, val_acc, 'Training acc', 'Validation acc', 'Training and validation accuracy'),
    (loss, val_loss, 'Training loss', 'Validation loss', 'Training and validation loss'),
]
for position, (train_curve, val_curve, train_label, val_label, heading) in enumerate(panels):
    if position > 0:
        plt.figure()  # the first panel draws on the implicit current figure
    plt.plot(epochs, train_curve, 'bo', label=train_label)
    plt.plot(epochs, val_curve, 'b', label=val_label)
    plt.title(heading)
    plt.legend()
plt.show()

## LSTM

In [None]:
from keras.layers import LSTM

# Embedding -> one LSTM layer -> single sigmoid unit.
model = Sequential([
    Embedding(max_features, 32),
    LSTM(32),
    Dense(1, activation='sigmoid'),
])
model.summary()
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping on the training 'accuracy' (patience 1) plus a checkpoint
# that keeps only the best model by 'val_loss' in ./LSTM.h5.
# NOTE(review): the two callbacks watch different metrics -- confirm intended.
callbacks_list = [
    EarlyStopping(monitor='accuracy', patience=1),
    ModelCheckpoint(filepath='./LSTM.h5', monitor='val_loss', save_best_only=True),
]

In [None]:
# Train for up to 20 epochs; 20% of X_train is held out for validation.
history = model.fit(X_train, y_train,epochs=20,
                    batch_size=32,callbacks=callbacks_list,
                    validation_split=0.2)

In [None]:
# check model performance: per-epoch curves recorded by the preceding fit() call
curves = history.history
acc, val_acc = curves['accuracy'], curves['val_accuracy']
loss, val_loss = curves['loss'], curves['val_loss']

epochs = range(1, len(acc) + 1)

# One figure per metric: training curve as dots, validation curve as a line.
panels = [
    (acc, val_acc, 'Training acc', 'Validation acc', 'Training and validation accuracy'),
    (loss, val_loss, 'Training loss', 'Validation loss', 'Training and validation loss'),
]
for position, (train_curve, val_curve, train_label, val_label, heading) in enumerate(panels):
    if position > 0:
        plt.figure()  # the first panel draws on the implicit current figure
    plt.plot(epochs, train_curve, 'bo', label=train_label)
    plt.plot(epochs, val_curve, 'b', label=val_label)
    plt.title(heading)
    plt.legend()
plt.show()

## Bi-Direction LSTM

In [None]:
# Embedding -> three sequence-returning bidirectional LSTM layers -> a final
# bidirectional LSTM -> single sigmoid unit.
model = Sequential(
    [Embedding(max_features, 32)]
    + [Bidirectional(LSTM(32, return_sequences=True)) for _ in range(3)]
    + [Bidirectional(LSTM(32)), Dense(1, activation='sigmoid')]
)
model.summary()
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping on the training 'accuracy' (patience 1) plus a checkpoint
# that keeps only the best model by 'val_loss' in ./BILSTM.h5.
# NOTE(review): the two callbacks watch different metrics -- confirm intended.
callbacks_list = [
    EarlyStopping(monitor='accuracy', patience=1),
    ModelCheckpoint(filepath='./BILSTM.h5', monitor='val_loss', save_best_only=True),
]

In [None]:
# Train for up to 20 epochs; 20% of X_train is held out for validation.
history = model.fit(X_train, y_train,epochs=20,
                    batch_size=32,callbacks=callbacks_list,
                    validation_split=0.2)

In [None]:
# check model performance: per-epoch curves recorded by the preceding fit() call
curves = history.history
acc, val_acc = curves['accuracy'], curves['val_accuracy']
loss, val_loss = curves['loss'], curves['val_loss']

epochs = range(1, len(acc) + 1)

# One figure per metric: training curve as dots, validation curve as a line.
panels = [
    (acc, val_acc, 'Training acc', 'Validation acc', 'Training and validation accuracy'),
    (loss, val_loss, 'Training loss', 'Validation loss', 'Training and validation loss'),
]
for position, (train_curve, val_curve, train_label, val_label, heading) in enumerate(panels):
    if position > 0:
        plt.figure()  # the first panel draws on the implicit current figure
    plt.plot(epochs, train_curve, 'bo', label=train_label)
    plt.plot(epochs, val_curve, 'b', label=val_label)
    plt.title(heading)
    plt.legend()
plt.show()

The neural network with an Embedding layer seems to be the best model for this classification task.

## GRU

In [None]:
from keras.layers import GRU

# Embedding -> one GRU layer -> single sigmoid unit.
model = Sequential([
    Embedding(max_features, 32),
    GRU(32),
    Dense(1, activation='sigmoid'),
])
model.summary()
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping on the training 'accuracy' (patience 1) plus a checkpoint
# that keeps only the best model by 'val_loss' in ./GRU.h5.
# NOTE(review): the two callbacks watch different metrics -- confirm intended.
callbacks_list = [
    EarlyStopping(monitor='accuracy', patience=1),
    ModelCheckpoint(filepath='./GRU.h5', monitor='val_loss', save_best_only=True),
]

In [None]:
# Train for up to 20 epochs; 20% of X_train is held out for validation.
history = model.fit(X_train, y_train,epochs=20,
                    batch_size=32,callbacks=callbacks_list,
                    validation_split=0.2)

In [None]:
# check model performance: per-epoch curves recorded by the preceding fit() call
curves = history.history
acc, val_acc = curves['accuracy'], curves['val_accuracy']
loss, val_loss = curves['loss'], curves['val_loss']

epochs = range(1, len(acc) + 1)

# One figure per metric: training curve as dots, validation curve as a line.
panels = [
    (acc, val_acc, 'Training acc', 'Validation acc', 'Training and validation accuracy'),
    (loss, val_loss, 'Training loss', 'Validation loss', 'Training and validation loss'),
]
for position, (train_curve, val_curve, train_label, val_label, heading) in enumerate(panels):
    if position > 0:
        plt.figure()  # the first panel draws on the implicit current figure
    plt.plot(epochs, train_curve, 'bo', label=train_label)
    plt.plot(epochs, val_curve, 'b', label=val_label)
    plt.title(heading)
    plt.legend()
plt.show()

## Stacked GRU

In [None]:
from keras.layers import GRU

# Embedding -> three sequence-returning GRU layers -> a final GRU -> single
# sigmoid unit.
model = Sequential(
    [Embedding(max_features, 32)]
    + [GRU(32, return_sequences=True) for _ in range(3)]
    + [GRU(32), Dense(1, activation='sigmoid')]
)

model.summary()

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping on the training 'accuracy' (patience 1) plus a checkpoint
# that keeps only the best model by 'val_loss' in ./SGRU.h5.
# NOTE(review): the two callbacks watch different metrics -- confirm intended.
callbacks_list = [
    EarlyStopping(monitor='accuracy', patience=1),
    ModelCheckpoint(filepath='./SGRU.h5', monitor='val_loss', save_best_only=True),
]

In [None]:
# Train for up to 20 epochs; 20% of X_train is held out for validation.
history = model.fit(X_train, y_train,epochs=20,
                    batch_size=32,callbacks=callbacks_list,
                    validation_split=0.2)

In [None]:
# check model performance: per-epoch curves recorded by the preceding fit() call
curves = history.history
acc, val_acc = curves['accuracy'], curves['val_accuracy']
loss, val_loss = curves['loss'], curves['val_loss']

epochs = range(1, len(acc) + 1)

# One figure per metric: training curve as dots, validation curve as a line.
panels = [
    (acc, val_acc, 'Training acc', 'Validation acc', 'Training and validation accuracy'),
    (loss, val_loss, 'Training loss', 'Validation loss', 'Training and validation loss'),
]
for position, (train_curve, val_curve, train_label, val_label, heading) in enumerate(panels):
    if position > 0:
        plt.figure()  # the first panel draws on the implicit current figure
    plt.plot(epochs, train_curve, 'bo', label=train_label)
    plt.plot(epochs, val_curve, 'b', label=val_label)
    plt.title(heading)
    plt.legend()
plt.show()

## Stacked GRU with Dropouts

In [None]:
from keras.layers import GRU

# Embedding -> five sequence-returning GRU layers, where the 1st, 3rd and 5th
# use input and recurrent dropout of 0.2 -> a final GRU -> single sigmoid unit.
model = Sequential([
    Embedding(max_features, 32),
    GRU(32, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
    GRU(32, return_sequences=True),
    GRU(32, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
    GRU(32, return_sequences=True),
    GRU(32, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
    GRU(32),
    Dense(1, activation='sigmoid'),
])

model.summary()

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping on the training 'accuracy' (patience 1) plus a checkpoint
# that keeps only the best model by 'val_loss' in ./DSGRU.h5.
# NOTE(review): the two callbacks watch different metrics -- confirm intended.
callbacks_list = [
    EarlyStopping(monitor='accuracy', patience=1),
    ModelCheckpoint(filepath='./DSGRU.h5', monitor='val_loss', save_best_only=True),
]

In [None]:
# Train for up to 20 epochs; 20% of X_train is held out for validation.
history = model.fit(X_train, y_train,epochs=20,
                    batch_size=32,callbacks=callbacks_list,
                    validation_split=0.2)

In [None]:
# check model performance: per-epoch curves recorded by the preceding fit() call
curves = history.history
acc, val_acc = curves['accuracy'], curves['val_accuracy']
loss, val_loss = curves['loss'], curves['val_loss']

epochs = range(1, len(acc) + 1)

# One figure per metric: training curve as dots, validation curve as a line.
panels = [
    (acc, val_acc, 'Training acc', 'Validation acc', 'Training and validation accuracy'),
    (loss, val_loss, 'Training loss', 'Validation loss', 'Training and validation loss'),
]
for position, (train_curve, val_curve, train_label, val_label, heading) in enumerate(panels):
    if position > 0:
        plt.figure()  # the first panel draws on the implicit current figure
    plt.plot(epochs, train_curve, 'bo', label=train_label)
    plt.plot(epochs, val_curve, 'b', label=val_label)
    plt.title(heading)
    plt.legend()
plt.show()