# Challenge START&GO Data Science 2020 "Botando pra Quebrar" 
# Sentiment Analysis in IMDB Reviews and Tweets with Natural Language Processing (NLP) 



Hello! We are Andreis Purim and Eduarda Agostini, two Brazilian students currently doing a double-degree program at the École Centrale de Lille, in France. For our challenge in Data Science, we chose to apply a mix of NLP algorithms to multiple interesting datasets.

All documentation in this notebook will be in English because we believe it may be interesting to a wider audience. We hope you like it.

# 1. IMDB

Our first challenge will be to make a Machine Learning classifier for the 50k movie reviews IMDB Dataset. The dataset itself is very simple: the review and the sentiment (positive or negative).

## 1.1. Visualizing our Data

Let's start by plotting some graphs and of course, seeing how our data works.

In [None]:
import gc
import numpy
import pandas
%matplotlib inline

# Load the IMDB review CSV into a DataFrame, then print its shape and first rows.
Reviews = pandas.read_csv('../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
print(Reviews.shape)
print(Reviews.head())

Let's use the [spaCy library](https://spacy.io/models) to take a look at how our phrases are constructed. spaCy is a beautifully constructed library for NLP that has pretrained statistical models in various languages. We could use the starter models for transfer learning, but for now let's just use the complete model to see how it fares.

By loading the English core pretrained models, we can use it to deconstruct the phrase and see every part of it (with explanations!). Let's choose our second review, in this case, I don't want to print all words (because it'd be too huge), so I made a zip with range(20), in case you want to observe all words, just make a for in Chosen_Sentence

In [None]:
import string
import torch
import spacy
import nltk
import re

from IPython.display import clear_output
from nltk.corpus import stopwords
from collections import Counter
from spacy import displacy

# Fetch the NLTK stopword corpus and index the English stopwords in a Counter,
# so the membership test in remove_stopwords is a hash lookup.
nltk.download('stopwords')
stopwords_dict = Counter(stopwords.words("english"))
# Clear whatever this cell has printed so far.
clear_output()

def clear_sentence(sentence: str) -> str:
    """Normalize raw text: drop ``<br />`` tags, blank out punctuation, lowercase.

    Every character of ``string.punctuation`` is replaced by a single space
    (not removed), so words separated only by punctuation stay separated.
    """
    punctuation_to_space = str.maketrans(dict.fromkeys(string.punctuation, ' '))
    without_breaks = sentence.replace('<br />', ' ')
    return without_breaks.translate(punctuation_to_space).lower()

def remove_stopwords(sentence: str) -> list:
    """Split *sentence* on whitespace and keep the words absent from ``stopwords_dict``."""
    kept_words = []
    for token in sentence.split():
        if token in stopwords_dict:
            continue
        kept_words.append(token)
    return kept_words

def tokenize(sentence:str) -> list:
    """Lowercase *sentence*, drop ``<br />`` tags and split it into NLTK word tokens.

    Unlike ``clear_sentence`` this keeps punctuation; how it is split is
    decided by ``nltk.tokenize.word_tokenize``.

    NOTE(review): ``word_tokenize`` needs NLTK tokenizer data that this file
    never downloads (only 'stopwords' is fetched) - confirm it is available
    before calling.
    """
    # The file only does `import nltk`, so the bare name `word_tokenize` was
    # undefined and every call raised NameError. Import it where it is used.
    from nltk.tokenize import word_tokenize

    sentence = sentence.replace('<br />', ' ')
    sentence = sentence.lower()
    return word_tokenize(sentence)

# And let's clean our reviews
print("Cleaning sentences...")
# Overwrite the raw review text with its normalized form (see clear_sentence).
Reviews['review'] = Reviews['review'].apply(clear_sentence)
print("Removing stopwords...")
# New column: each cleaned review as a list of words with English stopwords removed.
Reviews['no_stopwords'] = Reviews['review'].apply(remove_stopwords)
# Clear the progress messages printed above.
clear_output()

def render_spacy(chosen_sentence: str):
    """Print a spaCy breakdown of *chosen_sentence* and draw its displaCy views.

    Prints text, coarse POS, fine-grained tag and the tag's explanation for
    the first 20 tokens, lists every entity spaCy found, then renders the
    entity and dependency visualizations with ``jupyter=True``.
    """
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(chosen_sentence)
    rule = '-'*15

    print(rule,'Review text',rule)
    # zip against range(20) caps the listing at 20 tokens.
    for _, token in zip(range(20), doc):
        print(f'{token.text:12} {token.pos_:10} {token.tag_:8} {spacy.explain(token.tag_)}')

    print('\n',rule,'Entities',rule)
    for ent in doc.ents:
        # explain() may not return a string, hence the str()
        print(' - '.join([ent.text, ent.label_, str(spacy.explain(ent.label_))]))

    displacy.render(doc, style='ent', jupyter=True)
    displacy.render(doc, style='dep', jupyter=True, options={'distance': 50})


clear_output()
# Inspect the review at index 1 (the second one).
render_spacy(Reviews['review'][1])
# Housekeeping between cells: empty torch's CUDA cache and run the garbage
# collector (the return value is bound only so the notebook does not echo it).
torch.cuda.empty_cache()
memory_clear = gc.collect()

Note how our data still needs some cleaning. For example, there are HTML tags (like ```<br/>```) which spaCy classifies as "superfluous punctuation". We'll clean the text before starting our models.

Another beautiful thing about spaCy is displaCy, displaCy is a visualizer which not only makes visualizing dependencies in NLP fun but also very helpful for us. 

Another powerful thing spaCy can do is to identify entities in the text, like organizations, people, nationalities, etc... you can use displaCy again to visualize the entities in the text.

Ok, now that we've had enough fun with spaCy, let's start to visualize our dataset as a whole. The first step is to clean the reviews of punctuation and tags, and then put everything in lowercase. Our objective in this cell is to know the number of positive and negative reviews (spoiler: it should be 25000 and 25000) and see how the number of words is related to the sentiment.

In [None]:
import matplotlib.pyplot as matplotlib
import seaborn

def EDA_function():
    """Print word-count statistics per sentiment and plot their distributions.

    For all reviews, then the positive ones, then the negative ones, prints
    the number of reviews and the range / mean / standard deviation of the
    word count, both without and with stopwords. The with-stopwords counts
    of the positive and negative groups are drawn as overlaid histograms.
    Reads the module-level ``Reviews`` frame; works on a copy.
    """
    frame = Reviews.copy()
    frame['#_no_stopwords'] = frame['no_stopwords'].apply(len)
    frame['#_with_stopwords'] = frame['review'].apply(str.split).apply(len)

    # Let's prepare our plot
    matplotlib.figure(figsize=(8,8))

    for sentiment in ('total','positive','negative'):
        # 'total' is not a label in the data: it stands for the unfiltered frame.
        subset = frame if sentiment == 'total' else frame[frame['sentiment'] == sentiment]
        without_counts = subset["#_no_stopwords"]
        with_counts = subset["#_with_stopwords"]

        print(len(with_counts),sentiment,'reviews:')
        for label, counts in (('Without', without_counts), ('With', with_counts)):
            print(' '*2,label,'stopwords:\trange', '[' + str(counts.min()),'-', str(counts.max()) + ']','\tmean:',round(counts.mean()), '- standart:',round(counts.std()))

        if sentiment == 'total':
            continue
        # Only the per-sentiment groups are plotted.
        seaborn.distplot(with_counts,kde=False, label=sentiment.capitalize())

    matplotlib.xlabel("Number of Words per Phrase")
    matplotlib.ylabel("Number of Phrases")
    matplotlib.legend()
    matplotlib.show()

def EDA_common_words() -> pandas.DataFrame:
    """Tabulate the 20 most frequent non-stopwords overall, in negative and in positive reviews.

    Returns a six-column frame: a word column and a count column for each of
    the three groups, read from the module-level ``Reviews['no_stopwords']``.
    """
    def top_words(token_lists) -> pandas.DataFrame:
        # Count every word of every review, keep the 20 most common.
        counts = Counter()
        for tokens in token_lists:
            counts.update(tokens)
        return pandas.DataFrame(counts.most_common(20))

    tables = [top_words(Reviews['no_stopwords'])]
    for sentiment in ('negative','positive'):
        matching = Reviews['sentiment'] == sentiment
        tables.append(top_words(Reviews.loc[matching, 'no_stopwords']))

    dataframe = pandas.concat(tables, axis=1)
    dataframe.columns = ['Common Words','#C','Negative Words','#N','Positive Words','#P']
    return dataframe
    
# Print the word-count statistics and show the histogram.
EDA_function()
# Display the common-word table, shading each count column with its own colormap.
EDA_common_words().style.background_gradient(cmap='Greys', subset='#C').background_gradient(cmap='Reds', subset='#N').background_gradient(cmap='Greens', subset='#P')

So, as you can probably see, just getting the most common words doesn't tell us a lot - even after we remove stop words - because considering only numerical values detracts from the meaning of the text. That's why in negative reviews, for example, we have "good" appearing multiple times (we don't see "bad" appearing in positive reviews, on the other hand).

So let's jump ahead to something that can help us extract relation between words: Vectorized words! The idea is that every word will have a unique vector in an N Dimensional space. Therefore, if two vectors are close in the space, they are related words, if they have completely opposite numbers, they are opposite words, and so on...

Here's a demonstration of how these words look like.

Ok, so let's go ahead and use word2vec, the most famous Word Vectorizer\Tokenizer and see how it looks like in a 2D graphic with the labels on. Let's also take a look at the w2vec similar_words finder

In [None]:
import plotly.graph_objs as plotpygo
import plotly.offline as plotplyoff
from gensim.models import word2vec
from sklearn.manifold import TSNE

# Clear memory from last cell 
torch.cuda.empty_cache()
memory_clear = gc.collect()

def EDA_word2vec():
    """Train Word2Vec on the first 1000 reviews and plot neighbours of a few probe words.

    Prints up to five nearest neighbours of each probe term, then projects
    the vectors of the probes and their neighbours with t-SNE and draws a
    labelled 2D scatter plot (seaborn) and an unlabelled 3D one (plotly).
    Reads the module-level ``Reviews['no_stopwords']``.
    """
    tokenized = Reviews['no_stopwords'][0:1000].tolist()

    # Now let's use Word2Vec
    # NOTE(review): `size=` and `iter=` are the gensim 3.x keyword names;
    # gensim 4 renamed them to `vector_size=` and `epochs=` - confirm the
    # installed version.
    w2v = word2vec.Word2Vec(tokenized, size=100, window=30, min_count=1, sample=1e-3, iter=50)

    # Probe terms. We ask for the 7 most related words, drop any that are
    # themselves probe terms, and keep at most 5.
    terms = ['movie','bad','good','sucks','awesome','action','documentary','benedict']
    related_words = {}
    for search_term in terms:
        # The vocabulary comes from only 1000 reviews; querying a word that is
        # not in it raises KeyError, so skip such probes instead of crashing.
        if search_term not in w2v.wv:
            print(search_term,"is not in the vocabulary, skipping")
            continue
        neighbours = [item[0] for item in w2v.wv.most_similar([search_term], topn=7) if item[0] not in terms]
        related_words[search_term] = [search_term] + neighbours[:5]
    for word in related_words:
        print(word,"is related to",related_words[word][1:])

    # Flatten the dictionary values into one list and fetch the vector of each
    # word, then use TSNE to bring them down to 2D and 3D.
    words = [word for group in related_words.values() for word in group]
    word_vectors_ND = w2v.wv[words]
    word_vectors_2D = TSNE(n_components=2, random_state=0, n_iter=500, perplexity=1).fit_transform(word_vectors_ND)
    word_vectors_3D = TSNE(n_components=3, random_state=0, n_iter=500, method='barnes_hut', angle=0.5, perplexity=3).fit_transform(word_vectors_ND)

    # Plot in 2D
    matplotlib.figure(figsize=(14, 8))
    # NOTE(review): color_palette() only returns a palette; the result is
    # discarded here, so this line does not change the plot colours.
    seaborn.color_palette("tab10")
    # x/y passed by keyword: newer seaborn releases reject them positionally.
    seaborn.scatterplot(x=word_vectors_2D[:, 0], y=word_vectors_2D[:, 1], s=200)
    for word, x, y in zip(words, word_vectors_2D[:, 0], word_vectors_2D[:, 1]):
        matplotlib.annotate(word, xy=(x+1, y+1), xytext=(0, 0), textcoords='offset points')
    matplotlib.show()

    # Plot in 3D
    data = [plotpygo.Scatter3d(x=word_vectors_3D[:,0],y=word_vectors_3D[:,1], z=word_vectors_3D[:,2], mode='markers')]
    layout = dict(height=800, width=800, title='(this is a 3D graphic, take a look around with your mouse)')
    plotplyoff.iplot(dict(data=data, layout=layout), filename='3DBubble')
    
# Train the embedding and draw both projections.
EDA_word2vec()
# Housekeeping: empty torch's CUDA cache and run the garbage collector.
torch.cuda.empty_cache()
memory_clear = gc.collect()

Cool right? You can probably see that when we vectorize our words, we have a much easier time finding words that are somewhat related to one another.

## 1.2 Choosing algorithms for our IMDB

Ok, now that we know how our data looks like, let's choose some Machine Learning algorithms to work with our data. The first thing we need to do is to reduce our dataset (because some of these algorithms can take quite a while), so we'll be using only the first 5000 reviews.




In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC, SVC, NuSVC
from matplotlib.lines import Line2D
from xgboost import XGBClassifier
import time

def compare_ML():
    """Benchmark every vectorizer / classifier pairing on the first 5000 reviews.

    Each of three text vectorizers is fitted once on an 80% training split;
    each classifier is then fitted on those features and scored on the
    remaining 20%. Accuracy is plotted against total time (vectorizing plus
    fit/predict) and the per-run measurements are returned.

    Returns:
        pandas.DataFrame with one row per run and columns
        ML, Vectorizer, Accuracy, ML_Time, Cut_time, Total_time.
    """
    # Reduced dataset: cleaned review text as input, sentiment label as target.
    sample_reviews = Reviews['review'][0:5000].tolist()
    sample_labels = Reviews['sentiment'][0:5000].tolist()
    # Split the dataset in a 80%/20% fashion
    X_train, X_test, y_train, y_test = train_test_split(sample_reviews, sample_labels, test_size=0.2, random_state=0)

    # Models that turn text into vectors...
    vectorizers = {
        'Count': CountVectorizer(),
        'Hash': HashingVectorizer(),
        'Tfidf': TfidfVectorizer(ngram_range=(1, 2))
    }

    # ...and models that learn from those vectors.
    classifiers = {
        'LinearSVC': LinearSVC(),
        'SVC': SVC(),
        'NuSVC': NuSVC(),
        'DecisionTree': DecisionTreeClassifier(),
        'XGBClassifier': XGBClassifier(),
        'RandomForest': RandomForestClassifier(n_estimators=100, random_state=0),
        'SGDC': SGDClassifier(),
        'MultiNB': MultinomialNB(),
    }

    # One row per (classifier, vectorizer) run.
    plotting_data = []

    for vector_name, vectorizer in vectorizers.items():
        # Vectorize once per vectorizer: the features are shared by all classifiers.
        time_vector_start = time.time()
        X_train_vectorized = vectorizer.fit_transform(X_train)
        X_test_vectorized = vectorizer.transform(X_test)
        time_vector_end = time.time()
        cut_time = round(time_vector_end - time_vector_start,2)

        for ml_name, model in classifiers.items():
            # Multinomial Naive Bayes does not work with negative numbers,
            # so it is only run on the Count features.
            if ml_name == 'MultiNB' and vector_name != 'Count':
                continue

            # Time fitting, predicting and scoring together.
            starting_time = time.time()
            model.fit(X_train_vectorized, y_train)
            y_predicted = model.predict(X_test_vectorized)
            accuracy = accuracy_score(y_test, y_predicted)
            ending_time = time.time()

            ml_time = round(ending_time - starting_time,2)
            plotting_data.append([ml_name,vector_name,accuracy,ml_time,cut_time,cut_time+ml_time])

    # Collect the measurements in a DataFrame (for better visualization)
    plot_times = pandas.DataFrame(plotting_data, columns=['ML','Vectorizer','Accuracy','ML_Time','Cut_time','Total_time'])

    # Scatter plot: colour encodes the classifier, marker style the vectorizer.
    seaborn.set(color_codes=True)
    matplotlib.figure(figsize=(12, 8))
    matplotlib.title("Best vectorization and Accuracy Algorithms (Reduced Dataset)")

    ax = seaborn.scatterplot(data=plot_times, x='Total_time', y='Accuracy', hue='ML', style='Vectorizer')
    matplotlib.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    ax.set(xlabel="Time (s)", ylabel="Accuracy")
    matplotlib.show()
    return plot_times

def complete_ML_chosen():
    """Run the Tfidf (1-2 grams) + LinearSVC pairing on the complete review set.

    Uses the same 80/20 split settings as the reduced benchmark and prints
    the test accuracy, the elapsed wall time (measured from before the data
    is pulled out of ``Reviews``) and the confusion matrix.
    """
    starting_time = time.time()

    all_reviews = Reviews['review'].tolist()
    all_labels = Reviews['sentiment'].tolist()
    X_train, X_test, y_train, y_test = train_test_split(all_reviews, all_labels, test_size=0.2, random_state=0)

    tfidf = TfidfVectorizer(ngram_range=(1, 2))
    train_features = tfidf.fit_transform(X_train)
    test_features = tfidf.transform(X_test)

    classifier = LinearSVC()
    classifier.fit(train_features, y_train)
    y_predicted = classifier.predict(test_features)
    accuracy = accuracy_score(y_test, y_predicted)

    elapsed = time.time() - starting_time
    print('COMPLETE DATASET WITH LinearSVC\nAccuracy:',"{:.2f}".format(accuracy*100),"in","{:.2f}s".format(elapsed))
    print(confusion_matrix(y_test, y_predicted))

# Full-dataset LinearSVC run first, then the reduced-dataset benchmark
# (as the last expression, its DataFrame is what the cell displays).
complete_ML_chosen()
compare_ML()

As you have probably seen in the graph, the best algorithm is probably LinearSVC using Tfidf or Hash, with a small difference in accuracy and time, while things like Hash with XGB or RandomForest probably fared pretty bad in time.

I won't explain in great detail why (if you google you'll probably find better answers) but this is because LinearSVC are Support Vector Machine, that is, machine learning algorithms made to use vectors as inputs, while RandomForest, while a very good algorithm, just can't handle vectors with hundreds of dimensions in a good time. In this case, you can see SGDClassifier rates a little higher than LinearSVC because SGD (Stochastic Gradient Descent) is a good approach of fitting linear classifiers in a manner similar to SVM.

In fact, our SGDClassifier is LinearSVM with some better training, as the docs in scikit state: "Strictly speaking, SGD is merely an optimization technique and does not correspond to a specific family of machine learning models. It is only a way to train a model. Often, an instance of SGDClassifier or SGDRegressor will have an equivalent estimator in the scikit-learn API, potentially using a different optimization technique."

And for the vectorizers, you can see Count is way faster, Hash is a mix of fast and accurate, and Tfidf is the more accurate one.

But when we scale the data back to its original size of 50,000 reviews, you might notice that LinearSVC outscores SGDC.

So, let's stick to LinearSVC a little more and do one final thing: fine-tuning. Scikit comes with a nice tool called GridSearchCV that allows us to fine tune our model a little further.

Ideas:
```python
import nltk
from nltk.stem.snowball import SnowballStemmer
```
and fine tune with
```python
from sklearn.model_selection import GridSearchCV
GridSearchCV()
```

We are not going to do it now because of time restrictions.

## 1.3 Deep Learning

Deep learning sometimes is overkill, but hey, maybe we do want to achieve that 99% accuracy, right? So let's explore a little the Deep Learning models available in Keras. Let's start using the X and Y already separated before

In [None]:
from keras.layers import Dense , Input , LSTM , Embedding, Dropout, Conv1D, MaxPooling1D, Activation, GRU, Flatten
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.layers import Dense, Flatten, Convolution1D
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from mpl_toolkits.axes_grid1 import ImageGrid
from keras.layers.embeddings import Embedding
from keras.models import Model, Sequential
from numpy import asarray
from numpy import array
from numpy import zeros
import keras
 
def compare_DL():
    """Train five small Keras models on the IMDB reviews and print accuracy and time for each.

    The cleaned reviews in the module-level ``Reviews`` frame are tokenized
    with a 6000-word Keras ``Tokenizer``, padded/truncated to 130 tokens and
    split 80/20. Every model is trained for up to 3 epochs with early
    stopping on the validation loss, evaluated on the held-out split, and
    one line per model ("name: accuracy% in seconds") is printed at the end.

    NOTE(review): the held-out split doubles as validation data for early
    stopping, so the reported accuracy is not from fully unseen data.
    """
    vocabulary_size = 6000   # words kept by the Tokenizer; also the Embedding input size
    review_length = 130      # every review is padded/truncated to this many tokens
    # The comments below speak of "3 epochs", but fit() was called without
    # `epochs`, so Keras used its default of a single epoch and the
    # EarlyStopping callback (patience=2) could never trigger.
    training_epochs = 3

    # So, for our IMDB deep learning, we need to tokenize the words in a different manner from our ML.
    x = Reviews['review'].tolist()
    # 1 for 'positive', 0 for anything else.
    y_binary = numpy.array((Reviews['sentiment'] == 'positive').astype(int))
    tokenizer = Tokenizer(num_words=vocabulary_size)
    tokenizer.fit_on_texts(x)
    x_tokenized = tokenizer.texts_to_sequences(x)

    # Now, on average, our reviews have 128 words, so let's pad the maximum size to 130. We can make it longer but it will also make it slower
    x_padded = pad_sequences(x_tokenized, maxlen=review_length)

    # Split
    X_train, X_test, y_train, y_test = train_test_split(x_padded, y_binary, test_size=0.2, random_state=0)

    # Each builder returns a (display name, compiled model) pair.
    def Model1():
        # Trainable embedding -> bidirectional LSTM -> global max pool -> dense head.
        name = 'LSTM'
        model = Sequential()
        model.add(Embedding(vocabulary_size, 128))
        model.add(Bidirectional(LSTM(32, return_sequences = True)))
        model.add(GlobalMaxPool1D())
        model.add(Dense(20, activation="relu"))
        model.add(Dropout(0.05))
        model.add(Dense(1, activation="sigmoid"))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return (name,model)
    def Model2():
        # Small 1D CNN with heavy dropout around every stage.
        name = 'CNN (ver. 1)'
        model = Sequential()
        model.add(Embedding(vocabulary_size, 15, input_length=review_length))
        model.add(Dropout(0.50))
        model.add(Conv1D(filters=32, kernel_size=2, padding='same', activation='relu'))
        model.add(Dropout(0.50))
        model.add(MaxPooling1D(pool_size=2))
        model.add(Flatten())
        model.add(Dropout(0.50))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return (name,model)
    def Model3():
        # 1D CNN without dropout, with a 250-unit dense layer before the output.
        name = 'CNN (ver. 2)'
        model = Sequential()
        model.add(Embedding(vocabulary_size, 32, input_length=review_length))
        model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
        model.add(MaxPooling1D(pool_size=2))
        model.add(Flatten())
        model.add(Dense(250, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return (name,model)
    def Model4():
        # NOTE(review): despite its name this model contains no LSTM layer:
        # it is a frozen GloVe embedding, flattened into a single sigmoid unit.
        name = 'LSTM with GloVe'
        # Parse the GloVe text file: each line is a word followed by its 100 floats.
        embeddings_dictionary = dict()
        with open('../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt', encoding="utf8") as glove_file:
            for line in glove_file:
                records = line.split()
                word = records[0]
                vector_dimensions = asarray(records[1:], dtype='float32')
                embeddings_dictionary[word] = vector_dimensions
        # One row per tokenizer word index (+1 because index 0 is never assigned
        # to a word); rows of words missing from GloVe stay all-zero.
        vocab_size = len(tokenizer.word_index) + 1
        embedding_matrix = zeros((vocab_size, 100))
        for word, index in tokenizer.word_index.items():
            embedding_vector = embeddings_dictionary.get(word)
            if embedding_vector is not None:
                embedding_matrix[index] = embedding_vector
        model = Sequential()
        embedding_layer = Embedding(vocab_size, 100,  weights=[embedding_matrix], input_length=review_length , trainable=False)
        model.add(embedding_layer)
        model.add(Flatten())
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return (name,model)
    def Model5():
        # Embedding flattened straight into a dense network.
        name = 'Multi-Layer Perceptron'
        model = Sequential()
        model.add(Embedding(vocabulary_size, 32, input_length=review_length))
        model.add(Flatten())
        model.add(Dense(250, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return (name,model)

    # I'm not the biggest fan of plotting Epoch data (specially considering we are using just 3 epochs), but here's a function if you want to do it anyway.
    # I'm not calling it because I added an EarlyStopping, so there's not much sense to it.
    def plot_scores(history,name) :
        matplotlib.plot(history.history['accuracy'])
        matplotlib.plot(history.history['val_accuracy'])
        matplotlib.title(name + ' accuracy')
        matplotlib.ylabel('accuracy')
        matplotlib.xlabel('epoch')
        matplotlib.legend(['train','test'], loc = 'upper left')
        return matplotlib.show()

    # All five models are built up front, then trained one after another.
    Models_List = [Model1(),Model2(),Model3(),Model4(),Model5()]

    results = []
    for name,model in Models_List:
        starting_time = time.time()
        es_callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)
        history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=training_epochs, batch_size=128, verbose=False, callbacks=[es_callback])
        # evaluate() returns [loss, accuracy] given the metrics compiled above.
        scores = model.evaluate(X_test, y_test, verbose=0)
        ending_time = time.time()
        #plot_scores(history,name)
        results.append(name + ": %.2f%%" % (scores[1]*100) + " in {:.2f}s".format(ending_time-starting_time))

    for i in results:
        print(i)
# Train and score the deep-learning models.
compare_DL()
# Housekeeping: empty torch's CUDA cache and run the garbage collector.
torch.cuda.empty_cache()
memory_clear = gc.collect()

**IMPORTANT NOTE:** Ok, so this is the end of the bulk of our study of the IMDB dataset - which was our primary objective - but we also wanted to implement BERT and study the Australian election tweets. Since BERT takes a lot of memory, it will be in a separate notebook, which can be accessed here. The rest will be here:

# 2. Let's start looking at tweets!

As promised, our final objective in this challenge is to learn enough about sentiment analysis to read tweets about the australian election. Before we do that, let's play with some other datasets to see how far our models are faring.

## 2.1 Airline Tweets

Our first dataset will be the Airline Tweets Sentiment. It is a smaller dataset and it gives us a precious insight in Twitter Analysis. The result is probably way lower than you expected. Why is that? The main reasons are:

- Tweets are a lot smaller than reviews.
- Reviews tend to be clear and informative (even though there might be sarcasm); tweets are not necessarily accessible, clear or informative - in fact, most are not.
- Tweets are charged with sarcasm and abbreviations.
- The dataset we are using is smaller, as well.

Let's see how ML and Deep Learning compare

In [None]:
from sklearn.preprocessing import LabelEncoder
from keras.layers import GlobalMaxPooling1D 
from keras.layers import BatchNormalization
from keras.utils import to_categorical
from keras.optimizers import Adam

def Airline_Analysis():
    """Compare LinearSVC and a 1D CNN on the airline tweet sentiment dataset.

    Loads the tweets, cleans their text with ``clear_sentence`` and trains two
    classifiers on the same 80/20 split settings: Tfidf (1-2 grams) +
    LinearSVC, and an embedding + Conv1D network with a 3-way softmax output.
    Prints the dataset shape and each model's test accuracy and wall time.

    Returns:
        The first rows of the (sentiment, text) frame, so the notebook shows them.
    """
    # Load the data and take a look at how tweet datasets usually look like
    Airlines_Total = pandas.read_csv('../input/twitter-airline-sentiment/Tweets.csv')

    # Only the text and its sentiment are used; location or reason could feed a
    # more in-depth analysis, but that is reserved for the Australian tweets.
    Airlines = Airlines_Total[['airline_sentiment','text']]
    print('Dataset shape',Airlines_Total.shape)

    # Same cleaning as the IMDB reviews
    tweets = Airlines['text'].apply(clear_sentence).tolist()
    sentiments = Airlines['airline_sentiment'].tolist()

    # ---- Tfidf + LinearSVC, as used before (timer covers split, fit and scoring) ----
    svc_started = time.time()
    tfidf = TfidfVectorizer(ngram_range=(1, 2))
    train_text, test_text, train_labels, test_labels = train_test_split(tweets, sentiments, test_size=0.2, random_state=0)
    train_features = tfidf.fit_transform(train_text)
    test_features = tfidf.transform(test_text)
    svc = LinearSVC()
    svc.fit(train_features, train_labels)
    svc_accuracy = accuracy_score(test_labels, svc.predict(test_features))
    svc_finished = time.time()
    print('LinearSVC:',"{:.2f}".format(svc_accuracy*100) + " in {:.2f}s".format(svc_finished-svc_started))

    # ---- CNN (ver. 3) (timer covers tokenizing, training and evaluation) ----
    cnn_started = time.time()
    tokenizer = Tokenizer(num_words=6000)
    tokenizer.fit_on_texts(tweets)
    padded_tweets = pad_sequences(tokenizer.texts_to_sequences(tweets), maxlen=130)
    # String labels -> integer ids -> one-hot rows for the softmax output.
    label_ids = LabelEncoder().fit_transform(sentiments)
    one_hot_labels = numpy.array(to_categorical(label_ids))

    cnn = Sequential([
        Embedding(6000, 100, input_length=130),
        Conv1D(1024, 3, padding='valid', activation='relu', strides=1),
        GlobalMaxPooling1D(),
        Dropout(0.5),
        BatchNormalization(),
        Dropout(0.5),
        Dense(2048, activation='relu'),
        Dropout(0.5),
        BatchNormalization(),
        Dropout(0.5),
        Dense(3, activation='softmax'),
    ])
    cnn.compile(optimizer=Adam(0.001), loss='categorical_crossentropy', metrics=['accuracy'])

    X_train, X_test, y_train, y_test = train_test_split(padded_tweets, one_hot_labels, test_size=0.2, random_state=0)
    history = cnn.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, batch_size=128, verbose=False)
    # evaluate() returns [loss, accuracy] given the metrics compiled above.
    scores = cnn.evaluate(X_test, y_test, verbose=0)
    cnn_finished = time.time()
    print("CNN (ver. 3): %.2f%%" % (scores[1]*100) + " in {:.2f}s".format(cnn_finished-cnn_started))
    # The head is returned last because the prints above would obscure it.
    return Airlines.head()

# Run both airline models; the returned head of the frame is shown as the cell output.
Airline_Analysis()

# 3. The Final Showdown: Let's try to predict the Australian Elections

By this point, we know how to make some pretty good models to classify sentiments. So let's try to predict something very real: elections. In this case, let's use the Australia 2019 Election Tweets to see who's getting love or hate, and then compare it to the real results of the elections.

This dataset has 183.379 tweets, 146.485 of those have locations filled, in the end, we will have 111.137 tweets in Australia, so we can use these tweets to map our analysis in Australia!

The first step is to train our LinearSVC with the sentiment140 tweet database. It is a  database with 1,600,000 tweets (0 = negative, 2 = neutral, 4 = positive).

In [None]:
# Housekeeping before loading the 1.6M-tweet dataset: empty torch's CUDA cache
# and run the garbage collector.
torch.cuda.empty_cache()
memory_clear = gc.collect()

def train_our_model_in_tweets():
    """Fit Tfidf (1-2 grams) + LinearSVC on the sentiment140 tweets and report accuracy.

    The tweet text is cleaned with ``clear_sentence``, the numeric targets are
    mapped to the strings "negative" / "neutral" / "positive", and the model
    is scored on a 20% hold-out split.

    Side effect:
        The fitted ``TfidfVectorizer`` is stored in the module-level name
        ``vector``. The returned classifier can only score text transformed by
        that same vectorizer, and ``Predict_Australia`` looks it up under that
        name.

    Returns:
        The fitted ``LinearSVC``.
    """
    # Previously the vectorizer was a local that vanished when this function
    # returned, so the later `vector.transform(...)` call raised NameError.
    global vector

    # Load the data; the CSV has no header row, so the column names are given here.
    Sentiments = pandas.read_csv('../input/sentiment140/training.1600000.processed.noemoticon.csv', encoding="ISO-8859-1", names=["target", "ids", "date", "flag", "user", "text"])

    # Now, we won't be using any other data other than the text and the sentiment.
    Sentiments = Sentiments[['target','text']]

    # Make the sentiments strings
    sentiment_value = {0: "negative", 2: "neutral", 4: "positive"}
    decode = lambda label: sentiment_value[int(label)]
    x = Sentiments['text'].apply(clear_sentence).tolist()
    y = Sentiments['target'].apply(decode).tolist()

    # Let's use the SVC model we used before.
    starting_time = time.time()
    vector = TfidfVectorizer(ngram_range=(1, 2))
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
    X_training = vector.fit_transform(X_train)
    X_testing = vector.transform(X_test)
    model = LinearSVC()
    model.fit(X_training, y_train)
    y_prediction = model.predict(X_testing)
    accuracy = accuracy_score(y_test, y_prediction)
    ending_time = time.time()
    print('Trained our model in',len(Sentiments.index),'tweets')
    print('Accuracy:',"{:.2f}".format(accuracy*100) + " in {:.2f}s".format(ending_time-starting_time))
    return model

# Keep the fitted classifier for the election analysis below.
trained_model = train_our_model_in_tweets()
# Housekeeping: empty torch's CUDA cache and run the garbage collector.
torch.cuda.empty_cache()
memory_clear = gc.collect()

Now that we fitted our model to understand tweets, let's take a look at the dataset of australian elections

In [None]:
from mpl_toolkits.basemap import Basemap

def Predict_Australia(model, vectorizer=None):
    """Classify the 2019 Australian election tweets and map positive mentions of each party.

    Tweets are joined to their geocoded user location, restricted to a
    bounding box around Australia, cleaned with ``clear_sentence`` and
    labelled by *model*. Tweets predicted 'positive' whose text contains
    'liberal' or 'labor' are scattered over a Basemap background.

    Args:
        model: fitted classifier exposing ``predict``.
        vectorizer: fitted text vectorizer exposing ``transform``; it must be
            the one *model* was trained with. When omitted, the module-level
            name ``vector`` is used (NameError if it is not defined).
    """
    if vectorizer is None:
        # Backward compatible default: the module-level `vector`.
        vectorizer = vector

    # Get the dataset, take a look
    Australia_Tweets = pandas.read_csv('../input/australian-election-2019-tweets/auspol2019.csv')

    # There's an additional CSV with the geolocation, this is nice.
    Australia_geocode = pandas.read_csv('../input/australian-election-2019-tweets/location_geocode.csv')
    Australia_geocode.columns = ['user_location','lat','long']
    Australia_Tweets = Australia_Tweets[['full_text','user_location']]
    # Joins on the only shared column, 'user_location'.
    Australia_Tweets = pandas.merge(Australia_Tweets, Australia_geocode)
    del Australia_geocode

    # A function to get only the data inside australia (there are many tweets from abroad)
    def get_region(data, bot_lat, top_lat, left_lon, right_lon):
        top = data.lat <= top_lat
        bot = data.lat >= bot_lat
        left = data.long >= left_lon
        right = data.long <= right_lon
        index = top&bot&left&right
        return data[index]

    Australia_Tweets = get_region(Australia_Tweets,-44,-10,109,156)
    Australia_Tweets = Australia_Tweets.drop(['user_location'],axis=1)

    Australia_Tweets['full_text'] = Australia_Tweets['full_text'].apply(clear_sentence)
    # Attach the predictions by position. The previous code wrapped them in a
    # new DataFrame (index 0..n-1) and used concat(axis=1), which aligns on the
    # index; the filtered frame keeps its original, non-contiguous index, so
    # sentiments landed on the wrong tweets and unmatched rows were dropped.
    predictions = model.predict(vectorizer.transform(Australia_Tweets['full_text'].tolist()))
    Australia_Tweets['sentiment'] = predictions
    Australia_Tweets = Australia_Tweets.dropna()

    # Same bounding box as the region filter above.
    Australia_map = Basemap(llcrnrlat=-44,urcrnrlat=-10,llcrnrlon=109,urcrnrlon=156)
    matplotlib.figure(figsize=(12,10))
    Australia_map.bluemarble(alpha=0.9)

    is_positive = Australia_Tweets['sentiment'] == 'positive'
    labor_positive = Australia_Tweets[(Australia_Tweets['full_text'].str.contains('labor')) & is_positive]
    liberal_positive = Australia_Tweets[(Australia_Tweets['full_text'].str.contains('liberal')) & is_positive]

    seaborn.scatterplot(x='long', y='lat', data=liberal_positive, alpha=1, s=200, label='Support for Liberals')
    seaborn.scatterplot(x='long', y='lat', data=labor_positive, alpha=0.07, s=200, label='Support for Labor')

    # The Labor points are nearly transparent; make their legend marker opaque.
    # Newer matplotlib names the handle list `legend_handles`, older releases
    # `legendHandles`, so support both.
    legend = matplotlib.gca().get_legend()
    handles = getattr(legend, 'legend_handles', None)
    if handles is None:
        handles = legend.legendHandles
    handles[1].set_alpha(1)
    matplotlib.title("Tweets supporting political parties in Australia, 2019")
    matplotlib.show()

# Label the election tweets with the sentiment140-trained classifier and draw the map.
Predict_Australia(trained_model)