## Load Modules

In [0]:
# Install the PyDrive wrapper & import libraries.
# NOTE(review): the install is unpinned (-U upgrades to whatever is latest);
# consider pinning a version so the notebook stays reproducible.

!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials


# Authenticate the Colab user, then build the PyDrive client from the
# application-default credentials. The order matters: the credentials are
# fetched only after auth.authenticate_user() has run.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Build a PyDrive file object from its Drive file ID. The content itself is
# read in the next cell through data.GetContentString().
# A file ID looks like: laggVyWshwcyP6kEI-y_W3P8D26sz
file_id = '1GhyH4k9C4uPRnMAMKhJYOqa-V9Tqt4q8' # Drive ID of the tweets CSV; replace to load another file
data = drive.CreateFile({'id': file_id})
# Debug aid (the file object is named `data`, not `downloaded`):
#print('Downloaded content "{}"'.format(data.GetContentString()))

## Read the data

In [2]:
import io
import pandas as pd
# Wrap the CSV text held by the Drive file object in an in-memory buffer,
# parse it into a DataFrame (rebinding `data`), and preview the first rows.
csv_buffer = io.StringIO(data.GetContentString())
data = pd.read_csv(csv_buffer)
data.head()

Unnamed: 0.1,Unnamed: 0,label,tweet
0,0,0.0,user when a father is dysfunctional and is s...
1,1,0.0,user user thanks for lyft credit i can t us...
2,2,0.0,bihday your majesty
3,3,0.0,model i love u take with u all the time in ...
4,4,0.0,factsguide society now motivation


## Load Flair and PyTorch Library

In [3]:
import torch
# Alternative: install the released package from PyPI instead of from source.
# !pip install flair
# Install flair from the current HEAD of its GitHub repository.
# NOTE(review): this is unpinned; the install log shows it built flair 0.4.5
# when this notebook was run. Later cells import BertEmbeddings and
# ELMoEmbeddings from flair.embeddings - confirm these still exist before
# re-running against a newer HEAD, or pin the version.
!pip install --upgrade git+https://github.com/flairNLP/flair.git
import flair

Collecting git+https://github.com/flairNLP/flair.git
  Cloning https://github.com/flairNLP/flair.git to /tmp/pip-req-build-vh3fun8b
  Running command git clone -q https://github.com/flairNLP/flair.git /tmp/pip-req-build-vh3fun8b
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: flair
  Building wheel for flair (PEP 517) ... [?25l[?25hdone
  Created wheel for flair: filename=flair-0.4.5-cp36-none-any.whl size=148505 sha256=6b0c101b56c9cbbdeea11c328d44d8b56b98651c87ffd1f113766f848481367c
  Stored in directory: /tmp/pip-ephem-wheel-cache-hmojndlu/wheels/84/82/73/d2b3b59b7be74ea05f2c6d64132efe27df52daffb47d1dc7bb
Successfully built flair
Installing collected packages: flair
  Found existing installation: flair 0.4.5
    Uninstalling flair-0.4.5:
      Successfully uninstalled flair-0.4.5
Successfully installed flair-0.4.5


In [4]:
from flair.data import Sentence
# Build a flair Sentence from raw text; it holds the text as a list of tokens
sentence = Sentence('Blogs of Analytics Vidhya are Awesome.')
# Printing it shows the original text together with its token count
print(sentence)

Sentence: "Blogs of Analytics Vidhya are Awesome."   [− Tokens: 6]


In [5]:
# Keep only the tweet column of the dataframe
text = data['tweet']
# Convert the column to a plain Python list of tweets called txt
txt = text.tolist()
# Preview the first ten tweets
print(txt[:10])

['  user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction     run', ' user  user thanks for  lyft credit i can t use cause they don t offer wheelchair vans in pdx      disapointed  getthanked', '  bihday your majesty', ' model   i love u take with u all the time in ur                                      ', ' factsguide  society now     motivation', '      huge fan fare and big talking before they leave  chaos and pay disputes when they get there   allshowandnogo  ', '  user camping tomorrow  user  user  user  user  user  user  user danny   ', 'the next school year is the year for exams      can t think about that       school  exams    hate  imagine  actorslife  revolutionschool  girl', 'we won    love the land     allin  cavs  champions  cleveland  clevelandcavaliers      ', '  user  user welcome here    i m   it s so  gr    ']


## Word Embedding Using Flair

In [6]:
# Import the embeddings
from flair.embeddings import WordEmbeddings
from flair.embeddings import CharacterEmbeddings
from flair.embeddings import StackedEmbeddings
from flair.embeddings import FlairEmbeddings
from flair.embeddings import BertEmbeddings
from flair.embeddings import ELMoEmbeddings
from flair.embeddings import FlairEmbeddings

# Initialise embeddings. Only the two "fast" Flair language models are
# active; un-comment any of the alternatives below to try them instead.
#glove_embedding = WordEmbeddings('glove')
#character_embeddings = CharacterEmbeddings()
#bert_embedding = BertEmbeddings()
#elmo_embedding = ELMoEmbeddings()
flair_forward = FlairEmbeddings('news-forward-fast')
flair_backward = FlairEmbeddings('news-backward-fast')

# Stack the embeddings: bundle the forward and backward models into a single
# embedding object so each word gets one combined representation.
stacked_embeddings = StackedEmbeddings(embeddings=[flair_forward, flair_backward])

2020-05-20 15:58:44,732 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-1024-v0.2rc.pt not found in cache, downloading to /tmp/tmpu24x78y5


100%|██████████| 19689779/19689779 [00:02<00:00, 9781192.76B/s]

2020-05-20 15:58:47,403 copying /tmp/tmpu24x78y5 to cache at /root/.flair/embeddings/lm-news-english-backward-1024-v0.2rc.pt





2020-05-20 15:58:47,430 removing temp file /tmp/tmpu24x78y5


## Test stacked embeddings



In [17]:
# Smoke-test the stacked embeddings on a short example sentence
sentence = Sentence('These blogs are awesome.')
# Attach an embedding to every token of the sentence
stacked_embeddings.embed(sentence)
# Show the embedding vector of each token in turn
for token in sentence:
  print(token.embedding)
# `token` still refers to the last token of the loop here;
# report the type and the length of its embedding vector
print(type(token.embedding))
print(token.embedding.shape[0])

tensor([ 3.8988e-03, -3.7598e-05, -3.4552e-03,  ...,  1.1473e-09,
        -7.2948e-07,  5.2823e-02])
tensor([-4.9989e-02, -6.4013e-05,  6.3951e-03,  ...,  3.6644e-09,
         4.5787e-07, -6.5868e-02])
tensor([-4.9550e-03, -1.2693e-04,  2.1048e-02,  ...,  1.2971e-07,
         4.0299e-07, -1.2174e-01])
tensor([-7.3077e-05, -5.1575e-06,  2.3907e-02,  ...,  2.4966e-08,
         6.5300e-09, -1.1079e-01])
<class 'torch.Tensor'>
2048


## Vectorizing the text
We will be using two approaches for vectorizing the text.

## Mean of Word Embeddings within a Tweet
In this approach, for each sentence we do the following:
1. Generate word embeddings for each word
2. Calculate the mean of the word embeddings to get the embedding of the sentence

In [0]:
from tqdm import tqdm # Progress bar to track progress

# Length of one stacked token embedding. Taken from the embedding object so
# this cell does not depend on a `z` left over from an earlier kernel session
# (the test cell above printed 2048 for the two "fast" Flair models).
z = stacked_embeddings.embedding_length


def mean_token_embedding(token_vectors, dim):
  """Average a list of 1-D token embeddings into one sentence embedding.

  Args:
    token_vectors: list of 1-D tensors, one per token, each of length `dim`.
    dim: embedding length; used to build the fallback vector.

  Returns:
    A 1-D tensor of length `dim`: the element-wise mean of `token_vectors`,
    or all zeros when the list is empty (a tweet without tokens), so that
    every tweet still yields exactly one row.
  """
  if not token_vectors:
    return torch.zeros(dim)
  return torch.stack(token_vectors).mean(dim=0)


# One mean embedding per tweet, collected in a list and stacked once at the
# end; growing a tensor with torch.cat on every step re-copies it each time.
sentence_vectors = []

# Iterate over each tweet
for tweet in tqdm(txt):
  sentence = Sentence(tweet)
  # Skip the embedding call for tweets that produced no tokens
  if len(sentence) > 0:
    stacked_embeddings.embed(sentence)
  # Collect the embedding of every word; moved to CPU so all rows share a device
  w = [token.embedding.cpu() for token in sentence]
  # Append exactly one row per tweet, after all of its words are collected,
  # so that row i of `s` corresponds to txt[i]
  sentence_vectors.append(mean_token_embedding(w, z))

# Tensor of shape (number of tweets, z) holding the sentence embeddings
s = torch.stack(sentence_vectors) if sentence_vectors else torch.zeros(0, z)

  1%|          | 514/49159 [02:01<4:54:25,  2.75it/s]