## Data Preparation and Word Embeddings
#### Hannes Kindbom

In [21]:
import numpy as np
import pandas as pd
from gensim.models import FastText
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile, common_texts
import nltk
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [81]:
# Load the airline-tweet sentiment CSV from a path relative to the notebook.
# Source: https://www.kaggle.com/crowdflower/twitter-airline-sentiment
df_tweets = pd.read_csv("Dataset/twitter-airline-sentiment/Tweets.csv")

### Clean Data

In [82]:
# Shuffling is currently disabled, so rows keep their CSV order
#df_tweets = df_tweets.sample(frac=1, random_state=1).reset_index(drop=True)

# Remove @mentions and #hashtags.
# regex=True is passed explicitly: pandas >= 2.0 defaults to regex=False, which would
# treat these patterns as literal text and silently leave the tweets uncleaned.
df_tweets["text"] = df_tweets["text"].str.replace(r'[@#]\w+', '', regex=True)

# Remove punctuation (everything that is neither a word character nor whitespace)
df_tweets["text"] = df_tweets["text"].str.replace(r'[^\w\s]', '', regex=True)

# Remove digits
df_tweets["text"] = df_tweets["text"].str.replace(r'\d', '', regex=True)

# Add categorical number column.
# The displayed rows show the resulting codes: negative -> 0, neutral -> 1, positive -> 2.
df_tweets["airline_sentiment"] = pd.Categorical(df_tweets["airline_sentiment"])
df_tweets["category"] = df_tweets["airline_sentiment"].cat.codes

df_tweets.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone,category
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,What said,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada),1
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,plus youve added commercials to the experienc...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada),2
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,I didnt today Must mean I need to take anothe...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada),1
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,its really aggressive to blast obnoxious ente...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada),0
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,and its a really big bad thing about it,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada),0


### Tokenize data

In [83]:
# Build exactly one token list per tweet so that all_tweets[i] always belongs to
# df_tweets row i (the Doc2Vec cells pair documents and labels by position).
# nltk.word_tokenize splits into sentences internally and returns the flattened tokens,
# so the tokens are unchanged; tweets that are empty after cleaning become [] instead of
# being dropped, and the quadratic sum(..., []) concatenation is avoided.
all_tweets = [nltk.word_tokenize(tweet) for tweet in df_tweets.text.str.lower()]

In [84]:
# Class balance of the sentiment labels (the output below shows that negative dominates)
df_tweets.airline_sentiment.value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

### Create Bag of Words

In [85]:
def BoW(ngram_range, min_df, x_all, x_train, x_test):
    """
    Encode the train and test texts as bag-of-words count matrices.

    The vocabulary is fitted on x_all; x_train and x_test are only transformed
    with the fitted vectorizer.

    param ngram_range: tuple forwarded to CountVectorizer(ngram_range=...)
    param min_df: forwarded to CountVectorizer(min_df=...); removes infrequent words
    param x_all: texts the vocabulary is fitted on
    param x_train: training texts to transform
    param x_test: test texts to transform
    return: tuple (train count matrix, test count matrix)
    """
    vectorizer = CountVectorizer(
        analyzer='word',
        token_pattern=r'\w{1,}',
        min_df=min_df,
        ngram_range=ngram_range,
    )
    vectorizer.fit(x_all)
    # Same fitted vocabulary for both splits, training split first
    train_counts, test_counts = [vectorizer.transform(texts) for texts in (x_train, x_test)]
    return train_counts, test_counts

### Create Document Embeddings

In [6]:
# Tag every tokenized tweet with its position in all_tweets; the trained document
# vectors are later looked up through this tag.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(all_tweets)]
# Train Doc2Vec on the tagged tweets (100-dimensional vectors, 40 epochs).
# NOTE(review): no seed is passed, so results presumably differ between runs - confirm.
doc_model = Doc2Vec(documents, vector_size=100, epochs=40, window=8, min_count=3, workers=12)

In [255]:
# Save the trained Doc2Vec model; the same get_tmpfile path is read back by Doc2Vec.load further down
doc_model.save(get_tmpfile("tweets_doc2vec_model"))

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


### Create Word Embeddings

In [7]:
# Create an untrained FastText model, then build its vocabulary from the tokenized tweets
word_model = FastText(min_count=3, window=8, workers=12)
word_model.build_vocab(all_tweets)

In [8]:
# Train the word embeddings on all tokenized tweets for 200 epochs
word_model.train(all_tweets, total_examples=word_model.corpus_count, epochs=200)

In [9]:
# Save the trained FastText model to the working directory; FastText.load reads this path further down
word_model.save('tweets.wv.fasttext')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## Demo vanilla SVM

In [10]:
# Helper functions
def sentence_to_embedding(sentence, a=1000):
    """
    Return a sentence vector: the frequency-weighted sum of its word vectors.

    Every word vector is scaled by a / (a + freq), where freq is the word's
    count in the FastText vocabulary (0 when the word is not in the vocabulary),
    so frequent words contribute less to the sum.

    param sentence: iterable with the words of the sentence
    param a: constant of the weighting term a / (a + freq)
    return: 1D np array of length word_model.wv.vector_size; all zeros when no
            word of the sentence yields a vector
    """
    wv = word_model.wv
    embeddings = []
    for w in sentence:
        # freq is number of occurrences in vocab
        freq = wv.vocab[w].count if w in wv.vocab else 0
        try:
            # Get the entity's representation in vector space, as a 1D numpy array
            vector = wv.get_vector(w)
        except KeyError:
            # No vector can be produced for this word: skip it. Only KeyError is
            # caught so that genuine failures are no longer silently swallowed.
            continue
        embeddings.append(vector * a / (a + freq))
    if not embeddings:
        return np.zeros([wv.vector_size])
    return np.sum(embeddings, axis=0)


def TransformSentence(sentence):
    """
    Convert one raw text string into its FastText sentence embedding.

    param sentence: text string; it is lower-cased before tokenizing
    return: 1D np array returned by sentence_to_embedding for the text's tokens
    """
    # nltk.word_tokenize splits into sentences internally and returns one flat token
    # list, so texts with several sentences of different lengths are handled too
    # (np.asarray(...).flatten() on such ragged input produced list objects, not tokens).
    tokens = nltk.word_tokenize(sentence.lower())

    # The former accumulation loop was dead code: its result was always overwritten
    # by the sentence_to_embedding call.
    return sentence_to_embedding(tokens)

def TransformFeatures(sentences):
    """
    Embed every sentence with TransformSentence and collect the results.

    param: np array of sentences
    return: np array with one TransformSentence result per sentence, in input order
    """
    embedded = [TransformSentence(text) for text in sentences]
    return np.array(embedded)

def TransformDataFastText(x_train, x_test):
    """
    Turn the train and test texts into FastText sentence-embedding matrices.

    param: np arrays of text
    return: np arrays of numbers, each reshaped to
            (number of texts, word_model.wv.vector_size)
    """
    def embed(texts):
        # One row per text, one column per embedding dimension
        return TransformFeatures(texts).reshape((texts.shape[0], word_model.wv.vector_size))

    return embed(x_train), embed(x_test)

def get_doc_vec(sentence):
    """
    Infer a Doc2Vec vector for one raw text string.

    param sentence: text string; it is lower-cased before tokenizing
    return: vector returned by doc_model.infer_vector for the text's tokens
    """
    # Tokenize the whole text. Taking only element [0] of the per-sentence token
    # lists ignored every sentence after the first and raised IndexError for texts
    # that yield no sentence at all (e.g. empty strings).
    tokens = nltk.word_tokenize(sentence.lower())
    return doc_model.infer_vector(tokens, steps=40, alpha=0.025)

def TransformDataDoc2Vec(x_test):
    """
    Infer a Doc2Vec vector for every text via get_doc_vec.

    param: iterable of text strings
    return: np array with one get_doc_vec result per text, in input order
    """
    inferred = [get_doc_vec(text) for text in x_test]
    return np.array(inferred)

In [11]:
# Reload both models from the paths they were saved to above
word_model = FastText.load('tweets.wv.fasttext')
doc_model = Doc2Vec.load(get_tmpfile("tweets_doc2vec_model"))
# Drop training-only state but keep the doctag vectors and inference support:
# the cells below read doc_model.docvecs and call doc_model.infer_vector.
doc_model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

### Transform dataset

In [86]:
# Hold out 20% of the cleaned tweets as test set; random_state=1 fixes the split.
# Features are the raw cleaned texts, targets are the integer sentiment codes.
x_train, x_test, y_train, y_test = train_test_split(
    df_tweets.text.values, df_tweets.category.values, test_size=0.2, random_state=1)

In [13]:
# Transform to sentence vectors with fasttext
x_train_trans_FT, x_test_trans_FT = TransformDataFastText(x_train, x_test)

In [576]:
# Print the tagged document and the cleaned tweet stored at the same index
print(documents[45])
print(df_tweets.text.values[45])

# Transform to sentence vectors with doc2vec
# Positional split: rows [0, 10000) are used for training, rows [10000, 14000) for testing
d2v_train_rows = slice(0, 10000)
d2v_test_rows = slice(10000, 14000)

x_test_D2V = df_tweets.text.values[d2v_test_rows]
# Test vectors are inferred from the text ...
x_test_trans_D2V = TransformDataDoc2Vec(x_test_D2V)
# ... while training vectors are read from the trained model through each document's first tag
x_train_trans_D2V = np.array(
    [doc_model.docvecs[tagged.tags[0]] for tagged in documents[d2v_train_rows]]
)
y_train_D2V = df_tweets.category.values[d2v_train_rows]
y_test_D2V = df_tweets.category.values[d2v_test_rows]


TaggedDocument(['you', 'make', 'it', 'hard', 'to', 'fly', 'with', 'you', 'delayed', 'over', 'an', 'hour', 'and', 'now', 'the', 'plane', 'is', 'turning', 'around', 'amp', 'heading', 'back', 'to', 'the', 'gate'], [45])
 You make it hard to fly with you Delayed over an hour and now the plane  is turning around amp heading back to the gate 


In [593]:
# Shapes of the Doc2Vec feature matrices and label vectors (test first, then train)
for d2v_array in (x_test_trans_D2V, x_train_trans_D2V, y_test_D2V, y_train_D2V):
    print(d2v_array.shape)

(4000, 100)
(10000, 100)
(4000,)
(10000,)


In [686]:
#Debugging
# Compare a freshly inferred vector for one tweet with a stored training vector
# by computing their cosine similarity.
# NOTE(review): the tweet is row 10241 but the stored vector has tag 10011; the indices
# differ, presumably because all_tweets is not aligned one-to-one with df_tweets rows - confirm.
# Only the first sentence's tokens ([0]) are used for inference.
tokens = [nltk.word_tokenize(tok_sent) for tok_sent in nltk.sent_tokenize(df_tweets.text.values[10241].lower())][0]
doc_vec = doc_model.infer_vector(tokens, steps=100, alpha=0.025)

a = doc_vec
b = doc_model.docvecs[10011]
# cosine similarity = dot(a, b) / (|a| * |b|)
dot = np.dot(a, b)
norma = np.linalg.norm(a)
normb = np.linalg.norm(b)
cos = dot / (norma * normb)
print(cos)

0.67211723


### Build and train model

In [87]:
# Spot check: show one cleaned tweet together with its sentiment label
r = 4233
print(df_tweets["text"][r])
df_tweets["airline_sentiment"][r]

 It is super frustrating that the folks at the United Ticket Counter in Pittsburg arent honoring their own Media Rate 


'negative'

In [88]:
# Bag-of-words counts with ngram_range=(1,3) and min_df=5.
# The vocabulary is fitted on all tweets (df_tweets["text"]), i.e. train and test texts together.
x_train_bow, x_test_bow = BoW((1,3), 5, df_tweets["text"], x_train, x_test)

In [89]:
# BoW SVM
# SVC with default settings (the repr below shows kernel='rbf', C=1.0).
# The evaluation output further down shows this model predicting class 0 for every test tweet.
clf_bow = svm.SVC()
clf_bow.fit(x_train_bow, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [14]:
# FastText SVM
# SVC with default settings, trained on the FastText sentence vectors of the training split
clf_FT = svm.SVC()
clf_FT.fit(x_train_trans_FT,y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [584]:
# Doc2Vec SVM
# SVC with default settings, trained on the stored Doc2Vec vectors of the first 10000 documents
clf_D2V = svm.SVC()
clf_D2V.fit(x_train_trans_D2V,y_train_D2V)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

## Evaluate

In [90]:
# Test-set accuracy; only the BoW line is active, the Doc2Vec and FastText lines are commented out
#print("Accuracy Doc2Vec SVM: ", clf_D2V.score(x_test_trans_D2V,y_test_D2V))
#print("Accuracy FastText SVM: ", clf_FT.score(x_test_trans_FT,y_test))
print("Accuracy BoW SVM: ", clf_bow.score(x_test_bow,y_test))

Accuracy BoW SVM:  0.6236338797814208


In [80]:
# BoW SVM: predictions, confusion matrix and per-class metrics on the test split
y_pred_bow = clf_bow.predict(x_test_bow)
# The label previously said "FastText" although this matrix belongs to the BoW classifier
print("confusion matrix BoW: \n", confusion_matrix(y_test, y_pred_bow))
print(classification_report(y_test, y_pred_bow))

confusion matrix FastText: 
 [[1854    0    0]
 [ 589    0    0]
 [ 485    0    0]]
              precision    recall  f1-score   support

           0       0.63      1.00      0.78      1854
           1       0.00      0.00      0.00       589
           2       0.00      0.00      0.00       485

   micro avg       0.63      0.63      0.63      2928
   macro avg       0.21      0.33      0.26      2928
weighted avg       0.40      0.63      0.49      2928



In [34]:
# FastText SVM: predictions, confusion matrix and per-class metrics on the test split
y_pred_FT = clf_FT.predict(x_test_trans_FT)
print("confusion matrix FastText: \n", confusion_matrix(y_test, y_pred_FT))
print(classification_report(y_test, y_pred_FT))

confusion matrix FastText: 
 [[1852    2    0]
 [ 541   40    8]
 [ 431    7   47]]
              precision    recall  f1-score   support

           0       0.66      1.00      0.79      1854
           1       0.82      0.07      0.13       589
           2       0.85      0.10      0.17       485

   micro avg       0.66      0.66      0.66      2928
   macro avg       0.78      0.39      0.36      2928
weighted avg       0.72      0.66      0.56      2928



In [595]:
# Doc2Vec SVM: predictions, confusion matrix and per-class metrics on the positional
# Doc2Vec test rows (y_test_D2V), not on the train_test_split test set
y_pred_D2V = clf_D2V.predict(x_test_trans_D2V)
print("confusion matrix Doc2Vec: \n", confusion_matrix(y_test_D2V,y_pred_D2V))
print(classification_report(y_test_D2V, y_pred_D2V))

confusion matrix Doc2Vec: 
 [[2485    0    0]
 [ 849    0    0]
 [ 666    0    0]]
              precision    recall  f1-score   support

           0       0.62      1.00      0.77      2485
           1       0.00      0.00      0.00       849
           2       0.00      0.00      0.00       666

   micro avg       0.62      0.62      0.62      4000
   macro avg       0.21      0.33      0.26      4000
weighted avg       0.39      0.62      0.48      4000



  'precision', 'predicted', average, warn_for)


### Tests - Simon