## Data Preparation and Word Embeddings
#### Hannes Kindbom

In [58]:
import numpy as np
import pandas as pd
from gensim.models import FastText
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile, common_texts
import nltk
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [59]:
# Source: https://www.kaggle.com/crowdflower/twitter-airline-sentiment
# Read the airline-sentiment tweets CSV (path is relative to the notebook's working directory).
df_tweets = pd.read_csv(filepath_or_buffer="Dataset/twitter-airline-sentiment/Tweets.csv")

### Clean Data

In [60]:
# Shuffle rows with a fixed seed so the order is reproducible, then renumber the index 0..n-1.
df_tweets = df_tweets.sample(frac=1, random_state=1).reset_index(drop=True)

# Remove tags (currently disabled)
#df_tweets["text"] = df_tweets["text"].str.replace(r'((@|#)\w+)', '', regex=True)

# Remove punctuation: drop every character that is neither a word character nor whitespace.
# regex=True is passed explicitly: the default of Series.str.replace became literal matching
# in pandas 2.0, which would silently turn these pattern-based replacements into no-ops.
# Raw strings avoid invalid-escape warnings for \w, \s and \d.
df_tweets["text"] = df_tweets["text"].str.replace(r'[^\w\s]', '', regex=True)

# Remove numbers (r'\d' matches exactly the characters the original '[^\D]' matched).
df_tweets["text"] = df_tweets["text"].str.replace(r'\d', '', regex=True)

df_tweets.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,568198336651649027,positive,1.0,,,Delta,,GenuineJack,,0,JetBlue Ill pass along the advice You guys rock,,2015-02-18 16:00:14 -0800,Massachusetts,Central Time (US & Canada)
1,568438094652956673,negative,0.7036,Lost Luggage,0.7036,United,,vina_love,,0,united I sent you a dm with my file reference ...,,2015-02-19 07:52:57 -0800,ny,Quito
2,567858373527470080,positive,1.0,,,Southwest,,Capt_Smirk,,0,SouthwestAir Black History Commercial is reall...,,2015-02-17 17:29:21 -0800,La Florida,Eastern Time (US & Canada)
3,569336871853170688,negative,1.0,Late Flight,1.0,Southwest,,scoobydoo9749,,0,SouthwestAir why am I still in Baltimore delta...,"[39.1848041, -76.6787131]",2015-02-21 19:24:22 -0800,"Tallahassee, FL",America/Chicago
4,568839199773732864,positive,0.6832,,,Southwest,,laurafall,,0,SouthwestAir SEA to DEN South Sound Volleyball...,,2015-02-20 10:26:48 -0800,,Pacific Time (US & Canada)


### Tokenize data

In [61]:
# Tokenize the corpus: lower-case each tweet, split it into sentences, then split every
# sentence into word tokens.
# The result is ONE flat list holding one token list per *sentence* (not per tweet), so a
# position in all_tweets only corresponds to the same row of df_tweets as long as every
# tweet yields exactly one sentence.
# A nested comprehension replaces the original sum(list_of_lists, []), which copied the
# accumulated list once per tweet (quadratic in corpus size) to build the same value.
all_tweets = [
    nltk.word_tokenize(tok_sent)
    for tweet in df_tweets.text.str.lower()
    for tok_sent in nltk.sent_tokenize(tweet)
]

In [62]:
# Class balance: number of tweets per sentiment label.
df_tweets["airline_sentiment"].value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

### Create Bag of Words

In [6]:
def BoW(ngram_range, min_df, x_all, x_train, x_test):
    """Encode train and test texts as bag-of-words count matrices.

    param ngram_range: (min_n, max_n) tuple forwarded to CountVectorizer
    param min_df: document-frequency threshold forwarded to CountVectorizer (drops infrequent terms)
    param x_all: texts the vocabulary is learned from
    param x_train: training texts to transform
    param x_test: test texts to transform
    return: (train counts, test counts) as produced by CountVectorizer.transform

    Note: the vocabulary is fitted on x_all, not on x_train only, so whatever the caller
    passes as x_all (including test texts) influences the feature set.
    """
    vectorizer = CountVectorizer(
        analyzer='word',
        token_pattern=r'\w{1,}',
        min_df=min_df,
        ngram_range=ngram_range,
    )
    # Learn the vocabulary once, then reuse it for both splits.
    vectorizer.fit(x_all)
    return vectorizer.transform(x_train), vectorizer.transform(x_test)

### Create Document Embeddings

In [41]:
# Wrap every token list in all_tweets in a TaggedDocument whose single tag is its position
# in all_tweets; that integer is the key later used to look the trained vector up again.
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(all_tweets)]
# Train Doc2Vec on those documents: 100-dimensional vectors, 40 epochs, context window of 8,
# and min_count=3.
# NOTE(review): no seed is set and workers=12 is hard-coded; presumably results vary between
# runs and the worker count should match the machine - confirm.
doc_model = Doc2Vec(documents, vector_size=100, epochs=40, window=8, min_count=3, workers=12)

In [46]:
# Persist the trained Doc2Vec model under the path returned by get_tmpfile; the load cell
# further down reads it back from the same get_tmpfile name.
# NOTE(review): presumably this is a temporary location that may not survive a reboot - confirm.
doc_model.save(get_tmpfile("tweets_doc2vec_model"))

### Create Word Embeddings

In [63]:
# Create an untrained FastText model (min_count=3, context window of 8, 12 workers) and
# build its vocabulary from the tokenized sentences in all_tweets.
# Training itself happens in the next cell.
word_model = FastText(min_count=3, window=8, workers=12)
word_model.build_vocab(all_tweets)

In [64]:
# Train the FastText model on all_tweets for 40 epochs; total_examples is taken from the
# corpus_count recorded on the model (set by the earlier build_vocab call - presumably the
# number of sentences seen there; confirm against the gensim version in use).
word_model.train(all_tweets, total_examples=word_model.corpus_count, epochs=40)

In [65]:
# Save the trained FastText model next to the notebook; it is reloaded from this same
# file name before the dataset is transformed.
word_model.save('tweets.wv.fasttext')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## Demo vanilla SVM

In [45]:
# Helper functions
# Returns a "sentence" vector: the sum of the word vectors in the sentence, each scaled by a/(a+freq) where freq is the word's count in the vocabulary. Arg. sentence is a list of words in the sentence
def sentence_to_embedding(sentence, a=1000):
    """Build one vector for a sentence as a frequency-weighted sum of its word vectors.

    Every word vector is scaled by a / (a + freq), where freq is the word's count in the
    FastText vocabulary (0 when the word is not in the vocabulary), so frequent words
    contribute less to the sum.

    param sentence: iterable of word tokens (strings)
    param a: smoothing constant of the weighting; larger values flatten the weighting
    return: 1D numpy array of length word_model.wv.vector_size; all zeros when no token
            could be embedded
    """
    embeddings = []
    for w in sentence:
        # Entries that are not strings (e.g. nested lists coming from a ragged token array)
        # cannot be looked up; skip them explicitly instead of relying on a blanket except.
        if not isinstance(w, str):
            continue
        # freq is number of occurrences in vocab
        freq = word_model.wv.vocab[w].count if w in word_model.wv.vocab else 0
        try:
            vector = word_model.wv.get_vector(w)
        except KeyError:
            # gensim signals "no vector can be built for this token" with KeyError.
            # Only that case is ignored, so genuine failures (for example an incompatible
            # model API) surface instead of silently producing all-zero embeddings.
            continue
        # Down-weight by frequency, then collect.
        embeddings.append(vector*a/(a+freq))
    if not embeddings:
        return np.zeros([word_model.wv.vector_size])
    return np.sum(embeddings, axis=0)


def TransformSentence(sentence):
    """Turn one raw text into its frequency-weighted FastText sentence vector.

    The text is lower-cased, split into sentences and then into word tokens; the tokens
    of all sentences are concatenated into one flat list before embedding.

    param sentence: raw text (str)
    return: 1D numpy array as returned by sentence_to_embedding
    """
    # Flatten with a comprehension rather than np.asarray(...).flatten(): when the text has
    # several sentences of different lengths the array form is ragged and does not flatten
    # to individual tokens.
    tokens = [
        token
        for tok_sent in nltk.sent_tokenize(sentence.lower())
        for token in nltk.word_tokenize(tok_sent)
    ]
    # The previous unweighted summation loop was dead code (its result was overwritten by
    # this call), so only the weighted embedding is computed.
    return sentence_to_embedding(tokens)

def TransformFeatures(sentences):
    """
    param: np array of sentences
    return: np array holding one TransformSentence result per input sentence, in input order
    """
    return np.array([TransformSentence(text) for text in sentences])

def TransformDataFastText(x_train, x_test):
    """
    Embed the train and test texts with the FastText sentence vectors.

    param: np arrays of text
    return: np arrays of numbers, each reshaped to (number of texts, word_model.wv.vector_size)
    """
    train_vectors = TransformFeatures(x_train)
    x_train_trans = train_vectors.reshape((x_train.shape[0], word_model.wv.vector_size))

    test_vectors = TransformFeatures(x_test)
    x_test_trans = test_vectors.reshape((x_test.shape[0], word_model.wv.vector_size))

    return x_train_trans, x_test_trans

def get_doc_vec(sentence):
    """Infer a Doc2Vec vector for one raw text.

    The text is lower-cased and tokenized; the tokens of ALL its sentences are used
    (previously only the first sentence was kept and empty text raised IndexError).

    param sentence: raw text (str)
    return: 1D numpy array of length doc_model.vector_size; all zeros when the text
            contains no tokens
    """
    tokens = [
        token
        for tok_sent in nltk.sent_tokenize(sentence.lower())
        for token in nltk.word_tokenize(tok_sent)
    ]
    if not tokens:
        # Nothing to infer from; return a neutral vector so callers can still stack results.
        # float32 is chosen to match the vectors gensim returns - TODO confirm for the
        # installed gensim version.
        return np.zeros(doc_model.vector_size, dtype=np.float32)
    # 40 inference passes starting at learning rate 0.025 (gensim 3.x keyword 'steps').
    doc_vec = doc_model.infer_vector(tokens, steps=40, alpha=0.025)
    return doc_vec

def TransformDataDoc2Vec(x_test):
    """Infer a Doc2Vec vector for every text in x_test.

    param x_test: iterable of raw texts
    return: np array with one get_doc_vec result per text, in input order
    """
    return np.array([get_doc_vec(text) for text in x_test])

In [66]:
# Reload the FastText model from the file written by the save cell above, so the helper
# functions (which read the global word_model) can run without retraining.
word_model = FastText.load('tweets.wv.fasttext')

In [None]:
# Reload the Doc2Vec model from the get_tmpfile path it was saved under.
doc_model = Doc2Vec.load(get_tmpfile("tweets_doc2vec_model"))
# Drop temporary training state while keeping the per-document vectors and the ability to
# infer new vectors (both flags are set to True), since later cells use docvecs and infer_vector.
# NOTE(review): this method belongs to the older gensim API used throughout this notebook -
# confirm it exists in the installed version.
doc_model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

### Transform dataset

In [67]:
# Hold out 20% of the tweets for testing; stratifying on the sentiment label keeps the
# class proportions equal in both splits, and the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df_tweets["text"].values,
    df_tweets["airline_sentiment"].values,
    test_size=0.2,
    random_state=2,
    stratify=df_tweets["airline_sentiment"].values,
)

In [68]:
# Embed both splits as FastText sentence vectors (one row per tweet).
x_train_trans_FT, x_test_trans_FT = TransformDataFastText(x_train=x_train, x_test=x_test)

In [51]:
# Sanity check: show the tagged document at position 45 next to the cleaned tweet text in
# row 45, to eyeball whether documents and df_tweets rows line up.
print(documents[45])
print(df_tweets.text.values[45])

# Build the Doc2Vec train/test data with a positional split instead of train_test_split:
# rows 0..9999 are training data, rows 10000..13999 are test data.
# Test vectors are inferred from the raw text of the test rows.
x_test_D2V = df_tweets.text.values[10000:14000]
x_test_trans_D2V = TransformDataDoc2Vec(x_test_D2V)
# Training vectors are the vectors learned during Doc2Vec training, looked up by the tag of
# the first 10000 entries of documents.
# NOTE(review): documents holds one entry per tokenized sentence, so this assumes entry i
# belongs to row i of df_tweets (every tweet yields exactly one sentence) - verify, because
# any offset pairs the vectors with the wrong labels below.
x_train_trans_D2V = np.array([doc_model.docvecs[doc.tags[0]] for doc in documents[0:10000]])
# Labels taken from the same row ranges as the features above.
y_train_D2V = df_tweets.airline_sentiment.values[0:10000]
y_test_D2V = df_tweets.airline_sentiment.values[10000:14000]

TaggedDocument(['you', 'make', 'it', 'hard', 'to', 'fly', 'with', 'you', 'delayed', 'over', 'an', 'hour', 'and', 'now', 'the', 'plane', 'is', 'turning', 'around', 'amp', 'heading', 'back', 'to', 'the', 'gate'], [45])
 You make it hard to fly with you Delayed over an hour and now the plane  is turning around amp heading back to the gate 


In [686]:
# Debugging: compare a freshly inferred Doc2Vec vector with a vector stored during training.
# Tokenize row 10241 of the cleaned tweets, keeping only its first sentence.
tokens = [nltk.word_tokenize(tok_sent) for tok_sent in nltk.sent_tokenize(df_tweets.text.values[10241].lower())][0]
# Infer a vector for those tokens (100 inference steps here, versus 40 in get_doc_vec).
doc_vec = doc_model.infer_vector(tokens, steps=100, alpha=0.025)

# Cosine similarity between the inferred vector and the trained vector tagged 10011.
# NOTE(review): the row index (10241) and the document tag (10011) differ - confirm they are
# meant to refer to the same tweet.
a = doc_vec
b = doc_model.docvecs[10011]
dot = np.dot(a, b)
norma = np.linalg.norm(a)
normb = np.linalg.norm(b)
cos = dot / (norma * normb)
print(cos)

0.67211723


### Build and train model

In [69]:
def evaluate(clf, clf_name, x_test, y_test):
    """Print accuracy, confusion matrix and classification report of a fitted classifier.

    param clf: fitted classifier exposing score() and predict()
    param clf_name: label (str) used in the printed headings
    param x_test: test features
    param y_test: true test labels
    return: None (results are printed only)
    """
    accuracy_heading = "Accuracy " + clf_name + ": "
    print(accuracy_heading, clf.score(x_test, y_test))

    predicted = clf.predict(x_test)
    matrix_heading = "confusion matrix "+ clf_name +": \n"
    print(matrix_heading, confusion_matrix(y_test, predicted))
    print(classification_report(y_test, predicted))
    
def trainAndEvaluate(clf, x_train, y_train, x_test, y_test, clf_name):
    """Fit clf on the training data, print its test metrics via evaluate, and return it.

    param clf: classifier exposing fit(), score() and predict()
    param x_train, y_train: training features and labels
    param x_test, y_test: test features and labels
    param clf_name: label used in the printed headings
    return: the same clf object, now fitted
    """
    clf.fit(x_train, y_train)
    evaluate(clf=clf, clf_name=clf_name, x_test=x_test, y_test=y_test)
    return clf

In [None]:
# Bag-of-words features: 1- to 3-grams, min_df of 5, vocabulary learned from all tweets.
x_train_bow, x_test_bow = BoW(
    ngram_range=(1,3),
    min_df=5,
    x_all=df_tweets["text"],
    x_train=x_train,
    x_test=x_test,
)

In [55]:
# BoW: random forest on the bag-of-words counts
clf_bow = RandomForestClassifier(random_state=0) #svm.SVC()
trainAndEvaluate(
    clf=clf_bow,
    x_train=x_train_bow,
    y_train=y_train,
    x_test=x_test_bow,
    y_test=y_test,
    clf_name="BOW",
)



Accuracy BOW:  0.7421448087431693
confusion matrix BOW: 
 [[1726   85   25]
 [ 340  234   46]
 [ 192   67  213]]
              precision    recall  f1-score   support

    negative       0.76      0.94      0.84      1836
     neutral       0.61      0.38      0.47       620
    positive       0.75      0.45      0.56       472

   micro avg       0.74      0.74      0.74      2928
   macro avg       0.71      0.59      0.62      2928
weighted avg       0.73      0.74      0.72      2928



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [56]:
# FastText: random forest on the frequency-weighted sentence vectors
clf_FT = RandomForestClassifier(random_state=0) #svm.SVC()
trainAndEvaluate(
    clf=clf_FT,
    x_train=x_train_trans_FT,
    y_train=y_train,
    x_test=x_test_trans_FT,
    y_test=y_test,
    clf_name="FT",
)



Accuracy FT:  0.7172131147540983
confusion matrix FT: 
 [[1705  105   26]
 [ 340  230   50]
 [ 232   75  165]]
              precision    recall  f1-score   support

    negative       0.75      0.93      0.83      1836
     neutral       0.56      0.37      0.45       620
    positive       0.68      0.35      0.46       472

   micro avg       0.72      0.72      0.72      2928
   macro avg       0.66      0.55      0.58      2928
weighted avg       0.70      0.72      0.69      2928



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [57]:
# Doc2Vec: random forest trained on stored document vectors, tested on inferred vectors
# NOTE(review): the recorded accuracy for this run is below what always predicting the
# largest class would give on the same test rows - worth checking that the training vectors
# and labels refer to the same tweets.
clf_D2V = RandomForestClassifier(random_state=0) #svm.SVC()
trainAndEvaluate(
    clf=clf_D2V,
    x_train=x_train_trans_D2V,
    y_train=y_train_D2V,
    x_test=x_test_trans_D2V,
    y_test=y_test_D2V,
    clf_name="D2V",
)



Accuracy D2V:  0.27875
confusion matrix D2V: 
 [[ 326 2125   34]
 [  55  782   12]
 [  49  610    7]]
              precision    recall  f1-score   support

    negative       0.76      0.13      0.22      2485
     neutral       0.22      0.92      0.36       849
    positive       0.13      0.01      0.02       666

   micro avg       0.28      0.28      0.28      4000
   macro avg       0.37      0.35      0.20      4000
weighted avg       0.54      0.28      0.22      4000



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

### Tests - Simon