## Data Preparation and Word Embeddings
#### Hannes Kindbom

In [1]:
import numpy as np
import pandas as pd
from gensim.models import FastText
import nltk
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [3]:
#Source: https://www.kaggle.com/crowdflower/twitter-airline-sentiment
df_tweets = pd.read_csv("Dataset/twitter-airline-sentiment/Tweets.csv")

### Clean Data

In [4]:
#Shuffle rows
df_tweets = df_tweets.sample(frac=1, random_state=1).reset_index(drop=True)

# Remove Tags
#df_tweets["text"] = df_tweets['text'].str.replace('((@|#)\w+)','') 

# Remove punctuation 
df_tweets["text"] = df_tweets['text'].str.replace('[^\w\s]','') 

# Remove numbers
df_tweets["text"] = df_tweets['text'].str.replace('[^\D]','') 

df_tweets.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,568198336651649027,positive,1.0,,,Delta,,GenuineJack,,0,JetBlue Ill pass along the advice You guys rock,,2015-02-18 16:00:14 -0800,Massachusetts,Central Time (US & Canada)
1,568438094652956673,negative,0.7036,Lost Luggage,0.7036,United,,vina_love,,0,united I sent you a dm with my file reference ...,,2015-02-19 07:52:57 -0800,ny,Quito
2,567858373527470080,positive,1.0,,,Southwest,,Capt_Smirk,,0,SouthwestAir Black History Commercial is reall...,,2015-02-17 17:29:21 -0800,La Florida,Eastern Time (US & Canada)
3,569336871853170688,negative,1.0,Late Flight,1.0,Southwest,,scoobydoo9749,,0,SouthwestAir why am I still in Baltimore delta...,"[39.1848041, -76.6787131]",2015-02-21 19:24:22 -0800,"Tallahassee, FL",America/Chicago
4,568839199773732864,positive,0.6832,,,Southwest,,laurafall,,0,SouthwestAir SEA to DEN South Sound Volleyball...,,2015-02-20 10:26:48 -0800,,Pacific Time (US & Canada)


### Tokenize data

In [None]:
all_tweets = sum([[nltk.word_tokenize(tok_tweet) for tok_tweet in nltk.sent_tokenize(tweet)] for tweet in df_tweets.text.str.lower()], [])

In [5]:
df_tweets.airline_sentiment.value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

### Create Bag of Words

In [10]:
def BoW(ngram_range, min_df, x_all, x_train, x_test):
    count_vect = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',min_df=min_df, ngram_range=ngram_range)
    count_vect.fit(x_all)

    xtrain_count =  count_vect.transform(x_train)
    xtest_count =  count_vect.transform(x_test)
    return xtrain_count, xtest_count

### Create Word Embeddings

In [None]:
word_model = FastText(min_count=3, window=8, workers=12)
word_model.build_vocab(all_tweets)

In [None]:
word_model.train(all_tweets, total_examples=word_model.corpus_count, epochs=100)

In [None]:
word_model.save('tweets.wv.fasttext')

### Encoding data

In [35]:
# Helper functions
#Returns a "sentence" vector which is the sum of all word vectors in the sentence. Arg. sentence is a list of words in the sentence
def sentence_to_embedding(sentence, a=1000):
    embeddings = []
    for w in sentence:
        try:
            # freq is number of occurences in vocab
            freq = word_model.wv.vocab[w].count if w in word_model.wv.vocab else 0 
            # Get the entity’s representations in vector space, as a 1D numpy array, some normalizing and then append to embeddings
            embeddings.append(word_model.wv.get_vector(w)*a/(a+freq))
        except:
            pass
    if len(embeddings) == 0:
        return np.zeros([word_model.wv.vector_size])
    return np.sum(embeddings, axis=0)


def TransformSentence(sentence):
    
    tokens = np.asarray([nltk.word_tokenize(tok_sent) for tok_sent in nltk.sent_tokenize(sentence.lower())]).flatten()
    output_len = tokens.shape[0]
    sent_embeddings = np.zeros([word_model.wv.vector_size])
    
    for i, token in enumerate(tokens):
            try:
                sent_embeddings = np.add(sent_embeddings, word_model.wv.get_vector(token))
            except:
                pass
            
    sent_embeddings = sentence_to_embedding(tokens)
    
    return sent_embeddings

def TransformFeatures(sentences):
 
    sentences_trans = np.array(list(map(TransformSentence, sentences)))
    
    return sentences_trans

def TransformDataFastText(x_train, x_test):
    
    x_train_trans = TransformFeatures(x_train).reshape((x_train.shape[0], word_model.wv.vector_size))
    x_test_trans = TransformFeatures(x_test).reshape((x_test.shape[0], word_model.wv.vector_size))
    
    return x_train_trans, x_test_trans

### Transform dataset

In [6]:
x_train, x_test, y_train, y_test = train_test_split(
    df_tweets.text.values, df_tweets.airline_sentiment.values, 
    stratify= df_tweets.airline_sentiment.values, test_size=0.2, random_state=2)

In [7]:
word_model = FastText.load('tweets.wv.fasttext')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [8]:
# Transform to sentence vectors with fasttext
x_train_trans_FT, x_test_trans_FT = TransformDataFastText(x_train, x_test)

In [11]:
x_train_bow, x_test_bow = BoW((1,3), 5, df_tweets["text"], x_train, x_test)

### Save wordembeddings

In [36]:
np.save('Embeddings/x_train_bow', x_train_bow) 
np.save('Embeddings/x_test_bow', x_test_bow) 
np.save('Embeddings/x_train_trans_FT', x_train_trans_FT) 
np.save('Embeddings/x_test_trans_FT', x_test_trans_FT)

np.save('Embeddings/y_train', y_train)
np.save('Embeddings/y_test', y_test)

## Demo vanilla SVM (Build, train and evaluate models)

In [37]:
# Load saved embeddings
x_train_bow = np.load('Embeddings/x_train_bow.npy').item().toarray()
x_test_bow = np.load('Embeddings/x_test_bow.npy').item().toarray()
 

In [30]:
def evaluate(clf, clf_name, x_test, y_test):
    print("Accuracy " + clf_name + ": ", clf.score(x_test, y_test))
    
    y_pred = cross_val_predict(clf, x_test, y_test, cv=5) #clf.predict(x_test)
    print("confusion matrix "+ clf_name +": \n" , confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    
def trainAndEvaluate(clf, x_train, y_train, x_test, y_test, clf_name):
    clf.fit(x_train, y_train)
    evaluate(clf, clf_name, x_test, y_test)

In [31]:
clf_svm = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')

In [32]:
#BoW
trainAndEvaluate(clf_svm, x_train_bow, y_train, x_test_bow, y_test, "BOW + SVM")

Accuracy BOW + SVM:  0.7786885245901639
confusion matrix BOW + SVM: 
 [[1574  191   71]
 [ 214  355   51]
 [ 108   87  277]]
              precision    recall  f1-score   support

    negative       0.83      0.86      0.84      1836
     neutral       0.56      0.57      0.57       620
    positive       0.69      0.59      0.64       472

   micro avg       0.75      0.75      0.75      2928
   macro avg       0.70      0.67      0.68      2928
weighted avg       0.75      0.75      0.75      2928



In [33]:
# FastText 
trainAndEvaluate(clf_svm, x_train_trans_FT, y_train, x_test_trans_FT, y_test, "FT")

Accuracy FT:  0.7855191256830601
confusion matrix FT: 
 [[1617  157   62]
 [ 234  326   60]
 [ 126   65  281]]
              precision    recall  f1-score   support

    negative       0.82      0.88      0.85      1836
     neutral       0.59      0.53      0.56       620
    positive       0.70      0.60      0.64       472

   micro avg       0.76      0.76      0.76      2928
   macro avg       0.70      0.67      0.68      2928
weighted avg       0.75      0.76      0.75      2928

