## Data Preparation and Word Embeddings

In [218]:
import numpy as np
import pandas as pd
from gensim.models import FastText
import nltk
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score

In [220]:
#Source: https://www.kaggle.com/crowdflower/twitter-airline-sentiment
df_tweets = pd.read_csv("Dataset/twitter-airline-sentiment/Tweets.csv")

### Clean Data

In [221]:
#Shuffle rows
df_tweets = df_tweets.sample(frac=1, random_state=1).reset_index(drop=True)

# Remove Tags
df_tweets["text"] = df_tweets['text'].str.replace('((@|#)\w+)','') 

# Remove punctuation 
df_tweets["text"] = df_tweets['text'].str.replace('[^\w\s]','') 

# Remove numbers
df_tweets["text"] = df_tweets['text'].str.replace('[^\D]','') 

# Add categorical number column
df_tweets.airline_sentiment = pd.Categorical(df_tweets.airline_sentiment)
df_tweets['category'] = df_tweets.airline_sentiment.cat.codes

df_tweets.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone,category
0,568198336651649027,positive,1.0,,,Delta,,GenuineJack,,0,Ill pass along the advice You guys rock,,2015-02-18 16:00:14 -0800,Massachusetts,Central Time (US & Canada),2
1,568438094652956673,negative,0.7036,Lost Luggage,0.7036,United,,vina_love,,0,I sent you a dm with my file reference number...,,2015-02-19 07:52:57 -0800,ny,Quito,0
2,567858373527470080,positive,1.0,,,Southwest,,Capt_Smirk,,0,Black History Commercial is really sweet Well...,,2015-02-17 17:29:21 -0800,La Florida,Eastern Time (US & Canada),2
3,569336871853170688,negative,1.0,Late Flight,1.0,Southwest,,scoobydoo9749,,0,why am I still in Baltimore is doing laps ar...,"[39.1848041, -76.6787131]",2015-02-21 19:24:22 -0800,"Tallahassee, FL",America/Chicago,0
4,568839199773732864,positive,0.6832,,,Southwest,,laurafall,,0,SEA to DEN South Sound Volleyball team on its...,,2015-02-20 10:26:48 -0800,,Pacific Time (US & Canada),2


### Tokenize data

In [222]:
all_tweets = sum([[nltk.word_tokenize(tok_tweet) for tok_tweet in nltk.sent_tokenize(tweet)] for tweet in df_tweets.text.str.lower()], [])

In [223]:
#Read some tweets
for tweet in all_tweets[0:10]:
    print(tweet)

['ill', 'pass', 'along', 'the', 'advice', 'you', 'guys', 'rock']
['i', 'sent', 'you', 'a', 'dm', 'with', 'my', 'file', 'reference', 'number', 'i', 'just', 'want', 'to', 'know', 'if', 'someone', 'has', 'located', 'my', 'bag', 'even', 'if', 'its', 'not', 'here', 'yet']
['black', 'history', 'commercial', 'is', 'really', 'sweet', 'well', 'done']
['why', 'am', 'i', 'still', 'in', 'baltimore', 'is', 'doing', 'laps', 'around', 'us', 'and', 'laughing', 'about', 'it', 'ridiculous']
['sea', 'to', 'den', 'south', 'sound', 'volleyball', 'team', 'on', 'its', 'way', 'httptcotncxcldm']
['one', 'of', 'your', 'workers', 'refused', 'to', 'give', 'me', 'her', 'name', 'as', 'a', 'reference', 'for', 'my', 'notes', 'her', 'tone', 'amp', 'language', 'was', 'very', 'unprofessional']
['seats', 'that', 'were', 'assigned', 'are', 'inappropriate', 'for', 'child', 'this', 'age', 'aa', 'knew', 'age', 'of', 'child']
['now', 'you', 'change', 'my', 'gate', 'and', 'dont', 'tell', 'me', 'what', 'the', 'fuck', 'is', 'wro

In [224]:
df_tweets.airline_sentiment.value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

### Create Word Embeddings

In [225]:
word_model = FastText(min_count=3, window=8, workers=12)
word_model.build_vocab(all_tweets)

In [226]:
word_model.train(all_tweets, total_examples=word_model.corpus_count, epochs=25)

In [227]:
word_model.save('tweets.wv.fasttext')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## Demo vanilla SVM

In [232]:
# Helper functions
#Returns a "sentence" vector which is the sum of all word vectors in the sentence. Arg. sentence is a list of words in the sentence
def sentence_to_embedding(sentence, a=1000):
    embeddings = []
    for w in sentence:
        try:
            # freq is number of occurences in vocab
            freq = word_model.wv.vocab[w].count if w in word_model.wv.vocab else 0 
            # Get the entity’s representations in vector space, as a 1D numpy array, some normalizing and then append to embeddings
            embeddings.append(word_model.wv.get_vector(w)*a/(a+freq))
        except:
            pass
    if len(embeddings) == 0:
        return np.zeros([word_model.wv.vector_size])
    return np.sum(embeddings, axis=0)


def TransformSentence(sentence):
    
    tokens = np.asarray([nltk.word_tokenize(tok_sent) for tok_sent in nltk.sent_tokenize(sentence.lower())]).flatten()
    output_len = tokens.shape[0]
    sent_embeddings = np.zeros([word_model.wv.vector_size])
    
    for i, token in enumerate(tokens):
            try:
                sent_embeddings = np.add(sent_embeddings, word_model.wv.get_vector(token))
            except:
                pass
            
    sent_embeddings = sentence_to_embedding(tokens)
    
    return sent_embeddings

def TransformFeatures(sentences):
    """
    param: np array of sentences
    return: np array (
    """
    sentences_trans = np.array(list(map(TransformSentence, sentences)))
    
    return sentences_trans

def TransformData(x_train, x_test):
    """
    param: np arrays of text
    return: np arrays of numbers
    """
    
    x_train_trans = TransformFeatures(x_train).reshape((x_train.shape[0], word_model.wv.vector_size))
    x_test_trans = TransformFeatures(x_test).reshape((x_test.shape[0], word_model.wv.vector_size))
    
    
    return x_train_trans, x_test_trans

In [75]:
word_model = FastText.load('tweets.wv.fasttext')

### Transform dataset

In [229]:
x_train, x_test, y_train, y_test = train_test_split(
    df_tweets.text.values, df_tweets.category.values, test_size=0.2, random_state=1)

In [230]:
# Testing stuff
y_train

array([0, 2, 0, ..., 0, 0, 1], dtype=int8)

In [233]:
# Transform to vectors
x_train_trans, x_test_trans, = TransformData(x_train, x_test)

### Build and train model

In [234]:
clf = svm.SVC()
clf.fit(x_train_trans,y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

## Evaluate

In [235]:
print(clf.score(x_test_trans,y_test))
#print(f1_score(x_test_trans,y_test))

0.6960382513661202


### Tests - Simon