## Data preprocess

data source: https://www.kaggle.com/crowdflower/twitter-airline-sentiment

In [1]:
import pandas as pd

twt = pd.read_csv('Twitter_Airline_Sentiment.csv')
twt.shape

(14640, 15)

In [8]:
twt.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [9]:
# paragraph Id
twt['pid'] = 'id_' + twt.index.astype(str)
twt[['text', 'airline_sentiment', 'pid']].head()

Unnamed: 0,text,airline_sentiment,pid
0,@VirginAmerica What @dhepburn said.,neutral,id_0
1,@VirginAmerica plus you've added commercials t...,positive,id_1
2,@VirginAmerica I didn't today... Must mean I n...,neutral,id_2
3,@VirginAmerica it's really aggressive to blast...,negative,id_3
4,@VirginAmerica and it's a really big bad thing...,negative,id_4


In [10]:
twt['text'].sample(10).tolist()

["@AmericanAir Can't unload flight #3322 because jetway is broken.  #steps #planB? #waiting nearly an hour",
 '@united This must be a drone “@united: @KeamBleam We understand your frustration. Our Bag team is working hard to get your bag(s) to you..."',
 '@united employees almost seem happy when delivery terrible customer service.',
 '@united really, fill out a form about my flight experience? I sent an email to the 1K email address.',
 '@USAirways: I experienced what defines customer service on #FLT1999. A flight attendant willing to follow up with a passenger on bag charges',
 '@united you all do a wonderful job today. Got my wife, daughter, and myself from PGH to Orlando after out flight was delayed luggage and all',
 '@JetBlue does not fit in 140',
 '@united I just booked a flight for (2). When I view my reservation it has MI connected to First name. Is this a problem? can it be changed?',
 "@United Airlines' CEO Jeff Smisek: Disloyal to Loyal Workers http://t.co/0cevY3P42b via @Hu

In [8]:
twt['airline_sentiment'].value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html   
https://towardsdatascience.com/multi-class-text-classification-with-doc2vec-logistic-regression-9da9947b43f4

In [11]:
import re

def cleanText(text):
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'http\S+', r'<URL>', text)
    text = re.sub('[\#\?\,.\!\:\-\/\"]', '', text)
    text = text.lower()
    return text

abc = twt['text'].sample(20)
abc.apply(cleanText)

7375      last flight was cancelled flightled then it w...
10056     overloads small plane with extra baggage and ...
4869      wifi is so slow it totally precludes working ...
2315      u guys did it again changed ages and double b...
14574     i dmed you my aa &amp; phone s &amp; you can'...
6261      now it's delayed until 355 getting yelled at ...
3582      then watched my connecting flight in den pull...
13496     wasn't just a delay your counter wouldn't tak...
7598      well the last update was in the right directi...
3070      huge kudos to the fo of sunday's flt 1623 sjo...
12155     this delayed bag was for my friend lisa pafe ...
7896          fliers to gain access to wsj content   <url>
7094                   x__x rt  our fleet's on fleek <url>
8475      works with google chrome but not internet exp...
10969     i rebooked myself  but cancelled flighting fl...
14421     flt cancelled flighted rescheduled to bad tim...
14588     if business class if full but 1st class empty.

In [13]:
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from gensim.models.doc2vec import TaggedDocument

df = twt[['text', 'airline_sentiment', 'pid']].copy()
df['text'] = df['text'].apply(cleanText)

train, test = train_test_split(df, test_size=0.3, random_state=42)
print("train/test size:")
print(len(train), len(test))

def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens

train_tagged = train.apply(lambda r: TaggedDocument(words=tokenize_text(r['text']), tags=[r.pid]), axis=1)
test_tagged = test.apply(lambda r: TaggedDocument(words=tokenize_text(r['text']), tags=[r.pid]), axis=1)

train/test size:
10248 4392


In [15]:
train_tagged.values[130]

TaggedDocument(words=['exicted', 'to', 'be', 'flying', 'with', "y'all", 'soon'], tags=['id_10990'])

In [14]:
train_tagged.values[230]

TaggedDocument(words=['new', 'fas', 'from', 'dfw', 'to', 'clt', 'this', 'morning', 'did', 'great', 'job', 'well', 'done'], tags=['id_9997'])

In [16]:
import multiprocessing

cores = multiprocessing.cpu_count()
cores

4

## Doc2vec model

In [17]:
from gensim.models import Doc2Vec
from tqdm import tqdm

# If dm=0, distributed bag of words (PV-DBOW) is used; if dm=1,‘distributed memory’ (PV-DM) is used.
# vector_size=100, dimension of feature vectors
# negative=5, specifies how many “noise words” should be drawn
# hs=0, and negative is non-zero, negative sampling will be used. hs=1, hierarchical softmax will be used
# min_count=2, ignores all words with total frequency lower than this.
# workers=cores, use these many worker threads to train the model

# build_vocab: Build vocabulary from a sequence of documents

model_dbow = Doc2Vec(dm=0, vector_size=100, negative=5, hs=0, min_count=2, sample=0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 10248/10248 [00:00<00:00, 2830823.72it/s]


In [18]:
%%time
from sklearn import utils

for epoch in range(5):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.01
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 10248/10248 [00:00<00:00, 2399420.98it/s]
100%|██████████| 10248/10248 [00:00<00:00, 3077264.27it/s]
100%|██████████| 10248/10248 [00:00<00:00, 3411367.25it/s]
100%|██████████| 10248/10248 [00:00<00:00, 3242304.25it/s]
100%|██████████| 10248/10248 [00:00<00:00, 3411367.25it/s]


CPU times: user 1.86 s, sys: 292 ms, total: 2.15 s
Wall time: 1.34 s


In [27]:
model_dbow.docvecs['id_9997']

array([ 3.7849168e-03, -3.5976903e-03,  2.6366175e-03,  2.0197590e-03,
        4.9026613e-03, -4.0240064e-03,  4.1246125e-03, -1.3683724e-03,
       -4.4880062e-03, -3.7045605e-03,  1.8961310e-03, -3.1398521e-03,
        3.7168742e-03,  2.1768378e-03,  4.7869845e-03, -5.9758913e-04,
        3.0824784e-03, -4.7984663e-03,  3.3126618e-03,  3.0297304e-03,
        9.2157198e-04,  3.6062519e-03,  4.0312381e-03,  3.2966593e-03,
       -1.8666973e-03, -2.6029947e-03, -1.3738588e-03,  3.0936552e-03,
       -1.5778954e-03,  1.8296989e-03, -3.5384735e-03, -1.6363992e-03,
        5.9242896e-04,  3.4157357e-03,  1.0186807e-03,  9.3581708e-04,
        1.7825345e-03, -9.4159669e-04,  1.8399979e-03,  3.0631749e-03,
       -6.4379035e-04,  2.7704926e-03, -4.2571332e-03,  7.0344936e-04,
        1.5287144e-03, -3.4877951e-03, -1.8509105e-03, -4.7731753e-03,
        3.7861951e-03, -5.2917912e-04, -2.4302639e-03,  3.6517745e-03,
       -3.3845026e-03,  1.2606429e-04, -1.3725236e-03, -2.1271575e-03,
      

In [58]:
new_vector = model_dbow.docvecs['id_11351']
sims = model_dbow.docvecs.most_similar([new_vector], topn=5)
print(sims)
print()

for i in range(3):
    idx = int(sims[i][0].split("_")[1])
    print(train_tagged[idx])
    print()

[('id_11351', 1.0), ('id_5507', 0.3673630654811859), ('id_13339', 0.3532582223415375), ('id_679', 0.34670791029930115), ('id_14201', 0.3388338088989258)]

TaggedDocument(['marsha', 'at', 'myrtle', 'beach', 'is', 'the', 'greatest', 'she', 'deserves', 'all', 'the', 'respect', 'and', 'praise', 'there', 'is', 'thankyouforeverything'], ['id_11351'])

TaggedDocument(['so', 'the', 'upcoming', 'rr', 'changesdeval', 'is', 'trying', 'to', 'tell', 'me', 'not', 'to', 'fly', 'southwest', 'anymore', 'because', 'am', 'loyal', 'so', 'far', 'get', 'it'], ['id_5507'])

TaggedDocument(['thank', 'you'], ['id_13339'])



In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

def vec_for_learning(model, tagged_docs, data):
    sents = tagged_docs.values
    
    # infer_vector: A document for which the vector representation will be inferred.
    # Number of times to train the new document. 
    
    regressors = [model.infer_vector(doc.words, epochs=20) for doc in sents]
    targets = data['airline_sentiment'].tolist()
    return targets, regressors

y_train, X_train = vec_for_learning(model_dbow, train_tagged, train)
y_test, X_test = vec_for_learning(model_dbow, test_tagged, test)

logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))



Testing accuracy 0.6723588342440802
Testing F1 score: 0.6212614208818695


## Model paring

In [20]:
%%time

# If dm=0, distributed bag of words (PV-DBOW) is used; if dm=1,‘distributed memory’ (PV-DM) is used.
# dm_mean, If 0, use the sum of the context word vectors. If 1, use the mean.
# window, the maximum distance between the current and predicted word within a sentence
# alpha – The initial learning rate.
# min_alpha – Learning rate will linearly drop to min_alpha as training progresses.

model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=100, window=10, negative=5, min_count=1, workers=cores, alpha=0.065, min_alpha=0.065)
model_dmm.build_vocab([x for x in tqdm(train_tagged.values)])

for epoch in range(5):
    model_dmm.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dmm.alpha -= 0.01
    model_dmm.min_alpha = model_dmm.alpha

100%|██████████| 10248/10248 [00:00<00:00, 1762329.95it/s]
100%|██████████| 10248/10248 [00:00<00:00, 3007713.06it/s]
100%|██████████| 10248/10248 [00:00<00:00, 3113147.49it/s]
100%|██████████| 10248/10248 [00:00<00:00, 2337696.60it/s]
100%|██████████| 10248/10248 [00:00<00:00, 2977296.35it/s]
100%|██████████| 10248/10248 [00:00<00:00, 3110894.36it/s]


CPU times: user 3.39 s, sys: 1.5 s, total: 4.89 s
Wall time: 3.02 s


In [22]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec

# Concatenate PV-DBOW and PV-DM representations 
new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])

In [25]:
%%time

def get_vectors(model, tagged_docs, data):
    sents = tagged_docs.values
    regressors = [model.infer_vector(doc.words, epochs=20) for doc in sents]
    targets = data['airline_sentiment'].tolist()
    return targets, regressors

y_train, X_train = get_vectors(new_model, train_tagged, train)
y_test, X_test = get_vectors(new_model, test_tagged, test)

logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))



Testing accuracy 0.7484061930783242
Testing F1 score: 0.7374719680745588
CPU times: user 26.3 s, sys: 92.2 ms, total: 26.4 s
Wall time: 26.4 s
