In [2]:

import pandas as pd
import random
import re
from bs4 import BeautifulSoup


In [3]:
from nltk.tokenize import WordPunctTokenizer

tok = WordPunctTokenizer()
# @-mentions: Twitter handles may contain underscores as well as ASCII letters
# and digits, so '_' must be part of the character class; otherwise
# '@Jet_Blue' would leave '_Blue' behind in the text.
pat1 = r'@[A-Za-z0-9_]+'
# http/https URLs: besides letters, digits, '.' and '/', accept the characters
# that commonly appear in paths and query strings so that no URL fragment
# survives as a fake "word" after cleaning.
pat2 = r'https?://[A-Za-z0-9./_~%?=&#+-]+'
# A match of either pattern is removed from the tweet by tweet_cleaner.
combined_pat = r'|'.join((pat1, pat2))

def tweet_cleaner(text):
    """Normalize a raw tweet into lowercase, space-separated alphabetic words.

    Steps:
      1. Parse ``text`` with BeautifulSoup (``lxml`` parser) and keep only the
         result of ``get_text()``.
      2. Remove every match of ``combined_pat``.
      3. Replace every character outside ``a-zA-Z`` with a space and lowercase.
      4. Tokenize with ``tok`` and re-join with single spaces so the runs of
         whitespace created in step 3 collapse.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned tweet as a ``str`` (possibly empty).
    """
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    stripped = re.sub(combined_pat, '', souped)
    # The previous version called the Python 2 only ``stripped.decode(...)``
    # inside a bare ``except:``. On Python 3 ``str`` has no ``decode``, so the
    # call always failed and the handler silently swallowed that (and would
    # have swallowed any other error too). Do the replacement directly on the
    # text instead; the final output is unchanged because every non-letter is
    # turned into a space just below.
    clean = stripped.replace(u"\ufffd", "?")
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    lower_case = letters_only.lower()
    # The letters-only substitution above leaves unnecessary white space
    # behind; tokenizing and re-joining removes it.
    words = tok.tokenize(lower_case)
    return (" ".join(words)).strip()


   

In [4]:
# Create the labeled DataFrame (the tweets themselves are cleaned in a later cell).

# Labeled data: read the CSV and keep only the sentiment label and the tweet text.
df = pd.read_csv('Tweets.csv')[[u'airline_sentiment', u'text']]
df.head()

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [6]:

# Replace each labeled tweet's raw text with its cleaned form.
df.loc[:,'text'] = df.loc[:,'text'].map(tweet_cleaner)

# Unlabeled data: load it, keep only the text column, and clean it the same way.
udf = pd.read_csv('Tweets_Unlabeled1.csv')
udf = udf[[u'text']]
udf.loc[:,'text'] = udf.loc[:,'text'].map(tweet_cleaner)

In [None]:
df.count()

In [69]:
df.head()

Unnamed: 0,airline_sentiment,text
0,neutral,what said
1,positive,plus you ve added commercials to the experienc...
2,neutral,i didn t today must mean i need to take anothe...
3,negative,it s really aggressive to blast obnoxious ente...
4,negative,and it s a really big bad thing about it


In [72]:
# Map the categorical sentiment strings onto integer class labels.
def integer_converter(text):
    """Return the integer class for a sentiment label, ignoring case.

    "neutral" maps to 2, "positive" maps to 4, and any other string maps to 0.
    """
    codes = {"neutral": 2, "positive": 4}
    return codes.get(str.lower(text), 0)
    
# Replace the sentiment strings with their integer codes.
# NOTE(review): not idempotent - re-running this cell feeds the already
# converted integers to integer_converter, whose str.lower() call raises
# TypeError for non-string values.
df.loc[:,'airline_sentiment'] = df.loc[:,'airline_sentiment'] .map(integer_converter)

In [73]:
df.head()

Unnamed: 0,airline_sentiment,text
0,2,what said
1,4,plus you ve added commercials to the experienc...
2,2,i didn t today must mean i need to take anothe...
3,0,it s really aggressive to blast obnoxious ente...
4,0,and it s a really big bad thing about it


In [35]:
udf.tail()

Unnamed: 0,text
58812,Arrive flyLAXairport to find precheck closed a...
58813,AA438 AmericanAir tells CBSDFW the flight retu...
58814,AmericanAir gate representatives in Dallas who...
58815,Seattle travelers will feel safe in hotels ton...
58816,FlyFrontier sorry for being mean the other day...


In [74]:
x = df.text
y = df.airline_sentiment

In [75]:

from sklearn.model_selection import train_test_split
# Fixed seed so both splits below are reproducible.
SEED = 2000

# First split: hold out 2% of the data for validation + test.
(x_train, x_validation_and_test,
 y_train, y_validation_and_test) = train_test_split(
    x, y, test_size=.02, random_state=SEED)

# Second split: divide the held-out part evenly into validation and test.
(x_validation, x_test,
 y_validation, y_test) = train_test_split(
    x_validation_and_test, y_validation_and_test,
    test_size=.5, random_state=SEED)

In [76]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
import multiprocessing
from sklearn import utils

def labelize_tweets_ug(tweets,label):
    """Wrap every tweet in a LabeledSentence tagged ``'<label>_<index>'``.

    Args:
        tweets: Object exposing ``.index`` and iteration over tweet strings
            (called below with a pandas Series).
        label: String prefix used to build each tag.

    Returns:
        A list with one ``LabeledSentence`` per tweet, whose words are the
        whitespace-split tweet and whose single tag is ``label + '_' + index``.
    """
    return [
        LabeledSentence(tweet.split(), [label + '_%s' % idx])
        for idx, tweet in zip(tweets.index, tweets)
    ]
  
# Recombine the train / validation / test texts and tag each tweet as
# 'all_<index>'; the resulting list is what the Doc2Vec models below are
# built and trained on.
all_x = pd.concat([x_train,x_validation,x_test])
all_x_w2v = labelize_tweets_ug(all_x, 'all')

  from pandas import Panel
  if sys.path[0] == '':


In [77]:
# NOTE(review): this cell repeats the two statements at the end of the
# previous cell verbatim; it rebuilds the same objects and looks removable.
all_x = pd.concat([x_train,x_validation,x_test])
all_x_w2v = labelize_tweets_ug(all_x, 'all')

  if sys.path[0] == '':


In [81]:
all_x_w2v[10]
#this what labelize tweet looks like

LabeledSentence(words=['hey', 'think', 'someone', 'could', 'meet', 'me', 'with', 'my', 'book', 'when', 'i', 'arrive', 'at', 'at', 'it', 's', 'yes', 'please', 'amypoehler', 'oscar', 'took', 'it'], tags=['all_3620'])

In [83]:
# Create the DBOW (dm=0) Doc2Vec model and build its vocabulary from all tagged tweets.
# alpha and min_alpha start out equal; the training cell lowers them by hand.
model_ug_dbow = Doc2Vec(
    dm=0,
    size=100,
    negative=5,
    min_count=2,
    workers=4,
    alpha=0.065,
    min_alpha=0.065,
)
model_ug_dbow.build_vocab(list(tqdm(all_x_w2v)))

100%|██████████| 14640/14640 [00:00<00:00, 265912.92it/s]


In [84]:
%%time
# Train the network to learn the word / document embeddings.
# Each of the 30 passes trains for one epoch on a freshly shuffled copy of the
# tagged tweets, then lowers alpha by 0.001 and sets min_alpha to the same
# value (presumably so the learning rate stays constant within a pass).
# NOTE(review): gensim's documentation discourages calling train() repeatedly
# with hand-managed alpha; a single train(..., epochs=30) call with
# alpha/min_alpha configured on the model is the documented approach -
# confirm against the installed gensim version before changing.
for epoch in range(30):
    model_ug_dbow.train(utils.shuffle([x for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
    model_ug_dbow.alpha -= 0.001
    model_ug_dbow.min_alpha = model_ug_dbow.alpha

100%|██████████| 14640/14640 [00:00<00:00, 702714.64it/s]
100%|██████████| 14640/14640 [00:00<00:00, 843388.83it/s]
100%|██████████| 14640/14640 [00:00<00:00, 867750.24it/s]
100%|██████████| 14640/14640 [00:00<00:00, 952112.80it/s]
100%|██████████| 14640/14640 [00:00<00:00, 843296.17it/s]
100%|██████████| 14640/14640 [00:00<00:00, 1056805.22it/s]
100%|██████████| 14640/14640 [00:00<00:00, 867676.67it/s]
100%|██████████| 14640/14640 [00:00<00:00, 1834013.64it/s]
100%|██████████| 14640/14640 [00:00<00:00, 867971.03it/s]
100%|██████████| 14640/14640 [00:00<00:00, 1017879.70it/s]
100%|██████████| 14640/14640 [00:00<00:00, 1054500.36it/s]
100%|██████████| 14640/14640 [00:00<00:00, 1553524.53it/s]
100%|██████████| 14640/14640 [00:00<00:00, 483862.82it/s]
100%|██████████| 14640/14640 [00:00<00:00, 440489.02it/s]
100%|██████████| 14640/14640 [00:00<00:00, 1225592.00it/s]
100%|██████████| 14640/14640 [00:00<00:00, 922572.95it/s]
100%|██████████| 14640/14640 [00:00<00:00, 1476107.85it/s]
100%|██

Wall time: 57.2 s


In [85]:
def get_vectors(model, corpus, size):
    """Collect the trained document vector of every item in ``corpus``.

    Args:
        model: Object whose ``docvecs`` mapping is keyed by ``'all_<index>'`` tags.
        corpus: Object exposing ``.index`` and ``len()``; each index value is
            turned into the tag ``'all_' + str(index)``.
        size: Length of each document vector.

    Returns:
        A ``(len(corpus), size)`` float array whose rows follow the order of
        ``corpus.index``.
    """
    vecs = np.zeros((len(corpus), size))
    for row, idx in enumerate(corpus.index):
        vecs[row] = model.docvecs['all_' + str(idx)]
    return vecs

In [96]:
# NOTE(review): get_vectors (defined in an earlier cell) uses `np`, which is
# only imported here; it works because this import runs before the first call.
import numpy as np
# Look up the 100-dimensional DBOW document vectors for the training and
# validation tweets.
train_vecs_dbow = get_vectors(model_ug_dbow, x_train, 100)
validation_vecs_dbow = get_vectors(model_ug_dbow, x_validation, 100)

In [95]:
train_vecs_dbow[0]

array([-0.31459734,  0.05857672, -0.09373038,  0.20645429, -0.0509001 ,
       -0.04946975, -0.31858325, -0.14537346, -0.66941994, -0.01894479,
        0.25924209,  0.24269211, -0.04018043,  0.29951391,  0.14582373,
        0.57815927,  0.06517559, -0.76638174,  0.10707678, -0.17196579,
        0.77296329,  0.39612919,  0.44807401, -0.32401827,  0.34141243,
       -0.40942451, -0.35081768,  0.58094424,  0.10882908,  0.0028272 ,
       -0.046775  ,  0.15882301, -0.23425579, -0.24412428,  0.31141734,
       -0.01923329, -0.11746588,  0.30261034, -0.14542373, -0.08741587,
        0.80030155, -0.18588898,  0.28747362, -0.35675997,  0.31254163,
        0.34251207,  0.6266402 , -0.52675885, -0.06801969,  0.03347588,
        0.03746403,  0.19598594,  0.32230553,  0.1780279 , -0.11842416,
        0.05042865, -0.28034997, -0.08156704,  0.1472228 , -0.23309168,
       -0.33467588,  0.68091482, -0.24073295, -0.22109973, -0.11626986,
       -0.02848517,  0.0330813 ,  0.11531822, -0.30272543, -0.42

In [90]:
validation_vecs_dbow[0]

array([-0.08672442, -0.4462676 , -0.30447775,  0.21677081,  0.01122478,
       -0.48521993, -0.2737031 , -0.4314875 , -0.3950983 ,  0.56958199,
        0.24252385,  0.026979  , -0.18463492, -0.38044858,  0.08399496,
        0.05719149,  0.23872964, -0.17824945,  0.08450142, -0.48350203,
        0.34060234,  0.18874976,  0.00457577,  0.03561156,  0.4397355 ,
       -0.50918812, -0.35386541,  0.08386976,  0.11599461, -0.2558915 ,
       -0.12931962,  0.10156544, -0.34727541, -0.20918702,  0.28329754,
       -0.00595026, -0.51317579,  0.37773061,  0.34114018, -0.26488337,
        0.55541879, -0.32762384,  0.10969518, -0.22907384, -0.18569569,
        0.15383907,  0.23259358, -0.32612538,  0.25151631,  0.10281659,
        0.03070224, -0.19287577,  0.06740264, -0.03647917,  0.15792011,
        0.01277175, -0.17274536, -0.03004599,  0.30016303, -0.53494048,
       -0.10917096,  0.20187664, -0.37911767, -0.34020674, -0.19597563,
        0.07272606, -0.33792499,  0.24224688,  0.25041366, -0.31

In [100]:
# Fit a logistic-regression classifier (default hyperparameters) on the DBOW
# document vectors of the training tweets.
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(train_vecs_dbow, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [101]:
clf.score(validation_vecs_dbow, y_validation)

0.7534246575342466

In [102]:
# Persist the trained DBOW model to disk, then rebind the name to the copy
# loaded back from that file.
model_ug_dbow.save('d2v_model_ug_dbow.doc2vec')
model_ug_dbow = Doc2Vec.load('d2v_model_ug_dbow.doc2vec')

In [103]:
# Drop training-only state from the DBOW model while keeping the document-tag
# vectors and the ability to infer vectors (per the two keyword flags).
# NOTE(review): presumably the model cannot be trained further after this - confirm.
model_ug_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [104]:
# Distributed Memory model with concatenation (dm=1, dm_concat=1), using one
# worker per CPU core; the vocabulary is built from all tagged tweets.
cores = multiprocessing.cpu_count()
model_ug_dmc = Doc2Vec(
    dm=1,
    dm_concat=1,
    size=100,
    window=2,
    negative=5,
    min_count=2,
    workers=cores,
    alpha=0.065,
    min_alpha=0.065,
)
model_ug_dmc.build_vocab(list(tqdm(all_x_w2v)))

100%|██████████| 14640/14640 [00:00<00:00, 468426.39it/s]


In [105]:
%%time
# Train the DM-concat model: 30 passes of one epoch each over a freshly
# shuffled copy of the tagged tweets, lowering alpha by 0.002 after every pass
# and setting min_alpha to the same value.
# NOTE(review): gensim's documentation discourages calling train() repeatedly
# with hand-managed alpha; a single train(..., epochs=30) call is the
# documented approach - confirm against the installed gensim version.
for epoch in range(30):
    model_ug_dmc.train(utils.shuffle([x for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
    model_ug_dmc.alpha -= 0.002
    model_ug_dmc.min_alpha = model_ug_dmc.alpha

100%|██████████| 14640/14640 [00:00<00:00, 922378.94it/s]
100%|██████████| 14640/14640 [00:00<00:00, 919933.04it/s]
100%|██████████| 14640/14640 [00:00<00:00, 737832.22it/s]
100%|██████████| 14640/14640 [00:00<00:00, 435030.89it/s]
100%|██████████| 14640/14640 [00:00<00:00, 922878.00it/s]
100%|██████████| 14640/14640 [00:00<00:00, 628006.69it/s]
100%|██████████| 14640/14640 [00:00<00:00, 984111.33it/s]
100%|██████████| 14640/14640 [00:00<00:00, 500167.88it/s]
100%|██████████| 14640/14640 [00:00<00:00, 868400.66it/s]
100%|██████████| 14640/14640 [00:00<00:00, 310692.33it/s]
100%|██████████| 14640/14640 [00:00<00:00, 819416.45it/s]
100%|██████████| 14640/14640 [00:00<00:00, 1180177.02it/s]
100%|██████████| 14640/14640 [00:00<00:00, 1644692.93it/s]
100%|██████████| 14640/14640 [00:00<00:00, 843330.91it/s]
100%|██████████| 14640/14640 [00:00<00:00, 894354.78it/s]
100%|██████████| 14640/14640 [00:00<00:00, 461123.66it/s]
100%|██████████| 14640/14640 [00:00<00:00, 868560.34it/s]
100%|███████

Wall time: 1min 18s


In [113]:

# Words whose vectors are closest to 'angry'. Query through the model's word
# vectors (`.wv`): the model-level `most_similar` is deprecated in gensim 3.x
# and emits a DeprecationWarning on every call.
model_ug_dmc.wv.most_similar('angry')

  """Entry point for launching an IPython kernel.


[('gracious', 0.5554074048995972),
 ('inconsiderate', 0.5172461867332458),
 ('stories', 0.515602171421051),
 ('unpleasant', 0.5132873058319092),
 ('thin', 0.5083214044570923),
 ('disorganized', 0.5031203627586365),
 ('often', 0.4885654151439667),
 ('frustrated', 0.48758840560913086),
 ('likely', 0.46571025252342224),
 ('cycles', 0.4572683572769165)]

In [114]:
# Words whose vectors are closest to 'bad'. Uses `.wv` because the model-level
# `most_similar` is deprecated in gensim 3.x and warns on every call.
model_ug_dmc.wv.most_similar('bad')

  """Entry point for launching an IPython kernel.


[('impending', 0.4816121459007263),
 ('poor', 0.4799586534500122),
 ('extreme', 0.4722893238067627),
 ('makeup', 0.46474969387054443),
 ('lazy', 0.4473317265510559),
 ('blamed', 0.4359237849712372),
 ('beautiful', 0.4331815838813782),
 ('scollegelondon', 0.43048328161239624),
 ('simple', 0.42982858419418335),
 ('easy', 0.4262011647224426)]

In [115]:
# Words whose vectors are closest to 'late'. Uses `.wv` because the model-level
# `most_similar` is deprecated in gensim 3.x and warns on every call.
model_ug_dmc.wv.most_similar('late')

  """Entry point for launching an IPython kernel.


[('evenlate', 0.7859842777252197),
 ('unloaded', 0.5288441777229309),
 ('prior', 0.4974138140678406),
 ('dark', 0.49580979347229004),
 ('comms', 0.45956066250801086),
 ('canx', 0.45501184463500977),
 ('ago', 0.45331692695617676),
 ('deadhead', 0.44632986187934875),
 ('hs', 0.4401280879974365),
 ('worstflight', 0.42932993173599243)]

In [112]:
# Analogy query: 'bigger' - 'big' + 'small'. Uses `.wv` because the model-level
# `most_similar` is deprecated in gensim 3.x and warns on every call.
model_ug_dmc.wv.most_similar(positive=['bigger', 'small'], negative=['big'])

  """Entry point for launching an IPython kernel.


[('understands', 0.45236513018608093),
 ('inside', 0.4419071078300476),
 ('compensating', 0.4377748668193817),
 ('vicky', 0.43512606620788574),
 ('snapchat', 0.42303600907325745),
 ('combined', 0.4224178194999695),
 ('issuing', 0.4216964542865753),
 ('placing', 0.4206436276435852),
 ('entering', 0.4195011258125305),
 ('opal', 0.4192591905593872)]