In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words("english")
from nltk.tokenize import TweetTokenizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alberttamman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Load Data**

In [5]:
df = pd.read_csv('trainingandtestdata/training.1600000.processed.noemoticon.csv', encoding='latin-1', header = None, names=['sentiment', 'id', 'date', 'flag', 'user', 'tweet'])

In [6]:
from sklearn.utils import shuffle
df = shuffle(df)

df = df.head(100000)
df

Unnamed: 0,sentiment,id,date,flag,user,tweet
676615,0,2248449502,Fri Jun 19 20:38:34 PDT 2009,NO_QUERY,woodlandalyssa,monster headache hope i can sleep it off
529875,0,2195676258,Tue Jun 16 11:27:12 PDT 2009,NO_QUERY,VeronicanLife,I'm very sad because my fedex tracking number ...
1087153,4,1969397156,Fri May 29 23:54:38 PDT 2009,NO_QUERY,scrapchick,@cameron_crazy Are you beginning to nest w/ al...
1434265,4,2060598791,Sat Jun 06 19:12:31 PDT 2009,NO_QUERY,bridgetmarym,Hanging out with my mom
1332787,4,2016281000,Wed Jun 03 06:30:10 PDT 2009,NO_QUERY,glitterngold,Good Morning tweets! New day and new outlook o...
...,...,...,...,...,...,...
250090,0,1983131006,Sun May 31 12:33:48 PDT 2009,NO_QUERY,o0wowzers0o,"So it's true? Ms. Snyder did die? Wow, that's ..."
1515925,4,2175658181,Mon Jun 15 01:37:04 PDT 2009,NO_QUERY,ChelseaO92,is in my pj's
1070072,4,1966041577,Fri May 29 17:00:54 PDT 2009,NO_QUERY,xxanna,@skateramps sounds good im up for some JB Hi-...
984048,4,1834349005,Mon May 18 02:35:42 PDT 2009,NO_QUERY,3CB,"@shiskydadon oops, i think i DMed half and the..."


In [7]:
from sklearn.model_selection import StratifiedKFold
y = df['sentiment']
feats = [col for col in df.columns if col!= "sentiment"]
X = df[feats]
skf = StratifiedKFold(n_splits=5, random_state=12345, shuffle=True)
train_index, test_index = list(skf.split(X, y))[0]
df_test, df_train = df.iloc[test_index], df.iloc[train_index]

**Clean Data**
1. Remove Duplicate Rows
2. Remove Columns we wont use
3. Format Target column (sentiment) into 0/1

In [8]:
df_train = df_train.drop_duplicates(subset=['id'], keep='first')
df_train = df_train.drop_duplicates(subset=['id'], keep='first')

df_train.drop(columns=['id', 'flag', 'user'], inplace=True)
df_test.drop(columns=['id', 'flag', 'user'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [9]:
df_train["sentiment"] = df_train["sentiment"].apply(lambda s: 1 if s!=0 else s)
df_test["sentiment"] = df_test["sentiment"].apply(lambda s: 1 if s!=0 else s)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [10]:
df_test

Unnamed: 0,sentiment,date,tweet
1434265,1,Sat Jun 06 19:12:31 PDT 2009,Hanging out with my mom
31019,0,Mon Apr 20 00:51:50 PDT 2009,Thinks he needs someone to cuddle with
738279,0,Sun Jun 21 06:34:07 PDT 2009,Happy Father's Day.Sad day i have a wake to go...
1251324,1,Mon Jun 01 15:09:07 PDT 2009,hot hot hotttttttttttttttttttttttttttttttttttt...
195104,0,Sat May 30 03:50:20 PDT 2009,@littlemunchkin she's too small to come out ye...
...,...,...,...
1166434,1,Sun May 31 05:05:05 PDT 2009,3 liters of Pepsi light later I am almost awak...
1545469,1,Mon Jun 15 12:45:19 PDT 2009,@MileyMelody You sing?
767716,0,Tue Jun 23 14:24:13 PDT 2009,My daughter has nits and it's her primary sch...
907792,1,Mon May 04 07:14:33 PDT 2009,@ThrivingIvory the next single is &quot;Hey La...


**Feature Engineering**

- Note: These new columns were not used in baseline model but might be useful later

In [11]:
import re
df_train["hashtags"] = df_train["tweet"].apply(lambda x: ",".join(tag for tag in list(re.findall(r"#(\w+)", x))))
df_test["hashtags"] = df_test["tweet"].apply(lambda x: ",".join(tag for tag in list(re.findall(r"#(\w+)", x))))
df_train["hashtag_count"] = df_train["hashtags"].apply(lambda h: len(h.split(',')) if len(h)>0 else 0)
df_test["hashtag_count"] = df_test["hashtags"].apply(lambda h: len(h.split(',')) if len(h) >0 else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


**Preprocessing**
1. Lower-case letters
2. Remove stop words 
3. Tokenize using twitter tokenizer and lemmatize
4. Remove punctuation

In [12]:
def clean_text(text):
    text = text.lower()
    text = re.sub('@[A-Za-z0–9]+', ' ', text)
    text = re.sub('#', ' ', text)
    text = re.sub('https?:\/\/\S+', ' ', text)
    return text
 

In [13]:
df_train["tweet"] = df_train["tweet"].apply(lambda x: clean_text(x))
df_test["tweet"] = df_test["tweet"].apply(lambda x: clean_text(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [14]:
def remove_stop_words(s):
    new_sent = ""
    for word in s.split():
        if word not in stop_words:
            new_sent += " " + word
    return new_sent

df_train["tweet"] = df_train["tweet"].apply(lambda s: remove_stop_words(s))

In [15]:
df_test["tweet"] = df_test["tweet"].apply(lambda s: remove_stop_words(s))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [16]:
def lemmatize_tokenize(tweet):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    tokenizer = TweetTokenizer()
    new_t = []
    for word in tokenizer.tokenize(tweet):
        new_word = re.sub(r'[^\w\s]', '', (word))
        if new_word != '':
            new_t.append(lemmatizer.lemmatize(new_word)) 
    return " ".join(new_t)

In [17]:
df_train["tweet"] = df_train["tweet"].apply(lambda t: lemmatize_tokenize(t))
df_test["tweet"] = df_test["tweet"].apply(lambda t: lemmatize_tokenize(t))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


**Modelling with Word2Vec & RNN**

In [18]:
from gensim.models import Word2Vec

documents = [text.split() for text in df_train.tweet]
size = 200
model = Word2Vec(size=size, window=7, min_count=10, workers=4)
model.build_vocab(documents)

In [19]:
model.train(documents, total_examples=len(documents), epochs=20)

(9809436, 12308620)

In [20]:
model.wv.most_similar("hate")

[('fuck', 0.4357706606388092),
 ('ugh', 0.4192028343677521),
 ('stupid', 0.39120057225227356),
 ('killing', 0.3799291253089905),
 ('ignored', 0.37581247091293335),
 ('kill', 0.37091881036758423),
 ('ruin', 0.36644476652145386),
 ('freaking', 0.36063283681869507),
 ('screw', 0.3564320206642151),
 ('stressing', 0.3537784218788147)]

In [21]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train.tweet)


In [22]:
from tensorflow.keras.preprocessing import sequence 

sequences_train = tokenizer.texts_to_sequences(df_train.tweet)
sequences_test = tokenizer.texts_to_sequences(df_test.tweet)
X_train_seq = sequence.pad_sequences(sequences_train, maxlen=100, value=0)
X_test_seq = sequence.pad_sequences(sequences_test, maxlen=100, value=0)

In [23]:
y_train = df_train["sentiment"]
y_test = df_test["sentiment"]

In [24]:
w_index = tokenizer.word_index

vocab_size = len(w_index) + 1

In [25]:
embedding_m = np.zeros((vocab_size, size))


In [26]:
for word, idx in list(w_index.items()):
    if word in list(model.wv.vocab.keys()):
        embedding_m[idx] = model.wv[word]

In [27]:
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, GRU,Bidirectional, Dropout, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.initializers import Constant

In [28]:
nn_model = Sequential()

In [29]:
emb_layer = Embedding(vocab_size, 200, weights=[embedding_m], input_length=100, trainable=False)
nn_model.add(emb_layer)
nn_model.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)))
nn_model.add(Dense(1, activation='sigmoid'))

In [30]:
nn_model.compile(loss='binary_crossentropy',optimizer="adam",metrics=['accuracy'])

In [31]:
from tensorflow.keras.callbacks import EarlyStopping

nn_model.summary()
callbacks = [EarlyStopping(monitor='val_accuracy', patience=0)]
nn_model.fit(X_train_seq, y_train, batch_size=128, epochs=12, validation_split=0.2, callbacks=callbacks)


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 200)          9596400   
_________________________________________________________________
bidirectional (Bidirectional (None, 200)               240800    
_________________________________________________________________
dense (Dense)                (None, 1)                 201       
Total params: 9,837,401
Trainable params: 241,001
Non-trainable params: 9,596,400
_________________________________________________________________
Epoch 1/12
 26/500 [>.............................] - ETA: 3:48 - loss: 0.6494 - accuracy: 0.6359

KeyboardInterrupt: 

In [33]:
from nltk.probability import FreqDist

In [44]:
' '.join([i for i in df_train["tweet"]]).split()


['monster',
 'headache',
 'hope',
 'sleep',
 'im',
 'sad',
 'fedex',
 'tracking',
 'number',
 'working',
 '2',
 'damn',
 'day',
 'cry',
 'new',
 'laptop',
 '_crazy',
 'beginning',
 'nest',
 'w',
 'cleaning',
 'think',
 'so',
 'time',
 'wanted',
 'pull',
 'appliance',
 'clean',
 'behind',
 'good',
 'morning',
 'tweet',
 'new',
 'day',
 'new',
 'outlook',
 'thing',
 'life',
 'too',
 'flail',
 'cant',
 'january',
 'yet',
 'brian',
 'austin',
 'green',
 'hott',
 'whatever',
 'happened',
 'original',
 '90210',
 'miss',
 'day',
 'lack',
 'communication',
 'somehow',
 'im',
 'confident',
 'physic',
 'psych',
 'king',
 'island',
 'today',
 'felt',
 'good',
 'sleep',
 'gettin',
 'foil',
 'put',
 'on',
 'cant',
 'beleev',
 '2',
 'month',
 'left',
 'love',
 'goin',
 'fast',
 'sorry',
 'caused',
 'inconv',
 'tc',
 'bye',
 'srikanth',
 'wishing',
 'imc',
 '09',
 'best',
 'summer',
 'residency',
 '1',
 'whoohoooo',
 'day',
 'start',
 'working',
 'again',
 'lol',
 'report',
 'thursday',
 'laying',
 '

In [45]:
flat_words = ' '.join([i for i in df_train["tweet"]]).split()

word_freq = FreqDist(flat_words)

word_freq.most_common(30)

[('im', 9187),
 ('day', 5314),
 ('good', 4668),
 ('get', 4367),
 ('like', 3942),
 ('go', 3909),
 ('love', 3491),
 ('u', 3455),
 ('work', 3421),
 ('today', 3345),
 ('time', 3331),
 ('going', 3193),
 ('cant', 3164),
 ('got', 3022),
 ('lol', 3016),
 ('back', 2858),
 ('one', 2850),
 ('know', 2778),
 ('really', 2490),
 ('want', 2431),
 ('it', 2396),
 ('night', 2323),
 ('see', 2304),
 ('well', 2284),
 ('still', 2226),
 ('think', 2223),
 ('new', 2119),
 ('2', 2078),
 ('thanks', 2073),
 ('home', 2029)]

In [60]:
from gensim.corpora import Dictionary

#create dictionary
text_dict = Dictionary([tweet.split() for tweet in df_train['tweet']])

#view integer mappings
text_dict.token2id

{'headache': 0,
 'hope': 1,
 'monster': 2,
 'sleep': 3,
 '2': 4,
 'cry': 5,
 'damn': 6,
 'day': 7,
 'fedex': 8,
 'im': 9,
 'laptop': 10,
 'new': 11,
 'number': 12,
 'sad': 13,
 'tracking': 14,
 'working': 15,
 '_crazy': 16,
 'appliance': 17,
 'beginning': 18,
 'behind': 19,
 'clean': 20,
 'cleaning': 21,
 'nest': 22,
 'pull': 23,
 'so': 24,
 'think': 25,
 'time': 26,
 'w': 27,
 'wanted': 28,
 'good': 29,
 'life': 30,
 'morning': 31,
 'outlook': 32,
 'thing': 33,
 'tweet': 34,
 'cant': 35,
 'flail': 36,
 'january': 37,
 'too': 38,
 'yet': 39,
 '90210': 40,
 'austin': 41,
 'brian': 42,
 'green': 43,
 'happened': 44,
 'hott': 45,
 'miss': 46,
 'original': 47,
 'whatever': 48,
 'communication': 49,
 'lack': 50,
 'confident': 51,
 'physic': 52,
 'psych': 53,
 'somehow': 54,
 'felt': 55,
 'island': 56,
 'king': 57,
 'today': 58,
 'beleev': 59,
 'fast': 60,
 'foil': 61,
 'gettin': 62,
 'goin': 63,
 'left': 64,
 'love': 65,
 'month': 66,
 'on': 67,
 'put': 68,
 'bye': 69,
 'caused': 70,
 'inco

In [62]:
tweets_bow = [text_dict.doc2bow(tweet.split()) for tweet in df_train['tweet']]

In [64]:
from gensim.models.ldamodel import LdaModel

k = 5
tweets_lda = LdaModel(tweets_bow,
                      num_topics = k,
                      id2word = text_dict,
                      random_state = 1,
                      passes=10)

tweets_lda.show_topics()

[(0,
  '0.012*"want" + 0.010*"like" + 0.009*"movie" + 0.009*"wanna" + 0.009*"go" + 0.009*"watching" + 0.008*"cant" + 0.007*"need" + 0.006*"sound" + 0.006*"out"'),
 (1,
  '0.030*"day" + 0.026*"good" + 0.021*"im" + 0.019*"today" + 0.019*"work" + 0.014*"going" + 0.013*"night" + 0.013*"go" + 0.012*"time" + 0.012*"home"'),
 (2,
  '0.027*"im" + 0.017*"love" + 0.017*"u" + 0.014*"know" + 0.013*"lol" + 0.013*"like" + 0.011*"cant" + 0.010*"it" + 0.008*"get" + 0.008*"thats"'),
 (3,
  '0.009*"twitter" + 0.009*"yes" + 0.009*"thank" + 0.008*"oh" + 0.008*"new" + 0.007*"got" + 0.006*"still" + 0.006*"lol" + 0.006*"no" + 0.006*"please"'),
 (4,
  '0.010*"thanks" + 0.009*"trying" + 0.008*"ð" + 0.006*"head" + 0.006*"food" + 0.006*"cry" + 0.006*"½" + 0.005*"face" + 0.005*"follower" + 0.005*"s"')]

In [77]:
print(tweets_lda.print_topics())
doc_lda = tweets_lda[tweets_bow]

[(0, '0.012*"want" + 0.010*"like" + 0.009*"movie" + 0.009*"wanna" + 0.009*"go" + 0.009*"watching" + 0.008*"cant" + 0.007*"need" + 0.006*"sound" + 0.006*"out"'), (1, '0.030*"day" + 0.026*"good" + 0.021*"im" + 0.019*"today" + 0.019*"work" + 0.014*"going" + 0.013*"night" + 0.013*"go" + 0.012*"time" + 0.012*"home"'), (2, '0.027*"im" + 0.017*"love" + 0.017*"u" + 0.014*"know" + 0.013*"lol" + 0.013*"like" + 0.011*"cant" + 0.010*"it" + 0.008*"get" + 0.008*"thats"'), (3, '0.009*"twitter" + 0.009*"yes" + 0.009*"thank" + 0.008*"oh" + 0.008*"new" + 0.007*"got" + 0.006*"still" + 0.006*"lol" + 0.006*"no" + 0.006*"please"'), (4, '0.010*"thanks" + 0.009*"trying" + 0.008*"ð" + 0.006*"head" + 0.006*"food" + 0.006*"cry" + 0.006*"½" + 0.005*"face" + 0.005*"follower" + 0.005*"s"')]


  and should_run_async(code)


In [71]:
from pyLDAvis import gensim, display

  and should_run_async(code)


In [78]:
pyLDAvis.enable_notebook(local = True)

vis = pyLDAvis.gensim.prepare(tweets_lda, tweets_bow, dictionary=tweets_lda.id2word)

  and should_run_async(code)


In [79]:
vis

  and should_run_async(code)
