In [6]:
import pandas as pd # provide sql-like data manipulation tools. very handy.
pd.options.mode.chained_assignment = None
import numpy as np # high dimensional vector computing library.
from copy import deepcopy
from string import punctuation
from random import shuffle

import gensim
from gensim.models.word2vec import Word2Vec # the word2vec model gensim class
LabeledSentence = gensim.models.doc2vec.LabeledSentence # we'll talk about this down below

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
tokenizer = TweetTokenizer()

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [105]:
def ingest():
    data = pd.read_csv('training_1_sentiment_0.csv', encoding = "ISO-8859-1")

    data1 = pd.read_csv('training_2_sentiment_0.csv', encoding = "ISO-8859-1")
    data2 = pd.read_csv('violent_tweets_sentiment_1.csv', encoding = "ISO-8859-1")
    data3 = pd.read_csv('rapeTweets.csv', encoding = "ISO-8859-1")
    data3[['Sentiment']] = data3[['Sentiment']].apply(pd.to_numeric, errors='ignore')

    data = data.append(data1)
    data = data.append(data2)
    data = data.append(data3)
    #remove extra columns
#     data.drop(['Date'], axis=1, inplace=True)
    data = data.filter(['Sentiment','SentimentText'])
    data = data[data.Sentiment.isnull() == False]
    data['Sentiment'] = data['Sentiment'].map(int)
    #data['Sentiment'] = data['Sentiment'].map( {4:1, 0:0} )
    data = data[data['SentimentText'].isnull() == False]
    data.reset_index(inplace=True)
    data.drop('index', axis=1, inplace=True)
    print ('dataset loaded with shape', data.shape)   
    return data

data = ingest()
data.head(5)

ValueError: invalid literal for int() with base 10: 'Sentiment'

In [48]:
#Make a better classifier
def tokenize(tweet):
    try:
        tweet = tweet.lower()
        tokens = tokenizer.tokenize(tweet)
        tokens = list(filter(lambda t: not t.startswith('@'), tokens))
        tokens = list(filter(lambda t: not t.startswith('#'), tokens))
        tokens = list(filter(lambda t: not t.startswith('http'), tokens))
        return tokens
    except:
        return 'NC'

In [49]:
def postprocess(data, n=1000000):
    data = data.head(n)
    data['tokens'] = data['SentimentText'].progress_map(tokenize)  ## progress_map is a variant of the map function plus a progress bar. Handy to monitor DataFrame creations.
    data = data[data.tokens != 'NC']
    data.reset_index(inplace=True)
    data.drop('index', inplace=True, axis=1)
    return data

In [50]:
data = postprocess(data)

progress-bar: 100%|██████████| 57047/57047 [00:05<00:00, 11166.24it/s]


In [51]:
data.head(5)

Unnamed: 0,Sentiment,SentimentText,tokens
0,0,@RealPatriot1976 @4everNeverTrump @MooreSenate...,"[you, forgot, "", thinks, pre-school, is, a, na..."
1,0,RT @boell_etics: En 2015 se adoptaron las regl...,"[rt, :, en, 2015, se, adoptaron, las, reglas, ..."
2,0,RT @Forbes: How To Look Like A Leader When You...,"[rt, :, how, to, look, like, a, leader, when, ..."
3,0,RT @SteveScalise: Our troops deserve it! https...,"[rt, :, our, troops, deserve, it, !]"
4,0,RT @LaScaldaferri: Smash cut to people standin...,"[rt, :, smash, cut, to, people, standing, in, ..."


In [52]:
n=1000000
x_train, x_test, y_train, y_test = train_test_split(np.array(data.head(n).tokens),
                                                    np.array(data.head(n).Sentiment), test_size=0.2)

In [53]:
x_train[0]

['roads',
 ',',
 'schools',
 ',',
 'defense',
 ',',
 'weather',
 ',',
 'research',
 ',',
 'healthcare',
 'are',
 'all',
 'waste',
 ';',
 'because',
 'you',
 '\\',
 'u2026']

In [54]:
n_dim=200
token_count = sum([len(sentence) for sentence in x_train])
tweet_w2v = Word2Vec(size=n_dim, min_count=10)
tweet_w2v.build_vocab(x_train)
#tweet_w2v.train(x_train, total_examples=token_count, epochs=15)
tweet_w2v.train(x_train, total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter)
#total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.iter

3092475

In [55]:
tweet_w2v['good']

  """Entry point for launching an IPython kernel.


array([ -2.62365103e-01,  -1.22907841e+00,   3.05420667e-01,
         5.35618424e-01,   4.90612209e-01,   1.07360184e+00,
        -3.15761864e-01,   1.32183847e-03,  -1.64737141e+00,
        -3.48463178e-01,   1.16813409e+00,   2.28720784e-01,
         2.61147976e-01,  -1.55384743e+00,  -7.40929306e-01,
         5.20689189e-01,  -9.23383117e-01,   1.75207049e-01,
         6.49505496e-01,   5.08321166e-01,   1.45856693e-01,
         3.99922729e-01,  -1.42671013e+00,  -3.18788677e-01,
        -5.79857945e-01,  -2.03953072e-01,   3.89585346e-01,
         9.66223478e-01,  -3.35735381e-01,  -3.54420304e-01,
         5.31349838e-01,   9.23538864e-01,  -2.80488819e-01,
        -6.54184341e-01,   6.21118248e-01,  -4.48435396e-01,
        -5.98192692e-01,  -6.90549791e-01,  -7.86706924e-01,
        -1.02881871e-01,  -4.43934090e-02,   4.03930932e-01,
        -4.79687214e-01,  -3.02368373e-01,  -7.74409294e-01,
        -8.43152642e-01,  -9.27366734e-01,  -2.44783945e-02,
         6.24480247e-01,

In [56]:
tweet_w2v.most_similar('good')

  """Entry point for launching an IPython kernel.


[('great', 0.8634505271911621),
 ('wonderful', 0.8304198384284973),
 ('happy', 0.805043637752533),
 ('fun', 0.770256519317627),
 ('let', 0.7636302709579468),
 ('long', 0.7626844644546509),
 ('. .', 0.7594529390335083),
 ('better', 0.7498566508293152),
 ('pretty', 0.7480359077453613),
 ('too', 0.747560441493988)]

In [57]:
tweet_w2v.most_similar('rape')

  """Entry point for launching an IPython kernel.


[('cancer', 0.8544315099716187),
 ('undergoing', 0.8355053663253784),
 ('gangs', 0.8303781747817993),
 ('treatment', 0.8297376036643982),
 ('children', 0.8246831893920898),
 ('rohingya', 0.805081307888031),
 ('victims', 0.7936364412307739),
 ('develop', 0.7934758067131042),
 ('diabetes', 0.792036235332489),
 ('shots', 0.7877124547958374)]

In [58]:
tweet_w2v.most_similar('facebook')

  """Entry point for launching an IPython kernel.


[('store', 0.87354576587677),
 ('dragon', 0.8732419013977051),
 ('mocking', 0.8677797317504883),
 ('32', 0.8674526214599609),
 ('trailer', 0.8566007614135742),
 ('china', 0.8544414043426514),
 ('cod', 0.8483848571777344),
 ('vlog', 0.8468364477157593),
 ('featuring', 0.8451340198516846),
 ('stars', 0.8435724973678589)]

In [59]:
#tweet_w2v.most_similar('ocean')

In [60]:
# importing bokeh library for interactive dataviz
"""
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

# defining the chart
output_notebook()
plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="A map of 10000 word vectors",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

# getting a list of word vectors. limit to 10000. each is of 200 dimensions
word_vectors = [tweet_w2v[w] for w in list(tweet_w2v.wv.vocab.keys())[:5000]]

# dimensionality reduction. converting the vectors to 2d vectors
from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_w2v = tsne_model.fit_transform(word_vectors)

# putting everything in a dataframe
tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
tsne_df['words'] = list(tweet_w2v.wv.vocab.keys())[:5000]

# plotting. the corresponding word appears when you hover on the data point.
plot_tfidf.scatter(x='x', y='y', source=tsne_df)
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"word": "@words"}
show(plot_tfidf)
"""

'\nimport bokeh.plotting as bp\nfrom bokeh.models import HoverTool, BoxSelectTool\nfrom bokeh.plotting import figure, show, output_notebook\n\n# defining the chart\noutput_notebook()\nplot_tfidf = bp.figure(plot_width=700, plot_height=600, title="A map of 10000 word vectors",\n    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",\n    x_axis_type=None, y_axis_type=None, min_border=1)\n\n# getting a list of word vectors. limit to 10000. each is of 200 dimensions\nword_vectors = [tweet_w2v[w] for w in list(tweet_w2v.wv.vocab.keys())[:5000]]\n\n# dimensionality reduction. converting the vectors to 2d vectors\nfrom sklearn.manifold import TSNE\ntsne_model = TSNE(n_components=2, verbose=1, random_state=0)\ntsne_w2v = tsne_model.fit_transform(word_vectors)\n\n# putting everything in a dataframe\ntsne_df = pd.DataFrame(tsne_w2v, columns=[\'x\', \'y\'])\ntsne_df[\'words\'] = list(tweet_w2v.wv.vocab.keys())[:5000]\n\n# plotting. the corresponding word appears when you hover on the data p

In [61]:
print ('building tf-idf matrix ...')
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform(x_train)
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print ('vocab size :', len(tfidf))

building tf-idf matrix ...
vocab size : 7059


In [62]:
#Now let's define a function that, given a list of tweet tokens, creates an averaged tweet vector.
def buildWordVector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += tweet_w2v[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [63]:
from sklearn.preprocessing import scale
train_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in x_train])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in x_test])
test_vecs_w2v = scale(test_vecs_w2v)

  import sys


In [64]:
from keras.models import Sequential
from keras.layers import Activation, Dense
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=200))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(train_vecs_w2v, y_train, epochs=9, batch_size=32, verbose=2)

Epoch 1/9
 - 1s - loss: 0.2188 - acc: 0.9171
Epoch 2/9
 - 1s - loss: 0.1889 - acc: 0.9327
Epoch 3/9
 - 1s - loss: 0.1807 - acc: 0.9366
Epoch 4/9
 - 1s - loss: 0.1747 - acc: 0.9381
Epoch 5/9
 - 1s - loss: 0.1707 - acc: 0.9411
Epoch 6/9
 - 1s - loss: 0.1683 - acc: 0.9409
Epoch 7/9
 - 1s - loss: 0.1650 - acc: 0.9418
Epoch 8/9
 - 1s - loss: 0.1623 - acc: 0.9435
Epoch 9/9
 - 1s - loss: 0.1607 - acc: 0.9443


<keras.callbacks.History at 0x1a14534d68>

In [65]:
score = model.evaluate(test_vecs_w2v, y_test, batch_size=128, verbose=2)
print (score[1])

0.941454864165


In [72]:
#for checking on test data
def ingest2():
    data = pd.read_csv('train.csv', encoding = "ISO-8859-1")
#     data.drop(['Date', 'Usage'], axis=1, inplace=True)
    data = data.filter(['Sentiment','SentimentText'])
    data = data[data.Sentiment.isnull() == False]
    data['Sentiment'] = data['Sentiment'].map(int)
    #data['Sentiment'] = data['Sentiment'].map( {4:1, 0:0} )
    data = data[data['SentimentText'].isnull() == False]
    data.reset_index(inplace=True)
    data.drop('index', axis=1, inplace=True)
    print ('dataset loaded with shape', data.shape)   
    return data

data2 = ingest2()
data2.head(5)

dataset loaded with shape (3947, 2)


Unnamed: 0,Sentiment,SentimentText
0,1,"""You fuck your dad."""
1,0,"""i really don't understand your point.\xa0 It ..."
2,0,"""A\\xc2\\xa0majority of Canadians can and has ..."
3,0,"""listen if you dont wanna get married to a man..."
4,0,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd..."


In [73]:
testData = postprocess(data2)

progress-bar: 100%|██████████| 3947/3947 [00:00<00:00, 6293.45it/s]


In [74]:
n=1000000
x_test_train, x_test, y_test_train, y_test = train_test_split(np.array(testData.head(n).tokens),
                                                    np.array(testData.head(n).Sentiment), test_size=0.0)

In [75]:
test_train_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in x_test_train])
test_train_vecs_w2v = scale(test_train_vecs_w2v)

  import sys


In [76]:
#Accuracy on test data
score = model.evaluate(test_train_vecs_w2v, y_test_train, batch_size=128, verbose=2)
print (score[1])

0.61616417481


In [39]:
#model.predict()   <- returns a list of output

In [112]:
fields = ['created_at', 'id_str','SentimentText', 'user-id_str', 'user-name', 'user-screen_name',
          'user-location', 'user-url', 'user-description', 'user-protected', 'user-verified',
          'user-followers_count', 'user-friends_count', 'user-listed_count', 'user-favourites_count',
          'user-statuses_count', 'user-created_at', 'user-utc_offset', 'user-time_zone', 'user-geo_enabled',
          'user-lang', 'user-following', 'geo', 'coordinates', 'place', 'contributors', 'is_quote_status',
          'quote_count', 'reply_count', 'retweet_count', 'favorite_count', 'favorited', 'retweeted',
          'filter_level', 'lang',]

In [116]:
testDF = pd.read_csv("rawData.csv")

In [118]:
testDF.columns = fields

In [121]:
testDF.head(1)

Unnamed: 0,created_at,id_str,SentimentText,user-id_str,user-name,user-screen_name,user-location,user-url,user-description,user-protected,...,contributors,is_quote_status,quote_count,reply_count,retweet_count,favorite_count,favorited,retweeted,filter_level,lang
0,2017-12-14 21:01:44,941412915990749184,RT @mugen: girls after break up:\n\ndick appoi...,2604225667,Jolly Ty,tje___,Nine19 Raised,http://vsco.com/indigoty,"I’m a disrespectful, but i’m woke. #blm #ncat",False,...,,False,0,0,0,0,False,False,low,en


In [122]:
testDF = testDF.filter(['SentimentText'])

In [123]:
testDF.head(1)

Unnamed: 0,SentimentText
0,RT @mugen: girls after break up:\n\ndick appoi...
