In [64]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23.
# Try the vendored copy first so pickles written with it keep loading exactly as before,
# and fall back to the standalone joblib package on installs where it no longer exists.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

# NOTE: joblib.load unpickles arbitrary Python objects -- only load files from a trusted source.
# Previously pickled classifier; used below through predict() and predict_proba().
logistic_model = joblib.load('logistic_model.pkl')

In [58]:
# Load the pickled vectorizer; it is used below to turn the tweet strings into the
# feature matrix that logistic_model scores.
vectorizer = joblib.load('vectorizer.pkl')

In [59]:
import numpy as np
import pandas as pd
import config
import re
from nltk.corpus import stopwords
from ast import literal_eval
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer

# Let pandas show up to 200 characters per cell so long text columns are readable in outputs.
pd.set_option('display.max_colwidth', 200)

In [60]:
# Location of the English tweets CSV, relative to the directory prefix taken from config.
tweets_path_template = "{}/data/twitter/tweets/trivadis/2016-11_english.csv"
english_file = tweets_path_template.format(config.dir_prefix)

In [61]:
# Only these columns are read from the CSV.
tweet_columns = [
    'id_str', 'user_id', 'created_at', 'lang', 'text', 'favorite_count', 'entities',
    'in_reply_to_screen_name', 'in_reply_to_status_id_str', 'in_reply_to_user_id',
    'retweet_count', 'quoted_status_id_str', 'text_tokenized', 'text_processed',
]

# The two token columns are parsed with literal_eval while reading, so each cell
# becomes a Python object instead of its string representation.
token_converters = {"text_tokenized": literal_eval, "text_processed": literal_eval}

tweets_english = pd.read_csv(
    english_file,
    encoding='utf-8',
    usecols=tweet_columns,
    converters=token_converters,
)

def remove_hash(wordlist):
    """Return a new list in which one leading '#' is stripped from every word.

    Words that do not start with '#' are passed through unchanged; a '#'
    anywhere else in the word is left alone.
    """
    return [re.sub(r'^#', '', word) for word in wordlist]

def remove_at(wordlist):
    """Return a new list in which one leading '@' is stripped from every word.

    Words that do not start with '@' are passed through unchanged; an '@'
    anywhere else in the word is left alone.
    """
    return [re.sub(r'^@', '', word) for word in wordlist]
    
# Derive two token-list columns: one without leading '#', one additionally without leading '@'.
tweets_english['text_wo_#'] = tweets_english['text_processed'].apply(remove_hash)
tweets_english['text_wo_#@'] = tweets_english['text_wo_#'].apply(remove_at)

# Join each token list back into a single space-separated string for the vectorizer.
# NOTE(review): this is built from 'text_wo_#', so '@' prefixes are kept; 'text_wo_#@'
# is not used in the cells shown here -- confirm that is intended.
X_train_en = tweets_english['text_wo_#'].apply(' '.join)

# NLTK's English stopword list, as a set for the set arithmetic below.
stopwords_nltk = set(stopwords.words("english"))

# Words to keep out of the stopword list. The commented-out variant lists candidate words
# to retain; the active definition is empty, so no NLTK stopword is spared.
#relevant_words = set(['not', 'nor', 'no', 'wasn', 'ain', 'aren', 'very', 'only', 'but', 'don', 'isn', 'weren'])
relevant_words = set()

# Extra stopwords added on top of the NLTK list.
additional_stopwords = {'us'}

# Final list: (NLTK stopwords minus the retained words) plus the extra stopwords.
nltk_stopwords_kept = stopwords_nltk - relevant_words
stopwords_filtered = list(additional_stopwords | nltk_stopwords_kept)

In [62]:
# Encode the tweet strings with the already-fitted vectorizer (transform only, no refit),
# then display the resulting matrix summary.
words_matrix = vectorizer.transform(X_train_en)
words_matrix

<2352x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 13576 stored elements in Compressed Sparse Row format>

In [68]:
# Score every tweet with the loaded classifier: hard labels and per-class probabilities.
y_pred = logistic_model.predict(words_matrix)
y_pred_prob = logistic_model.predict_proba(words_matrix)

In [69]:
# Sanity check: there should be exactly one prediction per input tweet.
len(X_train_en), len(y_pred)

(2352, 2352)

In [72]:
# Peek at the first ten predicted labels next to their per-class probability rows.
y_pred[:10], y_pred_prob[:10]

(array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1]), array([[ 0.7780896 ,  0.2219104 ],
        [ 0.7780896 ,  0.2219104 ],
        [ 0.38734115,  0.61265885],
        [ 0.3392787 ,  0.6607213 ],
        [ 0.41796271,  0.58203729],
        [ 0.18704924,  0.81295076],
        [ 0.43564413,  0.56435587],
        [ 0.33256163,  0.66743837],
        [ 0.4103714 ,  0.5896286 ],
        [ 0.1931281 ,  0.8068719 ]]))

In [73]:
# Second column of the probability matrix (presumably the probability of label 1,
# given that the predicted labels are 0/1 -- confirm against logistic_model.classes_).
y_pred_prob[:,1]

array([ 0.2219104 ,  0.2219104 ,  0.61265885, ...,  0.58498887,
        0.72374583,  0.56525636])

In [74]:
# Collect each tweet's text together with its predicted label and the value from
# column 1 of the predict_proba output.
prob_column_1 = y_pred_prob[:, 1]
X_labeled = pd.DataFrame({'text': X_train_en, 'sentiment': y_pred, 'prob': prob_column_1})

In [75]:
# Show the first rows of the labelled frame.
X_labeled.head()

Unnamed: 0,prob,sentiment,text
0,0.22191,0,@AlphaHelixSMC It appears that Apple is still considering a foldable iPhone Would you use it https://t.co/kYdFRKzJm0
1,0.22191,0,@AlphaHelixSMC It appears that Apple is still considering a foldable iPhone Would you use it https://t.co/kYdFRKzJm0
2,0.612659,1,@dsgersten Learn more about @vfiorg and @CSECoalition https://t.co/lehZVz0VzJ
3,0.660721,1,Windows 10 deployment tools techniques and processes The must-know concepts for IT professionals de https://t.co/b7qdMQYg8z IAMCP
4,0.582037,1,Vcpkg updates Static linking is now available One month ago we announced the availability of Vcpkg https://t.co/V4Bw3EAtwz IAMCP


In [76]:
# Summary statistics of the numeric columns (predicted label and probability).
X_labeled.describe()

Unnamed: 0,prob,sentiment
count,2352.0,2352.0
mean,0.631852,0.772959
std,0.208081,0.419008
min,0.00746,0.0
25%,0.522758,1.0
50%,0.649853,1.0
75%,0.786578,1.0
max,0.993937,1.0


In [77]:
# Distinct rows that the loaded model labelled 0.
is_label_0 = X_labeled['sentiment'] == 0
X_labeled.loc[is_label_0].drop_duplicates()

Unnamed: 0,prob,sentiment,text
0,0.221910,0,@AlphaHelixSMC It appears that Apple is still considering a foldable iPhone Would you use it https://t.co/kYdFRKzJm0
19,0.311920,0,@connor_mc_d Android updates should just say This Release changes to make phone slower changes to make phone hotter changes
24,0.398722,0,Every time I see tweet with image of paper airplane I want to click link to read about how they made it @DirkMomont @akuli @peterjeffcock
25,0.133303,0,@HeyAlfredoDBA @opal_EPM @interRel @oracleace Great title Wish I could go back and change all of mine Look Smarter By Programming w PL SQL
28,0.447585,0,@MGralike ORACLE DATABASE 12.2 0.1 Oracle Search Index for JSON Search Dataguide and VC s https://t.co/bJ5L62Fw1H https://t.co/OXh…
31,0.421628,0,@MarkusWinand Is the ISO doc full of contradictions
39,0.260854,0,Want to make the most of @OracleDatabase Hard to find a better or more experienced practitioner than @DulcianInc https://t.co/rEUgVb9E9q
40,0.359349,0,@SharmanPete October 2016 Proactive BP got replaced https://t.co/blh3mQIQZY
61,0.112768,0,All devices must work together be integrated seamlessly But connecting them is not enough https://t.co/wJK4R7PP1N
79,0.048163,0,@FranckPachot @DOAGeV @dogy_doag it's much harder to break it with virtualization but it's not impossible ;)


## Now build a new model with all words

In [78]:
# Fit a fresh bag-of-words vocabulary on the tweet texts themselves: whitespace tokenisation,
# unigrams only, filtered stopwords removed, capped at 100000 features.
new_vectorizer = CountVectorizer(analyzer = "word", tokenizer = str.split, 
                                    stop_words = stopwords_filtered, max_features = 100000, ngram_range = (1,1))
new_matrix = new_vectorizer.fit_transform(X_labeled['text'])

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(), which returns an ndarray. Use whichever exists and keep the
# result a plain list so the rest of the notebook sees the same type.
if hasattr(new_vectorizer, 'get_feature_names_out'):
    new_feature_names = new_vectorizer.get_feature_names_out().tolist()
else:
    new_feature_names = new_vectorizer.get_feature_names()
new_feature_names[:50]

['$',
 '%',
 '):',
 ');',
 '-->',
 '.',
 '..',
 '/8',
 '0',
 '0.0017',
 '0.1',
 '0.4',
 '000',
 '000th',
 '01031',
 '01:14',
 '02',
 '02:29',
 '02:44',
 '03',
 '03:06',
 '04',
 '0402',
 '04:45',
 '05',
 '05:59',
 '06:12',
 '1',
 '1.1',
 '1.3',
 '1.5',
 '1.7',
 '1/2',
 '1/3',
 '1/60',
 '10',
 '10.20',
 '10/11',
 '100',
 '100years',
 '101',
 '10:30',
 '10:37',
 '10:57',
 '11',
 '11.2',
 '11/09',
 '11/16',
 '11:15',
 '11:37']

In [79]:
# Training target for the new model. These labels are the predictions of the loaded
# classifier (X_labeled['sentiment'] was filled from y_pred), not hand-annotated ground truth.
y = X_labeled['sentiment']

In [80]:
from sklearn.linear_model import LogisticRegression

# NOTE(review): this rebinds `logistic_model`, replacing the classifier loaded from
# 'logistic_model.pkl'. After this cell runs, the earlier prediction cells cannot be
# re-run against the original model without reloading it.
logistic_model = LogisticRegression() 
logistic_model.fit(new_matrix, y)

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() when available and convert its ndarray to a plain list.
if hasattr(new_vectorizer, 'get_feature_names_out'):
    vocabulary = new_vectorizer.get_feature_names_out().tolist()
else:
    vocabulary = new_vectorizer.get_feature_names()

# coef_ has one row of per-feature weights here; pair each weight with its word and
# rank from the most positive to the most negative weight.
coefs = logistic_model.coef_
word_importances = pd.DataFrame({'word': vocabulary, 'coef': coefs.tolist()[0]})
word_importances_sorted = word_importances.sort_values(by='coef', ascending = False)
print(word_importances_sorted)

          coef        word
6226  1.760796      thanks
2955  1.403099        good
1922  1.283724       cloud
2763  1.231435       first
1616  1.226562     bigdata
2975  1.222469       great
4487  1.100433       learn
1249  0.946486        aced
2588  0.940701       event
6677  0.924985     working
5291  0.917230        post
4906  0.868813        nice
3033  0.820041       happy
4359  0.817983        join
4961  0.792030         obj
2875  0.776606      future
6353  0.767767      trends
4191  0.758316   important
1174  0.741658      @ukoug
1453  0.734291      around
1960  0.729872        come
1533  0.695489       azure
4465  0.690110      latest
6012  0.682709        stay
1409  0.670256        apex
2614  0.664168   excellent
4585  0.658856        love
5819  0.652467        show
1342  0.650447        also
2227  0.647972          db
...        ...         ...
4305 -0.841487      iphone
4320 -0.841808      issues
2535 -0.852612      enough
5555 -0.878466   reminders
5446 -0.880236     quickly
5