In [1]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone joblib package and fall back to the vendored copy
# only on old installations that do not ship it separately.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files that come from a trusted source.
logistic_model = joblib.load('logistic_model.pkl')

In [2]:
# Load the already-fitted vectorizer; below it is only used via transform()
# (never re-fitted) to turn tweet text into the feature matrix.
# NOTE: joblib.load unpickles the file - only load it from a trusted source.
vectorizer = joblib.load('vectorizer.pkl')

In [3]:
import numpy as np
import pandas as pd
import config
import re
from nltk.corpus import stopwords
from ast import literal_eval
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer

# Allow up to 200 characters per cell when displaying frames, so tweet text is
# not cut off at the default column width.
pd.set_option('display.max_colwidth', 200)

In [4]:
# Path of the tweet CSV to score, rooted at the project-specific
# config.dir_prefix (presumably the November 2016 English tweets, judging by
# the file name - confirm).
english_file = "{}/data/twitter/tweets/trivadis/2016-11_english.csv".format(config.dir_prefix)

In [5]:
# Read the tweet export, restricted to the listed columns. The two token
# columns are run through ast.literal_eval on load (presumably they are stored
# as stringified Python lists - the code below treats them as lists of words).
tweets_english = pd.read_csv(
    english_file,
    encoding='utf-8',
    usecols=[
        'id_str', 'user_id', 'created_at', 'lang', 'text',
        'favorite_count', 'entities',
        'in_reply_to_screen_name', 'in_reply_to_status_id_str', 'in_reply_to_user_id',
        'retweet_count', 'quoted_status_id_str',
        'text_tokenized', 'text_processed',
    ],
    converters=dict.fromkeys(['text_tokenized', 'text_processed'], literal_eval),
)

def remove_hash(wordlist):
    """Return a new list in which each word has one leading '#' removed.

    Only a '#' at the very start of a word is stripped; a '#' anywhere else
    in the word is left alone. The input list is not modified.
    """
    leading_hash = re.compile(r'^#')
    return [leading_hash.sub('', word) for word in wordlist]

def remove_at(wordlist):
    """Return a new list in which each word has one leading '@' removed.

    Only an '@' at the very start of a word is stripped; an '@' anywhere else
    in the word is left alone. The input list is not modified.
    """
    leading_at = re.compile(r'^@')
    return [leading_at.sub('', word) for word in wordlist]
    
# Add two derived token-list columns: one without leading '#', and one that
# additionally has leading '@' removed.
tweets_english['text_wo_#'] = tweets_english['text_processed'].apply(remove_hash)
tweets_english['text_wo_#@'] = tweets_english['text_wo_#'].apply(remove_at)

# Re-join each token list into a single space-separated string. This is built
# from the 'text_wo_#' column, so leading '@' of mentions stays in the text.
X_train_en = tweets_english['text_wo_#'].apply(' '.join)

# Stop-word list: NLTK's English stop words, minus any words that should be
# kept as features, plus a few extra words to drop.
stopwords_nltk = set(stopwords.words("english"))
# Words to keep even though NLTK lists them. Currently none; the candidate
# list below is left disabled.
#relevant_words = set(['not', 'nor', 'no', 'wasn', 'ain', 'aren', 'very', 'only', 'but', 'don', 'isn', 'weren'])
relevant_words = set()
additional_stopwords = {'us'}
stopwords_filtered = list(additional_stopwords | (stopwords_nltk - relevant_words))

In [6]:
# Vectorize the tweets with the loaded vectorizer (transform only, no re-fit),
# then display the resulting sparse matrix summary.
words_matrix = vectorizer.transform(X_train_en)
words_matrix

<8848x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 51689 stored elements in Compressed Sparse Row format>

In [7]:
# Score every tweet with the loaded model: hard class labels plus per-class
# probabilities.
# NOTE: `logistic_model` is rebound to a newly trained model in the
# "new model" section further down. Re-running this cell after that point
# would apply the new model to this matrix (likely a feature-count mismatch),
# so run the notebook top to bottom.
y_pred = logistic_model.predict(words_matrix)
y_pred_prob = logistic_model.predict_proba(words_matrix)

In [8]:
# Sanity check: the number of predictions should equal the number of texts.
len(X_train_en), len(y_pred)

(8848, 8848)

In [9]:
# Peek at the first ten labels next to their two-column probability rows.
y_pred[:10], y_pred_prob[:10]

(array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1]), array([[ 0.7780896 ,  0.2219104 ],
        [ 0.7780896 ,  0.2219104 ],
        [ 0.38734115,  0.61265885],
        [ 0.3392787 ,  0.6607213 ],
        [ 0.41796271,  0.58203729],
        [ 0.18704924,  0.81295076],
        [ 0.43564413,  0.56435587],
        [ 0.33256163,  0.66743837],
        [ 0.4103714 ,  0.5896286 ],
        [ 0.1931281 ,  0.8068719 ]]))

In [10]:
# Second probability column only. In the recorded preview this is the column
# that exceeds 0.5 exactly where the predicted label is 1.
y_pred_prob[:,1]

array([ 0.2219104 ,  0.2219104 ,  0.61265885, ...,  0.72501537,
        0.60737947,  0.59363513])

In [11]:
# Bundle each tweet's text with its predicted label and the second-column
# probability from predict_proba.
# NOTE(review): the recorded output lists the columns alphabetically
# (prob, sentiment, text); presumably newer pandas versions keep the dict's
# insertion order instead - confirm if column order matters downstream.
X_labeled = pd.DataFrame({'text': X_train_en, 'sentiment': y_pred, 'prob': y_pred_prob[:,1]})

In [12]:
# Preview the labeled frame. In the recorded output rows 0 and 1 are identical,
# i.e. the data contains duplicated tweets.
X_labeled.head()

Unnamed: 0,prob,sentiment,text
0,0.22191,0,@AlphaHelixSMC It appears that Apple is still considering a foldable iPhone Would you use it https://t.co/kYdFRKzJm0
1,0.22191,0,@AlphaHelixSMC It appears that Apple is still considering a foldable iPhone Would you use it https://t.co/kYdFRKzJm0
2,0.612659,1,@dsgersten Learn more about @vfiorg and @CSECoalition https://t.co/lehZVz0VzJ
3,0.660721,1,Windows 10 deployment tools techniques and processes The must-know concepts for IT professionals de https://t.co/b7qdMQYg8z IAMCP
4,0.582037,1,Vcpkg updates Static linking is now available One month ago we announced the availability of Vcpkg https://t.co/V4Bw3EAtwz IAMCP


In [13]:
# Summary statistics. Because `sentiment` is 0/1, its mean is the share of
# tweets labeled 1 (about 0.77 in the recorded output).
X_labeled.describe()

Unnamed: 0,prob,sentiment
count,8848.0,8848.0
mean,0.633092,0.772604
std,0.210701,0.419174
min,0.00414,0.0
25%,0.517983,1.0
50%,0.65503,1.0
75%,0.791489,1.0
max,0.998374,1.0


In [14]:
# Show the distinct rows that were labeled 0 (exact duplicates collapsed).
X_labeled.loc[X_labeled['sentiment'].eq(0)].drop_duplicates()

Unnamed: 0,prob,sentiment,text
0,0.221910,0,@AlphaHelixSMC It appears that Apple is still considering a foldable iPhone Would you use it https://t.co/kYdFRKzJm0
19,0.311920,0,@connor_mc_d Android updates should just say This Release changes to make phone slower changes to make phone hotter changes
24,0.398722,0,Every time I see tweet with image of paper airplane I want to click link to read about how they made it @DirkMomont @akuli @peterjeffcock
25,0.133303,0,@HeyAlfredoDBA @opal_EPM @interRel @oracleace Great title Wish I could go back and change all of mine Look Smarter By Programming w PL SQL
28,0.447585,0,@MGralike ORACLE DATABASE 12.2 0.1 Oracle Search Index for JSON Search Dataguide and VC s https://t.co/bJ5L62Fw1H https://t.co/OXh…
31,0.421628,0,@MarkusWinand Is the ISO doc full of contradictions
39,0.260854,0,Want to make the most of @OracleDatabase Hard to find a better or more experienced practitioner than @DulcianInc https://t.co/rEUgVb9E9q
40,0.359349,0,@SharmanPete October 2016 Proactive BP got replaced https://t.co/blh3mQIQZY
61,0.112768,0,All devices must work together be integrated seamlessly But connecting them is not enough https://t.co/wJK4R7PP1N
79,0.048163,0,@FranckPachot @DOAGeV @dogy_doag it's much harder to break it with virtualization but it's not impossible ;)


## Now build a new model with all words

In [15]:
# Fit a fresh bag-of-words vocabulary on the labeled tweets themselves:
# unigrams only, whitespace tokenization, filtered stop words removed.
new_vectorizer = CountVectorizer(analyzer="word", tokenizer=str.split,
                                 stop_words=stopwords_filtered, max_features=100000,
                                 ngram_range=(1, 1))
new_matrix = new_vectorizer.fit_transform(X_labeled['text'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(). Use whichever this installation provides,
# and normalise to a plain list so the result type is the same either way.
if hasattr(new_vectorizer, "get_feature_names_out"):
    new_feature_names = new_vectorizer.get_feature_names_out().tolist()
else:
    new_feature_names = new_vectorizer.get_feature_names()
new_feature_names[:50]

['(:',
 '(=',
 '):',
 ');',
 '-',
 '-0.10',
 '-0.20',
 '.',
 '..',
 '...',
 '/8',
 '/=',
 '0',
 '0-229',
 '0.0',
 '0.0017',
 '0.02',
 '0.025',
 '0.1',
 '0.2',
 '0.3',
 '0.4',
 '0.49',
 '00',
 '000',
 '000,0000000',
 '00059',
 '000th',
 '01031',
 '01:14',
 '02',
 '02:20',
 '02:29',
 '02:44',
 '02:58',
 '03',
 '03:06',
 '03:09',
 '03:19',
 '04',
 '0402',
 '04:45',
 '05',
 '05:59',
 '06',
 '06:12',
 '07:06',
 '08',
 '08/2016',
 '09']

In [16]:
# Training target for the new model.
# NOTE: X_labeled['sentiment'] was filled from y_pred, so these are labels
# predicted by the loaded model, not independent ground-truth annotations.
y = X_labeled['sentiment']

In [17]:
from sklearn.linear_model import LogisticRegression

# NOTE: this rebinds `logistic_model`, replacing the model that was loaded from
# disk at the top of the notebook. Cells above that call logistic_model must
# not be re-run after this one without reloading it.
logistic_model = LogisticRegression()
logistic_model.fit(new_matrix, y)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(). Use whichever is available and keep a
# plain list, as before.
if hasattr(new_vectorizer, "get_feature_names_out"):
    vocabulary = new_vectorizer.get_feature_names_out().tolist()
else:
    vocabulary = new_vectorizer.get_feature_names()

coefs = logistic_model.coef_
# Pair every vocabulary word with its coefficient from the first row of coef_
# (the target here only takes the values 0 and 1). Indexing the row directly
# avoids converting the whole coefficient matrix to nested lists first.
word_importances = pd.DataFrame({'word': vocabulary, 'coef': coefs[0]})
# Largest positive coefficients first.
word_importances_sorted = word_importances.sort_values(by='coef', ascending=False)
print(word_importances_sorted)

           coef           word
16288  2.182070         thanks
15315  1.767712          share
4865   1.686835        bigdata
7866   1.659019          great
7815   1.636789           good
12035  1.611732           join
16286  1.565182          thank
11822  1.545531    interesting
5555   1.519439          cloud
8003   1.487266          happy
13274  1.474853           nice
7115   1.453083      excellent
4646   1.419229        awesome
4946   1.267557           blog
5882   1.266228           cool
7368   1.261529        finally
7291   1.231715       favorite
12706  1.226149      marketing
4292   1.204807        amazing
7540   1.201114        forward
12550  1.171989           love
4837   1.169131           best
15052  1.161541       sangam16
11638  1.120605      important
6448   1.102869    development
6623   1.099059           done
15084  1.067831          scala
7623   1.057533            fun
14573  1.055106          ready
7402   1.036352          first
...         ...            ...
11690 -1