In [1]:
from sklearn.externals import joblib
logistic_model = joblib.load('logistic_model.pkl') 

In [2]:
vectorizer = joblib.load('vectorizer.pkl')

In [36]:
import numpy as np
import pandas as pd
import config
import re
from nltk.corpus import stopwords
from ast import literal_eval
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer

pd.set_option('display.max_colwidth', 200)

In [4]:
english_file = "{}/data/twitter/tweets/trivadis/2016-11_english.csv".format(config.dir_prefix)

In [18]:
tweets_english = pd.read_csv(english_file, encoding='utf-8', 
                              usecols = ['id_str', 'user_id', 'created_at', 'lang', 'text', 'favorite_count', 'entities',
                                         'in_reply_to_screen_name', 'in_reply_to_status_id_str', 'in_reply_to_user_id',
                                         'retweet_count', 'quoted_status_id_str', 'text_tokenized', 'text_processed'],
                              converters={"text_tokenized": literal_eval, "text_processed": literal_eval})

def remove_hash(wordlist):
    return(list(map(lambda x: re.sub(r'^#','',x), wordlist)))

def remove_at(wordlist):
    return(list(map(lambda x: re.sub(r'^@','',x), wordlist)))
    
tweets_english['text_wo_#'] = tweets_english['text_processed'].apply(lambda x: remove_hash(x))
tweets_english['text_wo_#@'] = tweets_english['text_wo_#'].apply(lambda x: remove_at(x))

X_train_en = tweets_english['text_wo_#'].apply(lambda x: ' '.join(x))

stopwords_nltk = set(stopwords.words("english"))
#relevant_words = set(['not', 'nor', 'no', 'wasn', 'ain', 'aren', 'very', 'only', 'but', 'don', 'isn', 'weren'])
relevant_words = set()
additional_stopwords = set(['us'])
stopwords_filtered = list(additional_stopwords.union(stopwords_nltk.difference(relevant_words)))

In [19]:
words_matrix = vectorizer.transform(X_train_en)
words_matrix

<2352x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 13576 stored elements in Compressed Sparse Row format>

In [20]:
y_pred = logistic_model.predict(words_matrix)

In [21]:
len(X_train_en), len(y_pred)

(2352, 2352)

In [31]:
X_labeled = pd.DataFrame({'text': X_train_en, 'sentiment': y_pred})

In [32]:
X_labeled.head()

Unnamed: 0,sentiment,text
0,0,@AlphaHelixSMC It appears that Apple is still considering a foldable iPhone Would you use it https://t.co/kYdFRKzJm0
1,0,@AlphaHelixSMC It appears that Apple is still considering a foldable iPhone Would you use it https://t.co/kYdFRKzJm0
2,1,@dsgersten Learn more about @vfiorg and @CSECoalition https://t.co/lehZVz0VzJ
3,1,Windows 10 deployment tools techniques and processes The must-know concepts for IT professionals de https://t.co/b7qdMQYg8z IAMCP
4,1,Vcpkg updates Static linking is now available One month ago we announced the availability of Vcpkg https://t.co/V4Bw3EAtwz IAMCP


In [33]:
X_labeled.describe()

Unnamed: 0,sentiment
count,2352.0
mean,0.772959
std,0.419008
min,0.0
25%,1.0
50%,1.0
75%,1.0
max,1.0


In [34]:
X_labeled[X_labeled['sentiment'] == 0].drop_duplicates()

Unnamed: 0,sentiment,text
0,0,@AlphaHelixSMC It appears that Apple is still considering a foldable iPhone Would you use it https://t.co/kYdFRKzJm0
19,0,@connor_mc_d Android updates should just say This Release changes to make phone slower changes to make phone hotter changes
24,0,Every time I see tweet with image of paper airplane I want to click link to read about how they made it @DirkMomont @akuli @peterjeffcock
25,0,@HeyAlfredoDBA @opal_EPM @interRel @oracleace Great title Wish I could go back and change all of mine Look Smarter By Programming w PL SQL
28,0,@MGralike ORACLE DATABASE 12.2 0.1 Oracle Search Index for JSON Search Dataguide and VC s https://t.co/bJ5L62Fw1H https://t.co/OXh…
31,0,@MarkusWinand Is the ISO doc full of contradictions
39,0,Want to make the most of @OracleDatabase Hard to find a better or more experienced practitioner than @DulcianInc https://t.co/rEUgVb9E9q
40,0,@SharmanPete October 2016 Proactive BP got replaced https://t.co/blh3mQIQZY
61,0,All devices must work together be integrated seamlessly But connecting them is not enough https://t.co/wJK4R7PP1N
79,0,@FranckPachot @DOAGeV @dogy_doag it's much harder to break it with virtualization but it's not impossible ;)


## Now build a new model with all words

In [41]:
new_vectorizer = CountVectorizer(analyzer = "word", tokenizer = str.split, 
                                    stop_words = stopwords_filtered, max_features = 100000, ngram_range = (1,1))
new_matrix = new_vectorizer.fit_transform(X_labeled['text'])
new_feature_names = new_vectorizer.get_feature_names()
new_feature_names[:50]

['$',
 '%',
 '):',
 ');',
 '-->',
 '.',
 '..',
 '/8',
 '0',
 '0.0017',
 '0.1',
 '0.4',
 '000',
 '000th',
 '01031',
 '01:14',
 '02',
 '02:29',
 '02:44',
 '03',
 '03:06',
 '04',
 '0402',
 '04:45',
 '05',
 '05:59',
 '06:12',
 '1',
 '1.1',
 '1.3',
 '1.5',
 '1.7',
 '1/2',
 '1/3',
 '1/60',
 '10',
 '10.20',
 '10/11',
 '100',
 '100years',
 '101',
 '10:30',
 '10:37',
 '10:57',
 '11',
 '11.2',
 '11/09',
 '11/16',
 '11:15',
 '11:37']

In [42]:
from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression() 
logistic_model.fit(_matrix, y_train)
vocabulary = vectorizer.get_feature_names()
coefs = logistic_model.coef_
word_importances = pd.DataFrame({'word': vocabulary, 'coef': coefs.tolist()[0]})
word_importances_sorted = word_importances.sort_values(by='coef', ascending = False)
print(word_importances_sorted)

NameError: name '_matrix' is not defined

In [47]:
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()
test_tweet = 'This is % a $ cooool. #dummysmiley: :-) :-P <3 and --> some .. arrows < > -> <-- and what? why! where& (yes) [no] minus- plus+ \word /word2 ;-) :-( :-((( @zkajdan'
remove_list = ['?', '!', '.', '\\', '-', ':', '(', ')', '&', '’', '/', '[', ']', '…', '>', '<', '->', '<--', '+', '$', '%','-->','..']
test_tweet_tokenized = tokenizer.tokenize(test_tweet)
[i for i in test_tweet_tokenized if i not in remove_list]

['This',
 'is',
 'a',
 'cooool',
 '#dummysmiley',
 ':-)',
 ':-P',
 '<3',
 'and',
 'some',
 'arrows',
 'and',
 'what',
 'why',
 'where',
 'yes',
 'no',
 'minus',
 'plus',
 'word',
 'word',
 '2',
 ';-)',
 ':-(',
 ':-(',
 '@zkajdan']