In [344]:
import pickle
import nltk
import re
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.tokenize.casual import TweetTokenizer
from nltk.stem import WordNetLemmatizer
import emoji

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/peiyuns/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [345]:
# English stop words held in a set; preprocess() tests token membership against it.
stop_words = set(stopwords.words('english'))
# NOTE(review): only the "stopwords" corpus is downloaded in the imports cell;
# WordNetLemmatizer presumably also needs the "wordnet" corpus when lemmatize()
# is first called — confirm it is installed on a fresh machine.
lemmatizer = WordNetLemmatizer()

In [346]:
# Load the trained classifier and the vectorizer it was fitted with.
# Context managers make sure both file handles are closed after loading.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load artifacts that come from a trusted source.
with open("Logistic_{C0.1}_{l1}.pk1", "rb") as clf_file:
    logistic_clf = pickle.load(clf_file)
with open("DictVectorizer.pk1", "rb") as vec_file:
    vec = pickle.load(vec_file)

In [347]:
# Text emoticons that preprocess() keeps verbatim as features.
# NOTE(review): "XD" is purely alphabetic, so in preprocess() it is caught by the
# regular-word branch (unless it is a stop word) before the symbol branch is
# reached — confirm whether listing it here has any effect.
frequent_symbols = [":)", ":(", ":D", ":-)", ":p", ";)", ":/", "XD"]

In [348]:
# Build one token-count dictionary per tweet.
def get_dict(tweets):
    """Count token occurrences for each tweet.

    Args:
        tweets: iterable of tweets, each itself an iterable of hashable tokens.

    Returns:
        A list with one dict per tweet, mapping each token to the number of
        times it occurs in that tweet (keys in first-seen order).
    """
    counts_per_tweet = []
    for tokens in tweets:
        counts = {}
        for token in tokens:
            # Missing tokens start at 0, so the first sighting yields 1.
            counts[token] = counts.get(token, 0) + 1
        counts_per_tweet.append(counts)
    return counts_per_tweet

In [349]:
def _is_emoji(token):
    """Return True if ``token`` is a single emoji, whatever ``emoji`` version is installed."""
    # emoji >= 2.0 removed UNICODE_EMOJI and offers is_emoji() instead.
    if hasattr(emoji, "is_emoji"):
        return emoji.is_emoji(token)
    table = emoji.UNICODE_EMOJI
    # Later 1.x releases nest the table per language ({"en": {...}, ...});
    # older releases expose a flat emoji -> name mapping.
    if "en" in table:
        table = table["en"]
    return token in table


def preprocess(tweets):
    """Tokenize raw tweets and turn each one into a token-count dictionary.

    Each token produced by the module-level ``tokenizer`` is handled as follows:
      * purely alphabetic (a-z, A-Z) and not a stop word: lemmatized and kept;
      * starts with "#": passed to ``handle_hashtag``; unless that returns -1,
        the result is lemmatized and kept with a leading "#";
      * listed in ``frequent_symbols`` or recognised as an emoji: kept as-is;
      * anything else (stop words, numbers, punctuation, ...): dropped.

    Relies on the module-level names ``tokenizer``, ``stop_words``,
    ``lemmatizer``, ``frequent_symbols``, ``get_dict`` and ``handle_hashtag``.
    NOTE(review): ``handle_hashtag`` is not defined in the visible notebook
    cells — confirm it is defined before this function is called.

    Args:
        tweets: iterable of raw tweet strings.

    Returns:
        A list with one dict per tweet mapping kept tokens to their counts.
    """
    new_tweets = []
    for tweet in tweets:
        new_tweet = []
        for word in tokenizer.tokenize(tweet):

            # regular words: not a stop word and made of ASCII letters only
            if word not in stop_words and not re.search("[^a-zA-Z]", word):
                new_tweet.append(lemmatizer.lemmatize(word))

            # hashtags
            elif word[0] == "#":
                hashtag = handle_hashtag(word)
                # -1 is handle_hashtag's signal that the hashtag should be dropped
                if hashtag != -1:
                    new_tweet.append("#" + lemmatizer.lemmatize(hashtag))

            # symbols / emojis
            elif word in frequent_symbols or _is_emoji(word):
                new_tweet.append(word)

        new_tweets.append(new_tweet)

    return get_dict(new_tweets)

In [350]:
# Example tweets used by the classification demos below.
sent = "it's a great tweet :-)"
sent2 = "this is another tweet"

In [351]:
# tokenizer will downcase everything except for emoticons
# preprocess() reads this module-level name, so it must be defined before
# preprocess() is first called.
tokenizer = TweetTokenizer(preserve_case=False)

### Option 1: Classify one tweet at a time

In [352]:
# Wrap the single tweet in a list: preprocess() iterates over a collection of
# tweets, so a bare string would be treated as one tweet per character.
tweet_dicts = preprocess([sent])

In [353]:
# Convert the token-count dicts into model input with the loaded vectorizer.
X = vec.transform(tweet_dicts)

In [354]:
# Predicted label for the single tweet.
logistic_clf.predict(X)  # 1 for positive, 0 for negative

array([1])

### Option 2: Classify multiple tweets at a time

In [355]:
# Batch of tweets to classify in one pass.
sents = [sent, sent2]

In [356]:
# One token-count dict per tweet, in the same order as `sents`.
tweet_dicts = preprocess(sents)

In [357]:
# Convert the token-count dicts into model input with the loaded vectorizer.
X = vec.transform(tweet_dicts)

In [358]:
# Predicted labels, one per tweet in `sents`.
logistic_clf.predict(X)  # 1 for positive, 0 for negative

array([1, 0])

In [359]:
logistic_clf.classes_   # class order; predict_proba columns follow this order

array([0, 1])

In [360]:
logistic_clf.predict_proba(X)    # probabilities: one instance per row, in each row: p(neg), p(pos)

array([[0.00900718, 0.99099282],
       [0.88340501, 0.11659499]])