This code uses positive and negative tweets from the NLTK package and accomadating package functions to tokenize, clean, and build a sentiment analysis model from sample tweets. Afterwards this model will be used to analyze samples of tweets that we pull by key word in real time (hopefully).

In [2]:
import nltk
#nltk.download()
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('stopwords')

from nltk.corpus import twitter_samples, stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import re, string, random
#from nltk import FreqDist, classify, NaiveBayesClassifier 

In [3]:

positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')

In [4]:
print(positive_tweets[0]) #just to show what is being stored in positive/negative/text

#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)


In [5]:
print(text[0])

RT @KirkKus: Indirect cost of the UK being in the EU is estimated to be costing Britain £170 billion per year! #BetterOffOut #UKIP


In [6]:
#Removes hyperlinks and other special symbols then groups elements of tokenized words together using lemmatization 
def cleanNoise(tokenized, stopWords = ()):
    cleaned = []
    for ele, mark in pos_tag(tokenized):
        ele = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', ele)
        ele = re.sub("(@[A-Za-z0-9_]+)","", ele)
        if mark.startswith("NN"):
            grp = 'n'
        elif mark.startswith('VB'):
            grp = 'v'
        else:
            grp = 'a'
            
         #lemmatizes  
        ele = WordNetLemmatizer().lemmatize(ele,grp)
        
        #Checks to make sure ele is not empty, not a repeat, and not punctuation
        if len(ele)>0:
            if ele not in string.punctuation:
                if ele.lower() not in cleaned:
                    cleaned.append(ele.lower())
    return cleaned
        

In [7]:
stopWords = stopwords.words('english')
test = twitter_samples.tokenized('positive_tweets.json')[0]
print(test)
print(cleanNoise(test,stopWords))

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']
['#followfriday', 'for', 'be', 'top', 'engage', 'member', 'in', 'my', 'community', 'this', 'week', ':)']


In [8]:
def modelIn(cleanInput):
    for x in cleanInput:
        yield dict([y, True] for y in x)

In [9]:
def modelMaker():
    posTweets = twitter_samples.strings('positive_tweets.json')
    negTweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    stopWords = stopwords.words('english')
    
    posTokens = twitter_samples.tokenized('positive_tweets.json')
    negTokens = twitter_samples.tokenized('negative_tweets.json')
    posCleanTokens = []
    negCleanTokens = []
    
    for x in posTokens:
        clean = cleanNoise(x,stopWords)
        posCleanTokens.append(clean)
    for x in negTokens:
        clean = cleanNoise(x,stopWords)
        negCleanTokens.append(x)
        
    #posTrue = {}
    #negTrue = {}
    #for x in posCleanTokens:
     #   posTrue.append([x, True])
    #for x in negCleanTokens:
     #   negTrue.append([x,True])
    posTrue = modelIn(posCleanTokens)
    negTrue = modelIn(negCleanTokens)
    
    posData = []
    negData = []
    
    for x in posTrue:
        posData.append((x, "Positive"))
    for x in negTrue:
        negData.append((x,"Negative"))
    
    posNegData = posData + negData
    
    #print(posNegData[:5])
    random.shuffle(posNegData)

    train_data = posNegData[:7000]
    test_data = posNegData[7000:]
    
    #right now the model uses Naive bayes but if we are going to do collapsed gibbs we will replace 
    #This part and we will also probably have to reformat the input
    classifier = nltk.NaiveBayesClassifier.train(train_data)
    return(classifier)
    

In [10]:
words = ['be','good','saw']
dict([x,True] for x in words)

{'be': True, 'good': True, 'saw': True}

In [11]:
model = modelMaker()

In [31]:
custom_tweet = "Ice is the best thing in the world"
custom_tokens = cleanNoise(word_tokenize(custom_tweet))
print(custom_tweet, model.classify(dict([token, True] for token in custom_tokens)))

Ice is the best thing in the world Positive


In [12]:
###Using twitter examples

In [1]:
import pandas as pd
import tweepy
from textblob import TextBlob

In [19]:
auth = tweepy.OAuthHandler()
auth.set_access_token()
api = tweepy.API(auth)

In [21]:
def get_tweets(hashtags, date, numtweets):

    tagged_tweets = tweepy.Cursor(api.search, q=hashtags, lang="en",since=date, tweet_mode="extended").items(numtweets)

    tweets_df = pd.DataFrame(columns=('Date and Time', 'id', 'source', 'Geo', 'Place', 'Retweets', 'tweet', 'Sentiment Polarity'))
    list_tweets = []
    i = 1
    for tweet in tagged_tweets:

        list_tweets.append(tweet.full_text)
        
        tweets_df.at[i, 'id'] = tweet.id
        tweets_df.at[i, 'source'] = tweet.source
        tweets_df.at[i, 'tweet'] = tweet.full_text
        tweets_df.at[i, 'Geo'] = tweet.geo
        tweets_df.at[i, 'Place'] = tweet.place
        tweets_df.at[i, 'Favorites'] = tweet.favorite_count
        tweets_df.at[i, 'Retweets'] = tweet.retweet_count
        tweets_df.at[i, 'Date and Time'] = tweet.created_at
        analysis = TextBlob(tweet.full_text)
        tweets_df.at[i, 'Sentiment Polarity'] = analysis.sentiment.polarity
        i = i + 1

    tweets_df.to_csv(hashtags+'tweets.csv', encoding='utf-8')
    #print(tweets_df.head())
    return(list_tweets)


In [22]:
tweets = get_tweets('bostoncollege', '2020-06-01', 100)

In [23]:
pos = 0
neg = 0
for x in tweets:
    custom_tokens = cleanNoise(word_tokenize(x))
    res = model.classify(dict([token, True] for token in custom_tokens))
    if res == 'Positive':
        pos+=1
    else:
        neg +=1
print(pos,neg)

91 9
