In [7]:
def clean(doc):
    # input: string
    # output: string
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import stopwords 
    import string
    lemma = WordNetLemmatizer()
    stop = set(stopwords.words('english'))
    exclude = set(string.punctuation) 
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude or ch == "!")
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split()).split()
    link_free = [word for word in normalized if 'http' not in word]
    cleaned = " ".join(word for word in link_free)
    
    return cleaned

In [8]:
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

doc1 = "Sugar is bad to consume. My sister likes to have sugar, but not my father."
doc2 = "My father spends a lot of time driving my sister around to dance practice."
doc3 = "Doctors suggest that driving may cause increased stress and blood pressure."
doc4 = "Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better."
doc5 = "Health experts say that Sugar is not good for your lifestyle."

# compile documents
doc_complete = [doc1, doc2, doc3, doc4, doc5]
#doc_complete = [doc2]



doc_clean = [clean(doc).split() for doc in doc_complete]
#print(doc_clean)

import gensim
from gensim import corpora
import re

# Creating the term dictionary of our courpus, where every unique term is assigned an index. 

dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

Lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50)

#topics = ldamodel.print_topics(num_topics=3, num_words=3)
topics = ldamodel.show_topics(num_topics=3, num_words=3)
print(topics[0][1])
percents = topics[0][1].split()
percents = [i for i in percents if i != "+"]
percents = [float(i[:5]) for i in percents]
topics = re.findall('"([^"]*)"', topics[0][1])
to_return = zip(topics, percents)
to_return = list(to_return)

print(topics, percents)
print(to_return)




0.076*"sugar" + 0.075*"good" + 0.075*"health"
['sugar', 'good', 'health'] [0.076, 0.075, 0.075]
[('sugar', 0.076), ('good', 0.075), ('health', 0.075)]


In [9]:
# train topic classifier

def trainLDA(tweets):
    # input: list of strings
    # output: lda model trained on strings
    import gensim
    from gensim import corpora
    # input: multiple strings
    # output: trained ldamodel
    tweets_clean = [clean(tweet).split() for tweet in tweets]
    dictionary = corpora.Dictionary(tweets_clean)
    tweet_term_matrix = [dictionary.doc2bow(tweet) for tweet in tweets_clean]
    Lda = gensim.models.ldamodel.LdaModel
    ldamodel = Lda(tweet_term_matrix, num_topics=3, id2word = dictionary, passes=50)
    return ldamodel

In [10]:
# get topics of specific tweet

def topics(tweets, ldamodel, num_words):
    # input:
    #     tweets: strings
    #     ldamodel: Lda model from trainLDA
    #     topics: integer number of topics to return per tweet
    #     num_words: integer number of words per topic
    # output:
    #     list of lists, each list within the list contains a tuple (topic, percentage)
    #     each list within the greater list is for each tweet
    import re
    
    to_return = []
    for tweet in tweets:
        topics_w_percents = ldamodel.print_topics(num_topics=1, num_words=num_words)

        percents = topics_w_percents[0][1].split()
        #print(percents)

        for_topics = [i for i in percents if i != "+"]
        
        percents = [float(i[:5]) for i in for_topics]
        for_topics = " ".join(word for word in for_topics)
        topics = re.findall('"([^"]*)"', for_topics)
        topics_percents_tuples = zip(topics, percents)
        topics_percents_tuples = list(topics_percents_tuples)
        to_return.append(topics_percents_tuples)
    
    return to_return

In [13]:
import nltk
import string as st
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
# getting sentiment of sentence
def sentiment(tweets):
    # input: list of strings
    # output: list of ints
    sentiments = []
    for tweet in tweets:
        tweet = clean(tweet)
        score = sid.polarity_scores(tweet)
        sentiments.append(score['compound'])
    return sentiments

In [14]:
tweet1 = "I love you"
tweet2 = "My day is always a little #better with some #wine"
tweet3 = "you're the worst"
tweet4 = 'wooow Happy Thanksgiving  You really are unlucky! #Fail https://t.co/lUMFCOtksX'
tweets = [tweet1,tweet2,tweet3,tweet4]
print(sentiment(tweets))
ldamodel = trainLDA(tweets)
print(topics(tweets, ldamodel, 3))




[0.6369, 0.3832, -0.6249, -0.0952]
[[('really', 0.118), ('unlucky!', 0.118), ('fail', 0.118)], [('worst', 0.249), ('love', 0.064), ('thanksgiving', 0.063)], [('really', 0.118), ('unlucky!', 0.118), ('fail', 0.118)], [('really', 0.118), ('unlucky!', 0.118), ('fail', 0.118)]]
