We'll start by using the [markovify](https://github.com/jsvine/markovify/) library to make some social-media-sized utterances in the style of Jane Austen.  This will be the basis for generating a synthetic social media stream.

In [None]:
import markovify
import codecs

# Read the whole Austen corpus into memory; the file is decoded as cp1252.
with codecs.open("austen.txt", "r", "cp1252") as f:
    text = f.read()

# Train a Markov model on the corpus with state_size=3.
austen_model = markovify.Text(text, retain_original=False, state_size=3)

# Print ten sample sentences, passing 200 as the length limit for each.
for i in range(10):
    print(austen_model.make_short_sentence(200))

We'll use spaCy to identify entities (mostly proper nouns and noun phrases) in these synthetic status updates and turn them into hashtags:

In [None]:
import spacy
# Load spaCy's English pipeline. The bare 'en' name is a shortcut link that
# only exists on older spaCy installs; newer releases dropped shortcut links
# and raise OSError for it, so fall back to the packaged model name.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')

def make_sentence(model, length=200):
    """Ask ``model`` for one short sentence.

    ``length`` is forwarded unchanged to ``model.make_short_sentence`` and
    whatever that call returns is handed back to the caller.
    """
    sentence = model.make_short_sentence(length)
    return sentence
    
def hashtagify_full(sentence):
    """Turn every entity spaCy finds in ``sentence`` into a hashtag.

    Each entity span is replaced by ``#`` followed by the entity text with
    its spaces removed. The sentence is rebuilt from the entities' character
    offsets rather than with repeated ``str.replace`` calls, so an entity
    that occurs more than once (or whose text is contained in another
    entity) is tagged exactly once per occurrence instead of picking up
    extra ``#`` characters.

    Args:
        sentence: the text to process with the module-level ``nlp`` pipeline.

    Returns:
        A ``(tagged_sentence, hashtags)`` tuple, where ``hashtags`` lists one
        hashtag per detected entity, in document order (duplicates kept).
    """
    doc = nlp(sentence)
    pieces = []
    tags = []
    cursor = 0
    # doc.ents is expected to be in document order and non-overlapping, so a
    # single left-to-right pass over the character offsets is enough.
    for ent in doc.ents:
        tag = "#%s" % ent.text.replace(" ", "")
        tags.append(tag)
        # Copy the untouched text before the entity, then the hashtag itself.
        pieces.append(sentence[cursor:ent.start_char])
        pieces.append(tag)
        cursor = ent.end_char
    # Whatever follows the last entity is kept as-is.
    pieces.append(sentence[cursor:])
    return ("".join(pieces), tags)

def hashtagify(sentence):
    """Return only the hashtagged text for ``sentence``.

    Thin convenience wrapper around ``hashtagify_full`` that drops the list
    of hashtags and keeps the rewritten sentence.
    """
    tagged_and_tags = hashtagify_full(sentence)
    return tagged_and_tags[0]

# Show five Austen-style sentences, each followed by its hashtagged version.
for i in range(5):
    sentence = make_sentence(austen_model)
    print(sentence)
    print(hashtagify(sentence))

We'll now train two Markov models on positive and negative product reviews (taken from the [public-domain Amazon fine foods reviews dataset on Kaggle](https://www.kaggle.com/snap/amazon-fine-food-reviews/)).  We'll incorporate the results of these models into our synthetic social media stream.

In [None]:
import gzip

def train_markov_gz(fn):
    """Build a Markov text model from a gzip-compressed text file.

    The file at ``fn`` is opened in text mode, decoded as UTF-8 and read in
    one go; the resulting string is used to train a ``markovify.Text`` model
    with ``state_size=3`` and ``retain_original=False``.
    """
    with gzip.open(fn, mode="rt", encoding="utf-8") as compressed:
        corpus = compressed.read()
    model = markovify.Text(corpus, retain_original=False, state_size=3)
    return model

# One model per sentiment, each trained from its own gzipped review file
# (presumably 1-star and 5-star reviews, judging by the file names -- the
# file contents are not visible here).
negative_model = train_markov_gz("reviews-1.txt.gz")
positive_model = train_markov_gz("reviews-5-100k.txt.gz")

In [None]:
# Sample one sentence from the negative-review model (shown as the cell output).
make_sentence(negative_model)

In [None]:
# Sample one sentence from the positive-review model (shown as the cell output).
make_sentence(positive_model)

We can combine these models with relative weights, but this yields somewhat unusual results:

In [None]:
# Blend the three models into one; the relative weights 14:3:3 favour the Austen model.
compound_model = markovify.combine([austen_model, negative_model, positive_model], [14, 3, 3])

In [None]:
# Print twenty hashtagged sentences drawn from the blended model.
for i in range(20):
    print(hashtagify(make_sentence(compound_model)))

As is more or less the case in the real world, we'll assume that a small percentage of users are responsible for the bulk of social media activity, while most users post relatively rarely.  We'll model this with a table of random user IDs split into relatively few talkative users and relatively many moderate users; the share of utterances that comes from talkative users equals the share of users who are moderate, and vice versa (with the default weights, roughly 20% of the users produce roughly 80% of the posts).

In [None]:
import numpy
import math
import collections

class UserTable(object):
    """A pool of random user IDs split into talkative and moderate users.

    ``weights[0] / sum(weights)`` (0.8 by default) is used twice: it is the
    fraction of users placed in the moderate pool, and it is the fraction of
    ``random_uid`` calls answered from the talkative pool. With the default
    weights roughly 20% of the users therefore account for roughly 80% of the
    IDs handed out.

    Args:
        size: number of user IDs to generate.
        weights: two relative weights controlling the split described above.
    """

    def __init__(self, size, weights=(8, 2)):
        self._talkative = collections.deque()
        self._moderate = collections.deque()
        self._size = size
        self._cutoff = float(weights[0]) / sum(weights)

        for _ in range(size):
            # Give both bounds explicitly: a single positional argument to
            # numpy.random.uniform is taken as ``low`` (with ``high`` left
            # at 1.0), not as the upper bound.
            new_uid = int(numpy.random.uniform(0, 10 ** 10))
            if numpy.random.uniform() < self._cutoff:
                self._moderate.append(new_uid)
            else:
                self._talkative.append(new_uid)

    def random_uid(self):
        """Return a random user ID, favouring the talkative pool.

        Raises:
            IndexError: if the table holds no user IDs at all.
        """
        if numpy.random.uniform() < self._cutoff:
            preferred, fallback = self._talkative, self._moderate
        else:
            preferred, fallback = self._moderate, self._talkative

        # A small table can leave one pool empty; use the other pool rather
        # than failing on an index into an empty deque.
        pool = preferred or fallback
        if not pool:
            raise IndexError("cannot draw a user ID from an empty UserTable")
        return pool[int(numpy.random.uniform() * len(pool))]
        

In [None]:
import seaborn

We can see the number of times each user ID appears if we ask the `UserTable` for 1000 random user IDs:

In [None]:
# Draw 1000 user IDs from a 100-user table and plot how often each appears.
ut = UserTable(100)
uids = [ut.random_uid() for i in range(1000)]
# Pass the values by keyword: newer seaborn releases no longer treat the
# first positional argument of countplot as the x variable.
seaborn.countplot(x=uids)

In [None]:
def generate_tweets(models, weights=None, hashtag_weights=(8, 2), ut=None, seed_hashtags=()):
    """Yield an endless stream of synthetic ``(user_id, tweet)`` pairs.

    Each tweet is a sentence from one randomly chosen model, with its
    entities turned into hashtags by ``hashtagify_full`` and a
    Poisson-distributed number of previously seen hashtags appended.

    Args:
        models: sequence of models accepted by ``make_sentence``.
        weights: relative probability of picking each model; equal weights
            are used when omitted.
        hashtag_weights: relative share of newly seen hashtags stored in
            each popularity bucket. Buckets are sampled with these weights
            reversed, so with the default ``(8, 2)`` about 80% of hashtags
            land in bucket 0 while about 80% of the appended hashtags are
            drawn from bucket 1.
        ut: ``UserTable`` supplying user IDs; a 10000-user table is created
            when omitted.
        seed_hashtags: hashtags stored in the buckets before any tweet is
            generated.

    Yields:
        ``(user_id, tweet)`` tuples, without end.
    """
    if weights is None:
        weights = [1] * len(models)

    if ut is None:
        ut = UserTable(10000)

    def cumulative(ws):
        """Return the running totals of ``ws`` scaled to end at 1.0."""
        total = float(sum(ws))
        running = 0
        thresholds = []
        for w in ws:
            running += w
            thresholds.append(running / total)
        return thresholds

    def choose_from(c):
        return c[int(numpy.random.uniform() * len(c))]

    choices = list(zip(cumulative(weights), models))

    def choose_model():
        r = numpy.random.uniform()
        for p, m in choices:
            if r <= p:
                return m
        return choices[-1][1]

    seen_hashtags = set()
    buckets = [collections.deque() for _ in hashtag_weights]
    store_thresholds = cumulative(hashtag_weights)
    # Sampling uses the reversed weights so that the sparsely filled bucket
    # is the one drawn from most often.
    pick_thresholds = cumulative(list(reversed(hashtag_weights)))

    def store_hashtag(tag):
        if tag in seen_hashtags:
            return
        seen_hashtags.add(tag)
        r = numpy.random.uniform()
        # Each tag goes into exactly one bucket: the first whose cumulative
        # threshold covers r.
        for p, bucket in zip(store_thresholds, buckets):
            if r <= p:
                bucket.append(tag)
                return
        buckets[-1].append(tag)

    def choose_hashtag():
        r = numpy.random.uniform()
        chosen = buckets[-1]
        for p, bucket in zip(pick_thresholds, buckets):
            if r <= p:
                chosen = bucket
                break
        if not chosen:
            # The selected bucket may still be empty early on; any stored
            # tag is better than failing. Callers only get here once at
            # least one hashtag has been stored.
            chosen = next(b for b in buckets if b)
        return choose_from(chosen)

    for tag in seed_hashtags:
        store_hashtag(str(tag))

    while True:
        sentence = make_sentence(choose_model())
        if sentence is None:
            # The model could not produce a sentence this time (markovify
            # signals that with None); try again with a fresh model choice.
            continue

        tweet, tags = hashtagify_full(sentence)
        for tag in tags:
            store_hashtag(str(tag))

        this_tweet_tags = {str(t) for t in tags}

        if seen_hashtags:
            extra = min(numpy.random.poisson(3), len(seen_hashtags))
            for _ in range(extra):
                tag = choose_hashtag()
                # Never repeat a hashtag within the same tweet.
                if tag not in this_tweet_tags:
                    this_tweet_tags.add(tag)
                    tweet += " %s" % tag

        yield (ut.random_uid(), tweet)

In [None]:
# Hashtags stored up front so that early tweets already have tags to append.
seed_hashtags=["#ff", "#marketing", "#fail", "#followfriday", "#yolo", "#retweet", "#tbt", "#socialmedia", "#startup", "#blogpost", "#news", "#health"]

# Tweet stream mixing the four models with relative weights 22:4:4:2.
t = generate_tweets([austen_model, positive_model, negative_model, compound_model], [22, 4, 4, 2], seed_hashtags=seed_hashtags)

In [None]:
# Evaluating the bare name only displays the generator object; no tweets are produced yet.
t

In [None]:
# Pull the next 100 (user_id, tweet) pairs from the stream.
[next(t) for i in range(100)]

In [None]:
import cProfile

def timing(c):
    """Advance the module-level tweet generator ``t`` by ``c`` items.

    The generated tweets are discarded; the function exists only so the
    generation cost can be profiled.
    """
    for _step in range(c):
        next(t)

# Profile the generation of 2000 tweets and save the raw statistics to the file 'generatestats'.
cProfile.run('timing(2000)', 'generatestats')

In [None]:
import pstats
# Load the saved profile from 'generatestats'.
p = pstats.Stats('generatestats')
# Strip directory prefixes from file names, sort, and print the full report
# (-1 is a legacy numeric sort key; per the pstats docs it means 'stdname').
p.strip_dirs().sort_stats(-1).print_stats()