## Markov Chain based on Donald Trump's tweets

Objective was to create text generator based on Markov Chain with pure Python (without ready solutions).
In example I used json file with downloaded tweets, that I created with Twitter API.


In [73]:
import random
import json
import itertools

#### Load data and delete all unnecessary words

In [74]:
data = json.load(open('DonaldTrumpTweets.json',  encoding='utf-8'))
words = []
for tweet in data:
    words.append(tweet['text'])
words = list(itertools.chain(words))
words = ''.join(words).split()
words = [word for word in words if 'http' not in word 
                                 and '@' not in word
                                 and '#' not in word
                                 and ';' not in word]

#### As a result I've got list of words from Trump's tweets

In [75]:
words[:10]

['HAPPY',
 'THANKSGIVING!',
 'be',
 'doing',
 'a',
 'live',
 'Thanksgiving',
 'Video',
 'Teleconference',
 'with']

#### Let's create zero-order Markov Chain, meaning that all words chosen randomly

In [43]:
def generate_markov_text(words, size):
    seed = random.randint(0, len(words)-3)
    seed_word, next_word = words[seed], words[seed+1]
    w1, w2 = seed_word, next_word
    gen_words = []
    for i in range(size):
        gen_words.append(w1)
        w1, w2 = w2, random.choice(words)
        gen_words.append(w2)
    return ' '.join(gen_words)

#### Test results

In [77]:
generate_markov_text(words, 10)

'LEAKS on going in on America, in and America, HIRE and in HIRE AMERICA! in time AMERICA! has time come'

In [81]:
generate_markov_text(words, 10)

'Hillary as Clinton, Secretary as Mattis Secretary warns Mattis Kim warns Jung Kim Un Jung “he Un is “he grossly…RT'

In [93]:
generate_markov_text(words, 10)

'Party!Budget just that passed just is passed a is great a healthcare great and healthcare massive and tax massive cuts'

#### Let's do something better - 3rd order Markov Chain, meaning that we will group all words by three

In [94]:
def make_triples(words):
    output = []
    for i in range(len(words)):
        try:
            output.append((words[i], words[i+1], words[i+2]))
        except IndexError:
            break
    return output

#### All words grouped by three

In [95]:
make_triples(words)[:10]

[('HAPPY', 'THANKSGIVING!', 'be'),
 ('THANKSGIVING!', 'be', 'doing'),
 ('be', 'doing', 'a'),
 ('doing', 'a', 'live'),
 ('a', 'live', 'Thanksgiving'),
 ('live', 'Thanksgiving', 'Video'),
 ('Thanksgiving', 'Video', 'Teleconference'),
 ('Video', 'Teleconference', 'with'),
 ('Teleconference', 'with', 'Members'),
 ('with', 'Members', 'of')]

#### Generate dictionary, where for every pair of words will be list word that can be followed next

In [98]:
def words_grouping(words):
    dict_of_words = {}
    for w1, w2, w3 in make_triples(words):
        key = (w1, w2)
        if key in dict_of_words:
            dict_of_words[key].append(w3)
        else:
            dict_of_words[key] = [w3]
    return dict_of_words

In [97]:
words_grouping(words)

{('HAPPY', 'THANKSGIVING!'): ['be'],
 ('THANKSGIVING!', 'be'): ['doing'],
 ('be', 'doing'): ['a', 'a'],
 ('doing', 'a'): ['live',
  'joint',
  'FANTASTIC',
  'great',
  'GREAT',
  'GREAT',
  'fantastic',
  'GREAT',
  'GREAT',
  'great',
  'great',
  'fantastic',
  'great',
  'great',
  'Special',
  'great'],
 ('a', 'live'): ['Thanksgiving'],
 ('live', 'Thanksgiving'): ['Video'],
 ('Thanksgiving', 'Video'): ['Teleconference'],
 ('Video', 'Teleconference'): ['with'],
 ('Teleconference', 'with'): ['Members'],
 ('with', 'Members'): ['of'],
 ('Members', 'of'): ['the', 'Congress'],
 ('of', 'the'): ['Military',
  'National',
  'NFL’s',
  'year!',
  'United',
  'most',
  'Rep.',
  'rest',
  'Vietnam',
  'entire',
  'United',
  'leaders',
  'nations',
  'Indo-Pacific,',
  'U.S.',
  '”DEPLORABLES”',
  'U.S.',
  'biggest',
  'complete',
  'Dem',
  'dishonesty',
  'move',
  'horrible',
  'Fed.',
  'New',
  'Trump',
  'Fed',
  'will',
  'Senate',
  'Foreign',
  'race',
  'Senate',
  'process,',
  '

#### Let's modify text genarator

In [65]:
def generate_markov_text(words, size):
    seed = random.randint(0, len(words)-3)
    seed_word, next_word = words[seed], words[seed+1]
    w1, w2 = seed_word, next_word
    gen_words = []
    for i in range(size):
        gen_words.append(w1)
        w1, w2 = w2, random.choice(words_grouping(words)[(w1,w2)])
        gen_words.append(w2)
    return ' '.join(gen_words)
    

#### Checking results

In [111]:
generate_markov_text(words, 10)

'GREAT in job Hamburg. in Everybody Hamburg. felt Everybody totally felt safe totally despite safe the despite fact the that'

In [113]:
generate_markov_text(words, 10)

'Trump night Russia.Great in night Iowa in - Iowa special - people. special Thank people. you!Just Thank spoke you!Just to'

In [128]:
generate_markov_text(words, 10)

'ISIS responsibility claims for responsibility hostage for siege hostage in siege Melbourne, in Florida. Melbourne, See Florida. you See soon!'