In [1]:
import os
import json
from pprint import pprint
import random
import numpy as np
from matplotlib import pyplot as plt

DATA_PATH = r"data\nyt_crosswords-cleaned"

In [2]:
puzzles_available = []

for year in os.listdir(DATA_PATH):
    if not year.isdecimal(): continue       # skip README.md
    for month in os.listdir(os.path.join(DATA_PATH, year)):
        for day in os.listdir(os.path.join(DATA_PATH, year, month)):
            puzzles_available.append((int(year), int(month), int(day.split(".")[0])))

print("Puzzles Available:", len(puzzles_available))

Puzzles Available: 14545


In [3]:
# Read a random sample of puzzles

# sample = random.sample(puzzles_available, 10_000)

# sample puzzles from EVEN days
sample = [puzz for puzz in puzzles_available if puzz[-1]%2==0]

clue_answer_pairs = []

for year, month, day in sample:
    path = os.path.join(DATA_PATH, f"{year}", f"{month:02d}", f"{day:02d}.json")
    with open(path, encoding="utf-8") as f:
        puzzle = json.load(f)
        for c, a in zip(
            puzzle["clues"]["across"] + puzzle["clues"]["down"],
            puzzle["answers"]["across"] + puzzle["answers"]["down"]
        ):
            clue = c[c.index(".")+2:]
            clue_answer_pairs.append((clue, a))

clues, answers = zip(*clue_answer_pairs)
clue_answer_pairs

[('Capital of Ghana', 'ACCRA'),
 ('Shows wear, as a chair cushion', 'SAGS'),
 ('Progeny of an old block', 'CHIP'),
 ('Yugoslav native', 'CROAT'),
 ("Guy Fawkes's forte", 'PLOT'),
 ('Window part', 'PANE'),
 ('Well-known drudge', 'CINDERELLA'),
 ('Status quo ___ bellum', 'ANTE'),
 ('Clocked', 'TIMED'),
 ('Bedroom piece', 'DRESSER'),
 ('Bull-rider of myth', 'EUROPA'),
 ('Thus, to Caesar', 'ITA'),
 ('Prior to', 'ERE'),
 ('Roman 1501', 'MDI'),
 ('Harvest goddess', 'OPS'),
 ('Haul', 'LUG'),
 ('Hallucinogen', 'LSD'),
 ('Word with after or new', 'MATH'),
 ('Condescended', 'STOOPED'),
 ('Alaskan native', 'ALEUT'),
 ('Common-Market initials', 'EEC'),
 ('Kind of transit', 'RAPID'),
 ('Custom-made in London', 'BESPOKE'),
 ('___ chance (no way)', 'NOTA'),
 ('Mil. man', 'SGT'),
 ('Turntable abbr.', 'RPM'),
 ('Holy day: Abbr.', 'SAB'),
 ('Handed or headed', 'RED'),
 ('Kind of text or cursor', 'PRE'),
 ("Bill's partner", 'COO'),
 ('Stomach: Prefix', 'GASTRO'),
 ('Composer Victor', 'HERBERT'),
 ('"Thar

In [15]:
# Split data into train and test
from sklearn.model_selection import train_test_split

clues_train, clues_test, answers_train, answers_test = train_test_split(
    clues, answers,
    test_size=0.001, shuffle=True
)

len(clues_train), len(clues_test)

(602694, 604)

In [16]:
from transformers import pipeline
# unmasker = pipeline('fill-mask', model='bert-base-uncased')

Downloading: 100%|██████████| 570/570 [00:00<00:00, 569kB/s]
Downloading: 100%|██████████| 420M/420M [00:14<00:00, 29.4MB/s] 
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 98.2kB/s]
Downloading: 100%|██████████| 226k/226k [00:00<00:00, 4.75MB/s]
Downloading: 100%|██████████| 455k/455k [00:00<00:00, 6.19MB/s]


In [32]:
# def mask(tuple):
#     clue, answer = tuple
#     if '___' in clue:
#         return clue.replace('___', '[MASK]')



In [28]:
unmasker('[MASK] Gay')

[{'sequence': 'and gay',
  'score': 0.015759432688355446,
  'token': 1998,
  'token_str': 'and'},
 {'sequence': 'gay gay',
  'score': 0.01522444374859333,
  'token': 5637,
  'token_str': 'gay'},
 {'sequence': '- gay',
  'score': 0.013067396357655525,
  'token': 1011,
  'token_str': '-'},
 {'sequence': 'michael gay',
  'score': 0.01116530504077673,
  'token': 2745,
  'token_str': 'michael'},
 {'sequence': 'not gay',
  'score': 0.009996343404054642,
  'token': 2025,
  'token_str': 'not'}]

In [37]:
training_set = zip(clues_train, answers_train)
training_set = list(training_set)

training_set

[('___ Gay', 'ENOLA'),
 ('"So there!"', 'AHA'),
 ("Tad's dad", 'ABE'),
 ('Jet boat brand', 'SEADOO'),
 ('Kind of watch or sign', 'STOP'),
 ('On the calm side', 'ALEE'),
 ('Evel doings?', 'STUNTS'),
 ('Precisely', 'TOAT'),
 ('Two hrs. earlier than N.Y.', 'MST'),
 ('Reveal, poetically', 'OPE'),
 ("Marilyn Monroe's real first name", 'NORMA'),
 ('Stimulate', 'GOOSE'),
 ('Obsequious', 'SERVILE'),
 ('Apocalyptic quartet', 'THEFOURHORSEMEN'),
 ('It may be organized', 'CRIME'),
 ('Conception', 'IDEA'),
 ('Those girls, to Juanita', 'ESAS'),
 ('Cuts (down)', 'MOWS'),
 ('Goof off', 'LOAF'),
 ('Bird in the spring', 'NESTER'),
 ('528i or Z3, e.g.', 'BMW'),
 ('Ethereal', 'AERY'),
 ('Autostrada sights', 'FIATS'),
 ('He loves: Lat.', 'AMAT'),
 ('Painter Guido ___', 'RENI'),
 ('Central spot', 'MIDST'),
 ('Noted violinist', 'ELMAN'),
 ('Op-ed piece', 'ESSAY'),
 ('Cereal grain', 'OAT'),
 ('Torrid', 'EQUATORIAL'),
 ('Wipe out', 'ERASE'),
 ('Malcontents', 'BADAPPLES'),
 ('Par allows two per green', 'PUTTS'

In [51]:
training_set = zip(clues_train, answers_train)
training_set = list(training_set)


def combine(tuple):
    clue, answer = tuple
    return clue + ' | ' + answer


# masked_training_set = map(mask, training_set)
combined_training_set = list(map(combine, training_set))
combined_training_set[1000:1010]

['Tare eradicator | WEEDER',
 'Black: Prefix | MELAN',
 'Wrangle | HERD',
 'Having the most pizazz | ZIPPIEST',
 'Absorb, as a loss | EAT',
 'Classic film duo | BOGARTANDBACALL',
 '"___ Said" (Neil Diamond hit) | IAMI',
 'Composer Copland | AARON',
 'What one of the five Olympic rings stands for | ASIA']

In [59]:
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

# text = "Replace me by any text you'd like."
# encoded_input = tokenizer(text, return_tensors='pt')
# output = model(**encoded_input)

# Tokenized input
# text = "Who was Jim Henson ? Jim Henson was a puppeteer"
# tokenized_text = tokenizer.tokenize(text)

# tokenized_training_set = list(map(tokenizer.tokenize, training_set))
# tokenized_training_set
tokenizer.tokenize(training_set[0:10])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


TypeError: expected string or bytes-like object

In [64]:
a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize('james and | jill'))
a

[2508, 1998, 1064, 10454]

In [61]:
def mask(tuple):
    clue, answer = tuple
    return clue + ' [MASK]'

tokenized_training_set = list(map(tokenizer.tokenize, combined_training_set[0:10]))

masked_training_set = list(map(mask, training_set[0:10]))

tokenized_training_set

[['_', '_', '_', 'gay', '|', 'en', '##ola'],
 ['"', 'so', 'there', '!', '"', '|', 'ah', '##a'],
 ['tad', "'", 's', 'dad', '|', 'abe'],
 ['jet', 'boat', 'brand', '|', 'sea', '##do', '##o'],
 ['kind', 'of', 'watch', 'or', 'sign', '|', 'stop'],
 ['on', 'the', 'calm', 'side', '|', 'ale', '##e'],
 ['eve', '##l', 'doing', '##s', '?', '|', 'stunts'],
 ['precisely', '|', 'to', '##at'],
 ['two',
  'hr',
  '##s',
  '.',
  'earlier',
  'than',
  'n',
  '.',
  'y',
  '.',
  '|',
  'ms',
  '##t'],
 ['reveal', ',', 'poetic', '##ally', '|', 'op', '##e']]

In [81]:
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

# text = "Replace me by any text you'd like."
# encoded_input = tokenizer(text, return_tensors='pt')
# output = model(**encoded_input)

# Tokenized input
# text = "Who was Jim Henson ? Jim Henson was a puppeteer"
# tokenized_text = tokenizer.tokenize(text)

# tokenized_training_set = list(map(tokenizer.tokenize, training_set))

########################
def mask(tuple):
    clue, answer = tuple
    return clue + ' [MASK]'

tokenized_training_set = list(map(tokenizer.tokenize, combined_training_set[0:10]))

masked_training_set = list(map(mask, training_set[0:10]))

masked_training_set
############################################


# Mask a token that we will try to predict back with `BertForMaskedLM`
# masked_index = 6
# tokenized_text[masked_index] = '[MASK]'
# assert tokenized_text == ['who', 'was', 'jim', 'henson', '?', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer']

# Convert token to vocabulary indices
# indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

indexed_tokens_set = list(map(tokenizer.convert_tokens_to_ids, tokenized_training_set))

# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
# segments_ids = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]

# def segment(tuple):
#     clue, answer = tuple
#     [0]* + [1]*

# segments_ids_set = [0]* + [1]*

# # Convert inputs to PyTorch tensors
# tokens_tensor = torch.tensor([indexed_tokens])
# segments_tensors = torch.tensor([segments_ids])

indexed_tokens_set

def segment(array):
    index = array.index(1064)
    return [0]*(index) + [1]*(len(array)-index-1)

segments_ids_set = list(map(segment, indexed_tokens_set))

def remove_bar(array):
    array.remove(1064)
    return array

indexed_tokens_set_no_bar = list(map(remove_bar, indexed_tokens_set))

indexed_tokens_set_no_bar


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[[1035, 1035, 1035, 5637, 4372, 6030],
 [1000, 2061, 2045, 999, 1000, 6289, 2050],
 [18819, 1005, 1055, 3611, 14863],
 [6892, 4049, 4435, 2712, 3527, 2080],
 [2785, 1997, 3422, 2030, 3696, 2644],
 [2006, 1996, 5475, 2217, 15669, 2063],
 [6574, 2140, 2725, 2015, 1029, 28465],
 [10785, 2000, 4017],
 [2048, 17850, 2015, 1012, 3041, 2084, 1050, 1012, 1061, 1012, 5796, 2102],
 [7487, 1010, 13805, 3973, 6728, 2063]]

ValueError: expected sequence of length 5 at dim 2 (got 6)

In [138]:
# # Load pre-trained model tokenizer (vocabulary)
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# # Tokenized input
# text = "Who was Jim Henson ? Jim Henson was a puppeteer"
# tokenized_text = tokenizer.tokenize(text)

# # Mask a token that we will try to predict back with `BertForMaskedLM`
# masked_index = 6
# tokenized_text[masked_index] = '[MASK]'
# assert tokenized_text == ['who', 'was', 'jim', 'henson', '?', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer']

# # Convert token to vocabulary indices
# indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# # Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
# segments_ids = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]

# # Convert inputs to PyTorch tensors
# tokens_tensor = torch.tensor([indexed_tokens])
# segments_tensors = torch.tensor([segments_ids])



# ###########################
# # Load pre-trained model (weights)
# model = BertModel.from_pretrained('bert-base-uncased')
# model.eval()

# # Predict hidden states features for each layer
# encoded_layers, _ = model(tokens_tensor, segments_tensors)
# # We have a hidden states for each of the 12 layers in model bert-base-uncased
# assert len(encoded_layers) == 12

#################
# Load pre-trained model (weights)

from transformers import BertTokenizer, BertModel

model = BertModel.from_pretrained("bert-base-uncased")
# model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.eval()

# # Predict all tokens
# predictions = model(tokens_tensor, segments_tensors)

predicted_tokens = []
for i in range(len(indexed_tokens_set_no_bar)):
    tokens_tensor = torch.tensor([indexed_tokens_set_no_bar[i]])
    segments_tensors = torch.tensor([segments_ids_set[i]])
    predictions = model(tokens_tensor, segments_tensors)

    predicted_token = ''
    for masked_index, seg in enumerate(segments_ids_set[i]):
        if seg == 1:
            # print('predictions[0, masked_index]: ', predictions[0, masked_index])
            predicted_index = torch.argmax(predictions[1][0][masked_index]).item()
            predicted_token += tokenizer.convert_ids_to_tokens([predicted_index])[0]
            predicted_tokens.append(predicted_token)

    
    
predicted_tokens

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


['[PAD]',
 '[PAD][PAD]',
 '[PAD]',
 '[PAD][PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD][PAD]',
 '[PAD][PAD][PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD][PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD][PAD]',
 '[PAD]',
 '[PAD][PAD]',
 '[PAD]',
 '[PAD][PAD]']

In [134]:
predictions[0][0][0][masked_index]

(predictions[1][0])
# predictions[0][0][0]

tensor([-0.8997, -0.1846,  0.9085,  0.7744, -0.2565, -0.4150,  0.7723,  0.1466,
         0.7607, -0.9805,  0.8177, -0.2590,  0.9814, -0.6917,  0.9786, -0.3791,
        -0.1542, -0.0109,  0.3536, -0.9248,  0.8939,  0.4820,  0.7617,  0.0781,
         0.2364,  0.1479, -0.3248,  0.9689,  0.9634,  0.7483, -0.8455,  0.3740,
        -0.9888, -0.2416,  0.4720, -0.9288,  0.2848, -0.7297, -0.2482, -0.2317,
        -0.9221,  0.2619,  0.9061, -0.7493, -0.0290, -0.1715, -0.9961,  0.0325,
        -0.9087, -0.9480, -0.8676, -0.7280,  0.0898,  0.2811,  0.0951,  0.0253,
        -0.0892,  0.0771,  0.1501, -0.3453, -0.3214,  0.3783,  0.6601, -0.8300,
        -0.9023, -0.9391, -0.0726, -0.0365, -0.3162, -0.0547,  0.8513,  0.3637,
         0.8312, -0.9193, -0.8806,  0.1581, -0.4146,  0.9986, -0.5425, -0.9855,
        -0.0817, -0.8352,  0.4440,  0.9257, -0.7482, -0.9976, -0.0940,  0.0430,
        -0.9918,  0.1587,  0.5500, -0.0397, -0.6453,  0.5038,  0.4272, -0.4005,
         0.0391,  0.6817, -0.3490, -0.09

<function BaseModelOutputWithPoolingAndCrossAttentions.values>

In [91]:
torch.tensor(segments_ids_set[1])

tensor([1000, 2061, 2045,  999, 1000, 6289, 2050])

In [6]:
# # Vectorize Clues
# from sklearn.feature_extraction.text import TfidfVectorizer

# vectorizer = TfidfVectorizer(strip_accents="unicode", stop_words="english")
# X_train = vectorizer.fit_transform(np.array(clues_train))
# X_train

# # Feature Engineering Ideas
# # - clue is a pun (ends in "?")
# # - clue is a proper noun (words are capitalized)

import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenized input
text = "Who was Jim Henson ? Jim Henson was a puppeteer"
tokenized_text = tokenizer.tokenize(text)

# Mask a token that we will try to predict back with `BertForMaskedLM`
masked_index = 6
tokenized_text[masked_index] = '[MASK]'
assert tokenized_text == ['who', 'was', 'jim', 'henson', '?', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer']

# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
segments_ids = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

ModuleNotFoundError: No module named 'pytorch_pretrained_bert'

In [None]:
# # Build a Nearest Neighbors Model
# from sklearn.neighbors import NearestNeighbors

# model = NearestNeighbors().fit(X_train)
# model

NearestNeighbors()

In [None]:
# Define a guesser function (just for testing; real one defined in guesser.py)
def guess(clue, slot_length=None, max_guesses=5, max_guesses_raw=30):
    clue_vector = vectorizer.transform([clue])
    distances, indices = model.kneighbors(clue_vector, n_neighbors=max_guesses_raw)
    raw_guesses = [answers_train[i] for i in indices[0]]
    # print([clues_train[i] for i in indices[0]])

    def valid(g):
        o = True
        if slot_length:
            o &= len(g) == slot_length
        o &= g.lower() not in clue.lower()
        return o
    
    guesses = [(g, d) for g, d in zip(raw_guesses, distances[0]) if valid(g)]
    return guesses[:max_guesses]

    # TODO:
    # - include a confidence with each guess
    # - use repeated guesses and distances to determine confidence


In [None]:
i = random.randint(0, len(clues_test))
test_clue, test_answer = clues_test[i], answers_test[i]
print("clue:", test_clue)
print("answer:", test_answer)

# distances, indices = model.kneighbors(vectorizer.transform([test_clue]), n_neighbors=10)
# print("guesses:", [answers_train[i] for i in indices[0]])

guesses = guess(test_clue, slot_length=len(test_answer))
print("guesses:", guesses)

clue: Unruffled, per Gelett Burgess
answer: CALMASANICEBERG
guesses: [('COOLASACUCUMBER', 0.963489171624135)]


In [None]:
correct_pairs = []
incorrect_pairs = []

for i, (clue, answer) in enumerate(zip(clues_test, answers_test)):
    if i % 100 == 0: print(i)
    guesses = guess(clue, len(answer), max_guesses=5)
    if answer in [g for g, _  in guesses]:
        correct_pairs.append((clue, answer, guesses))
    else:
        incorrect_pairs.append((clue, answer, guesses))

print(f"Accuracy: {len(correct_pairs) / len(clues_test):0.3%}")
# print(incorrect_pairs)

0
100
200
300
400
500
600
Accuracy: 51.325%


In [None]:
incorrect_pairs

[('Copiers',
  'APERS',
  [('AFOOT', 1.0), ('ASDOI', 1.0), ('TODAY', 1.0), ('TOWIT', 1.0)]),
 ('Air',
  'MIEN',
  [('SONG', 0.0), ('TUNE', 0.0), ('AURA', 0.0), ('AURA', 0.0), ('VENT', 0.0)]),
 ("Drag queen's collection",
  'WIGS',
  [('BOAS', 1.0536712127723509e-08),
   ('HAUL', 0.8401639537921233),
   ('BORE', 0.8401639537921233),
   ('HAUL', 0.8401639537921233),
   ('TOKE', 0.8401639537921233)]),
 ('Old NATO target',
  'USSR',
  [('PACT', 0.7559048542804673),
   ('PACT', 0.7559048542804673),
   ('PACT', 0.7559048542804673),
   ('PACT', 0.7559048542804673),
   ('PACT', 0.7559048542804673)]),
 ('Byproduct of an exhausted dairy cow?', 'WHIPPEDBUTTER', []),
 ('Nightclub in the Trump Taj',
  'CASBAH',
  [('BISTRO', 0.9899944489750127), ('BOREON', 1.0)]),
 ("Navy-ship V.I.P.'s",
  'XOS',
  [('ONI', 0.6721731113711898),
   ('ONI', 0.6721731113711898),
   ('ONI', 0.6721731113711898),
   ('SNO', 0.6721731113711898),
   ('ADM', 0.6721731113711898)]),
 ('Some toy bears, informally',
  'POOHS',


In [None]:
guess("opposite of NNE")

[('SSW', 0.0),
 ('SSW', 0.0),
 ('SSW', 0.0),
 ('LILLE', 0.8641804900624891),
 ('LILLE', 0.8641804900624891)]

In [None]:
# # serialize and save model
# import pickle

# TRAINED_MODEL_PATH = "trained_model.p"

# pickle.dump((answers_train, vectorizer, model), open(TRAINED_MODEL_PATH, "wb"))