## References

In [None]:
# https://radimrehurek.com/gensim/tut1.html
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
# https://docs.python.org/2/library/re.html

## Notebook Setup

In [None]:
# Import libraries
import io
import logging
import nltk
import pandas as pd
import pickle
from gensim import corpora
from gensim.models import Phrases
from nltk.stem.wordnet import WordNetLemmatizer

In [None]:
# Download the NLTK resources used later in this notebook: the "stopwords" corpus
# (read by nltk_stopwords) and "wordnet" (data needed by WordNetLemmatizer)
nltk.download("stopwords")
nltk.download("wordnet")

In [None]:
# Log events: configure the root logger to emit records of level INFO and above,
# formatted as "<timestamp> : <level name> : <message>"
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Load and Inspect Dataset

In [None]:
# Read data with timestamp as index: the "created" column becomes the index and
# parse_dates=True asks pandas to parse that index as dates. Only the CSV columns
# at positions 1-27 are loaded (usecols=range(1,28)), i.e. the first column is skipped.
tweets = pd.read_csv("../data/tweets.csv", encoding="latin1", parse_dates=True, 
                     index_col="created", usecols=range(1,28))

In [None]:
# Inspect dataframe: preview the first rows
tweets.head()

In [None]:
# Display dataframe info (column dtypes, non-null counts, memory usage)
tweets.info()

In [None]:
# Describe dataframe: summary statistics (by default, of the numeric columns)
tweets.describe()

In [None]:
# Keep only the rows whose "language3" value is "ENGLISH". The result is an
# independent copy, so columns added to it later do not touch `tweets`.
is_english = tweets["language3"].eq("ENGLISH")
english_tweets = tweets.loc[is_english].copy()

## Data Cleaning and Preparation

### Preprocess Twitter Text

In [None]:
# Display text: widen the pandas display limits so that long tweets are shown in full.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
# None means "no limit" on the column width. The former sentinel -1 was
# deprecated in pandas 1.0 and is rejected by newer pandas versions.
pd.set_option("display.max_colwidth", None)

# Show the raw tweet text (before any cleaning)
english_tweets["text"]

In [None]:
# Clean the raw tweet text with an ordered list of (pattern, replacement) steps.
#
# Every step is applied with an explicit regex=True: the default of
# Series.str.replace changed to literal matching in pandas 2.0, which would
# silently turn patterns such as r"http\S+" or r"[0-9]+" into no-ops.
CLEANING_STEPS = [
    # Remove links
    (r"http\S+", ""),
    # Remove emoticons (anything enclosed in <...>). [^>]* keeps each match
    # inside a single tag, so text standing between two tags is preserved.
    (r"<[^>]*>", ""),
    # Patterns containing ":" or "/" have to run before those characters are
    # stripped below, otherwise they can never match. \b restricts the "w/"
    # abbreviations to the start of a word.
    (r"h:[0-9]+m:[0-9]+s", ""),
    (r"\bW//", ""),
    (r"\bw/", " "),
    # Remove punctuation, special characters etc.
    (r"&amp", ""),
    # Separators that may stand between two words become a space ...
    (r"[.,;]", " "),
    # ... all other special characters (and line breaks) are deleted
    (r'[-"\\/*@\n|!~()?:+{}_]', ""),
    # Numbers become a space
    (r"[0-9]+", " "),
    # Code points \x85 and \x91-\x97: in Windows-1252 these are the ellipsis, curly
    # quotes, bullet and dashes; they show up as control characters here
    # (the file is read as latin1). \x92 (apostrophe) is mapped to "'".
    (r"[\x85\x91\x93-\x97]+", ""),
    (r"\x92+", "'"),
    # Reduce white spaces to 1
    (r" +", " "),
]

text_clean = english_tweets["text"]
for pattern, replacement in CLEANING_STEPS:
    text_clean = text_clean.str.replace(pattern, replacement, regex=True)

english_tweets["text_clean"] = text_clean

In [None]:
# Display cleaned text to check the result of the cleaning step
english_tweets["text_clean"]

In [None]:
# Reorder columns so that the raw and the cleaned text come first.
# NOTE: the bare `english_tweets.columns` expression is not the last expression
# of the cell, so it displays nothing; presumably it was used to look up the names.
english_tweets.columns
cols = ['text', 'text_clean', 'favoriteCount', 'replyToSN', 'truncated', 'replyToSID',
       'replyToUID', 'statusSource', 'retweetCount', 'longitude', 'latitude',
       'id_seccion', 'horaPeticion', 'id_distrito', 'grupoHora',
       'id_seccion_xy', 'favoriteCountOutlier', 'retweetCountOutlier',
       'tweetcount', 'movement', 'language3', 'dayofweek', 'weeknumber',
       'month', 'idBarrio_xy', 'idBarrio', 'user']

# Selecting with the list both reorders the columns and drops any column not listed
english_tweets = english_tweets[cols]

In order to train the topic models, 3 different pooling methods for the creation of documents are used: No Pooling (1), User Pooling (2) and Hashtag Pooling (3).

### Training Documents Option 1 (No Pooling)

In [None]:
# Treat every tweet as a different document (no pooling):
# one cleaned tweet string per list element
documents = english_tweets["text_clean"].tolist()

### Training Documents Option 2 (User Pooling)

In [None]:
# Treat all tweets by one user as one single document (user pooling).
# The tweets are joined with a space so that the last word of one tweet and the
# first word of the next one stay separate tokens.
user_combined = english_tweets.groupby("user")["text_clean"].apply(lambda x: " ".join(x))
documents_user_pooling = user_combined.tolist()

### Training Documents Option 3 (Hashtag Pooling)

In [None]:
# Treat all tweets with the same hashtag as one single document (hashtag pooling)

# Find all hashtags ("#" followed by everything up to the next whitespace)
english_tweets["hashtags"] = english_tweets["text_clean"].str.findall(r'#.*?(?=\s|$)')

# Separate hashtags in columns: hashtag1, hashtag2, ... The number of columns
# follows the largest number of hashtags found in one tweet instead of being
# fixed, so the cell also works when that maximum changes.
hashtag_frame = pd.DataFrame(english_tweets["hashtags"].tolist())
hashtag_frame.columns = ["hashtag" + str(position + 1)
                         for position in range(hashtag_frame.shape[1])]

# Join hashtags with tweet text. The columns are attached by position rather
# than with an index join: the index holds timestamps, and joining on repeated
# timestamps would duplicate rows and mix up hashtags between tweets.
hashtags_tweets = english_tweets.copy()
for column in hashtag_frame.columns:
    hashtags_tweets[column] = hashtag_frame[column].to_numpy()

In [None]:
# Create one dataframe with text for each hashtag column and save them in a dictionary
# (keys "hash1", "hash2", ...; each frame has the columns "hashtag" and "text" and
# only contains the tweets that actually have a hashtag in that position)
hashtag_columns = [column for column in hashtags_tweets.columns
                   if column.startswith("hashtag") and column[len("hashtag"):].isdigit()]

dictionary = {}
for column in hashtag_columns:
    pairs = hashtags_tweets[[column, "text_clean"]].dropna()
    pairs.columns = ["hashtag", "text"]
    dictionary["hash" + column[len("hashtag"):]] = pairs

# Concatenate all dataframes to one dataframe (the result is a dataframe
# where there is text for each hashtag found). A single concat call avoids
# re-copying the growing frame on every loop iteration.
hashtags = pd.concat(list(dictionary.values()))

# Combine text for each hashtag; a space keeps the words of consecutive tweets apart
hashtags_combined = hashtags.groupby("hashtag")["text"].apply(lambda x: " ".join(x))

In [None]:
# Remove some generic hashtags that cover a lot of different topics.
# errors="ignore" plus re-assignment keeps the cell re-runnable: an in-place drop
# raises KeyError on the second run (or whenever one of the labels is absent).
GENERIC_HASHTAGS = ["#Barcelona", "#Catalunya", "#Spain", "#BCN", "#BARCELONA",
                    "#Espana", "#BarcelonaSpain"]
hashtags_combined = hashtags_combined.drop(GENERIC_HASHTAGS, errors="ignore")

In [None]:
# Create documents: one pooled text string per remaining hashtag
documents_hashtag_pooling = hashtags_combined.tolist()

### Analyze Documents

In [None]:
# Number of whitespace-separated terms in each (unpooled) document
total = [len(item.split()) for item in documents]

In [None]:
# Calculate total number of terms
sum = 0
for number in total:
    sum += number

In [None]:
# Calculate average number of terms per document
sum/len(documents)

### Prepare Test Documents

The trained topic models will then be used to determine the topics of test documents.

The first objective of the research is to analyze the distribution of topics over the districts. For this purpose, district pooling is used to create the documents that will be tested.

In [None]:
# Merge all tweets from each district (district pooling) and treat them as one single document respectively.
# The tweets are joined with a space so that words of consecutive tweets do not fuse into one token.
district_combined = english_tweets.groupby("id_distrito")["text_clean"].apply(lambda x: " ".join(x))
documents_district_pooling = district_combined.tolist()

In [None]:
# Check documents: display the pooled text per district (Series indexed by id_distrito)
district_combined

In [None]:
# Check if order is right: documents_district_pooling follows this index order
district_combined.index # correct

The second objective is to look at the dynamic topic development over time. For this purpose, the dataset is divided according to time and documents are created on this basis.

Divide dataframe according to month.

In [None]:
# Sort index (the "created" timestamps) so that the rows are in chronological order
sorted_tweets = english_tweets.sort_index()

In [None]:
# Check first and last date of the sorted index
print(sorted_tweets.index[0]) # June 2017
print(sorted_tweets.index[-1]) # December 2017 (very incomplete)

In [None]:
# Create column that contains the month of the tweets (month number taken from
# the datetime index). This replaces the "month" column of sorted_tweets only;
# english_tweets keeps its original "month" values.
sorted_tweets["month"] = sorted_tweets.index.month

In [None]:
# Split dataframe according to month: label-slice the sorted datetime index
# between the first and the last day of every month (both ends inclusive)
month_bounds = [
    ("2017-06-01", "2017-06-30"),
    ("2017-07-01", "2017-07-31"),
    ("2017-08-01", "2017-08-31"),
    ("2017-09-01", "2017-09-30"),
    ("2017-10-01", "2017-10-31"),
    ("2017-11-01", "2017-11-30"),
    ("2017-12-01", "2017-12-31"),
]
(june, july, august, september,
 october, november, december) = [sorted_tweets.loc[first_day:last_day]
                                 for first_day, last_day in month_bounds]

In [None]:
# Count number of tweets per month (shown here for June)
len(june)

In [None]:
# Merge all tweets from each month and treat them as one document respectively.
# Grouping uses the "month" column of english_tweets; groupby sorts the group labels,
# which fixes the order of the documents. Tweets are joined with a space so that
# words of consecutive tweets do not fuse into one token.
months_combined = english_tweets.groupby("month")["text_clean"].apply(lambda x: " ".join(x))
documents_month_pooling = months_combined.tolist()

In [None]:
# Check how many documents (one per month label)
len(documents_month_pooling) # should be 7

In [None]:
# Check order of documents: the group labels are sorted, so the documents come
# in label order and not in calendar order
months_combined.index # Aug, Dec, Jul, Jun, Nov, Oct, Sep

In [None]:
# Count number of characters of the document at position 6 (the last month label)
len(documents_month_pooling[6])

In [None]:
# Convert strings to word lists. The positions follow the sorted month labels
# of months_combined: Aug, Dec, Jul, Jun, Nov, Oct, Sep.
(aug_words, dec_words, jul_words, jun_words,
 nov_words, oct_words, sept_words) = [documents_month_pooling[position].split()
                                      for position in range(7)]

In [None]:
# Count number of words per month (printed in calendar order, June to December)
for month_words in (jun_words, jul_words, aug_words, sept_words,
                    oct_words, nov_words, dec_words):
    print(len(month_words))

In [None]:
# Count unique words per month (printed in calendar order, June to December)
for month_words in (jun_words, jul_words, aug_words, sept_words,
                    oct_words, nov_words, dec_words):
    print(len(set(month_words)))

In [None]:
# Merge all tweets from one district and one month and treat them as one document respectively.
# Tweets are joined with a space so that words of consecutive tweets do not fuse into one token.
districts_per_month_combined = sorted_tweets.groupby(["month","id_distrito"])["text_clean"].apply(lambda x: " ".join(x))
documents_district_per_month_pooling = districts_per_month_combined.tolist()

In [None]:
# Check order of documents: (month, id_distrito) pairs in the order of the document list
districts_per_month_combined.index 

In [None]:
# Check how many documents (one per month/district combination that has tweets)
len(documents_district_per_month_pooling)

In [None]:
# Inspect the pooled texts (a Series indexed by month and id_distrito)
districts_per_month_combined

### Preprocess Documents for NMF Topic Modeling Method

In [None]:
# Create copy of no pooling documents (a new list, so the NMF-specific
# preprocessing leaves `documents` untouched)
nmf_documents = list(documents)

In [None]:
# Transform every document to lower case
nmf_documents = [doc.lower() for doc in nmf_documents]

In [None]:
# Delete stopwords.
# The documents are split on single spaces and every token found in NMF_STOPWORDS
# is dropped. Unlike replacing " word " substrings, this also removes stopwords at
# the very start or end of a document and any number of adjacent stopwords.
NMF_STOPWORDS = frozenset(
    "year to on wa #yourup de just posted photo la del en los el las barcelona #bcn "
    "cada nuestra around spanish día dia #photo first thing last #spain carrer make "
    "&lt &gt for a of the and in at by one day get españa #españa #repost since still "
    "never thank two think could many even igers que con un bcn d'horta ever come #ig "
    "i'm i've always le what's #barcelone like back thanks #barna spain yo #yo #el "
    "#barcelona".split()
)

nmf_documents = [
    " ".join(token for token in doc.split(" ") if token not in NMF_STOPWORDS)
    for doc in nmf_documents
]

In [None]:
# Display preprocessed documents (only the first ten: the full list holds one
# entry per tweet and would flood the cell output)
nmf_documents[:10]

### Save Training and Test Documents

In [None]:
# Plain-text exports: one document per line, UTF-8 encoded
text_exports = {
    "../outputs/documents.txt": documents,
    "../outputs/documents_user_pooling.txt": documents_user_pooling,
    "../outputs/documents_hashtag_pooling.txt": documents_hashtag_pooling,
    "../outputs/documents_district_pooling.txt": documents_district_pooling,
    "../outputs/documents_month_pooling.txt": documents_month_pooling,
    "../outputs/documents_district_per_month_pooling.txt": documents_district_per_month_pooling,
}
for path, docs in text_exports.items():
    with io.open(path, "w", encoding="utf-8") as f:
        f.writelines(item + "\n" for item in docs)

# Pickled exports of the training documents (lists of strings)
pickle_exports = {
    "../outputs/documents_no_pooling.p": documents,
    "../outputs/documents_user_pooling.p": documents_user_pooling,
    "../outputs/documents_hashtag_pooling.p": documents_hashtag_pooling,
    "../outputs/nmf_documents_no_pooling.p": nmf_documents,
}
for path, docs in pickle_exports.items():
    with open(path, "wb") as fp:
        pickle.dump(docs, fp)

In [None]:
# Preview the saved NMF documents (first ten only, to keep the cell output small)
nmf_documents[:10]

### Tokenize Training Documents

In [None]:
# Thanks to the previous preprocessing, lower-casing and splitting on whitespace
# is all that is needed to tokenize each pooling variant
texts_no_pooling = [document.lower().split() for document in documents]

texts_user_pooling = [document.lower().split() for document in documents_user_pooling]

texts_hashtag_pooling = [document.lower().split() for document in documents_hashtag_pooling]

### Save Unpreprocessed Tokenized Training Documents

In [None]:
# Pickle the tokenized documents before any further token-level preprocessing
unpreprocessed_exports = {
    "../outputs/tokenized_documents_no_pooling_unpp.p": texts_no_pooling,
    "../outputs/tokenized_documents_user_pooling_unpp.p": texts_user_pooling,
    "../outputs/tokenized_documents_hashtag_pooling_unpp.p": texts_hashtag_pooling,
}
for path, texts in unpreprocessed_exports.items():
    with open(path, "wb") as fp:
        pickle.dump(texts, fp)

### Further Preprocessing of Training Documents after Tokenization

In [None]:
# Remove numbers, but not words that contain numbers.
def _drop_numeric_tokens(docs):
    """Return the documents without the tokens that consist of numeric characters only."""
    return [[token for token in doc if not token.isnumeric()] for doc in docs]

texts_no_pooling = _drop_numeric_tokens(texts_no_pooling)
texts_user_pooling = _drop_numeric_tokens(texts_user_pooling)
texts_hashtag_pooling = _drop_numeric_tokens(texts_hashtag_pooling)

In [None]:
# Remove words that are only one character.
def _drop_single_character_tokens(docs):
    """Return the documents with only the tokens that are longer than one character."""
    return [[token for token in doc if len(token) > 1] for doc in docs]

texts_no_pooling = _drop_single_character_tokens(texts_no_pooling)
texts_user_pooling = _drop_single_character_tokens(texts_user_pooling)
texts_hashtag_pooling = _drop_single_character_tokens(texts_hashtag_pooling)

In [None]:
# Lemmatize all words in all documents (one shared WordNet lemmatizer).
lemmatizer = WordNetLemmatizer()

def _lemmatize_documents(docs):
    """Return the documents with every token replaced by its WordNet lemma."""
    return [list(map(lemmatizer.lemmatize, doc)) for doc in docs]

texts_no_pooling = _lemmatize_documents(texts_no_pooling)
texts_user_pooling = _lemmatize_documents(texts_user_pooling)
texts_hashtag_pooling = _lemmatize_documents(texts_hashtag_pooling)

In [None]:
# # ignore this part! computing bigrams did not improve models but made them worse!

# # compute bigrams
# # add bigrams to docs (only ones that appear 10 times or more)
# bigram = Phrases(texts_no_pooling, min_count=10)
# for idx in range(len(texts_no_pooling)):
#     for token in bigram[texts_no_pooling[idx]]:
#         if '_' in token:
#             # Token is a bigram, add to document.
#             texts_no_pooling[idx].append(token)
#             
# bigram = Phrases(texts_user_pooling, min_count=10)
# for idx in range(len(texts_user_pooling)):
#     for token in bigram[texts_user_pooling[idx]]:
#         if '_' in token:
#             # Token is a bigram, add to document.
#             texts_user_pooling[idx].append(token)
#             
# bigram = Phrases(texts_hashtag_pooling, min_count=10)
# for idx in range(len(texts_hashtag_pooling)):
#     for token in bigram[texts_hashtag_pooling[idx]]:
#         if '_' in token:
#             # Token is a bigram, add to document.
#             texts_hashtag_pooling[idx].append(token)

## Save Tokenized Training Documents

In [None]:
# Pickle the fully preprocessed, tokenized training documents
tokenized_exports = {
    "../outputs/tokenized_documents_no_pooling.p": texts_no_pooling,
    "../outputs/tokenized_documents_user_pooling.p": texts_user_pooling,
    "../outputs/tokenized_documents_hashtag_pooling.p": texts_hashtag_pooling,
}
for path, texts in tokenized_exports.items():
    with open(path, "wb") as fp:
        pickle.dump(texts, fp)

## Refine and Vectorize Corpora

In [None]:
# Define function to refine and vectorize corpus 
# (remove stopwords, very frequent and very infrequent words etc.)

# Project-specific stopwords (English and Spanish function words, generic
# location hashtags, frequent filler words), given as one whitespace-separated string
stpwords = "for a of the and to in at by one #yo #el day get españa #yourup #españa #repost yo el since still never thank two think could many even ha igers th que con un wa bcn d'horta ever come #ig el i'm i've always le what's #barcelone like last back thanks #barna spain barcelona #barcelona cada nuestra around spanish día dia #photo first thing last #spain carrer make &lt &gt de la del en las barcelona #bcn just posted photo year wa".split()

def nltk_stopwords():
    """Return the English stopword list shipped with NLTK as a set."""
    english_words = nltk.corpus.stopwords.words("english")
    return set(english_words)

def prep_corpus(docs, 
                additional_stopwords=set(stpwords),
                no_below=2, no_above=0.5,
                dictionary_name="tourism.dict", corpus_name="tourism.mm"):
    """Build, filter and store a gensim dictionary and bag-of-words corpus.

    Parameters
    ----------
    docs : list of list of str
        Tokenized documents.
    additional_stopwords : iterable of str
        Stopwords removed in addition to the NLTK English stopword list.
    no_below, no_above :
        Passed on to ``Dictionary.filter_extremes`` (``keep_n=None`` puts no
        cap on the vocabulary size).
    dictionary_name : str
        Path the dictionary is saved to.
    corpus_name : str
        Path the corpus is serialized to in Matrix Market format.

    Returns
    -------
    tuple
        ``(corpus, dictionary)``: the list of bag-of-words vectors and the
        filtered dictionary.
    """
    print("Building dictionary...")
    dictionary = corpora.Dictionary(docs)

    # Look up the id of every stopword; words missing from the dictionary map to None
    all_stopwords = nltk_stopwords().union(additional_stopwords)
    stopword_ids = [dictionary.token2id.get(word) for word in all_stopwords]
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()

    # Drop very rare and very frequent tokens
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    dictionary.compactify()
    dictionary.save(dictionary_name)  # keep the dictionary on disk for later use

    print("Building corpus...")
    corpus = list(map(dictionary.doc2bow, docs))
    corpora.MmCorpus.serialize(corpus_name, corpus)  # keep the corpus on disk for later use

    return corpus, dictionary

In [None]:
# Run function to vectorize corpora.
# prep_corpus returns (corpus, dictionary); unpacking one call per pooling variant
# avoids building and saving every dictionary and corpus twice.
corpus_no_pooling, dictionary_no_pooling = prep_corpus(
    texts_no_pooling,
    dictionary_name="../outputs/tourism_no_pooling.dict",
    corpus_name="../outputs/tourism_no_pooling.mm")

corpus_user_pooling, dictionary_user_pooling = prep_corpus(
    texts_user_pooling,
    dictionary_name="../outputs/tourism_user_pooling.dict",
    corpus_name="../outputs/tourism_user_pooling.mm")

corpus_hashtag_pooling, dictionary_hashtag_pooling = prep_corpus(
    texts_hashtag_pooling,
    dictionary_name="../outputs/tourism_hashtag_pooling.dict",
    corpus_name="../outputs/tourism_hashtag_pooling.mm")

## Apply Function to Preprocess Test Documents (Before Testing Them with Topic Models)

This function has to include all the same steps that were applied to the training documents!

In [None]:
# Define function
def preprocess(docs):
    '''Apply the training-time preprocessing steps to raw test documents.

    Steps, in the order used for the training documents: lower-case and split
    on whitespace, drop purely numeric tokens, drop one-character tokens,
    lemmatize, remove stopwords.

    Parameters
    ----------
    docs : iterable of str
        Cleaned document strings.

    Returns
    -------
    list of list of str
        One list of preprocessed tokens per document.
    '''
    lemmatizer = WordNetLemmatizer()

    # Same stopwords as prep_corpus: NLTK English stopwords plus the notebook's
    # own `stpwords` list. Sharing one definition keeps training and test
    # preprocessing from drifting apart.
    stopwords = nltk_stopwords().union(stpwords)

    preprocessed = []
    for document in docs:
        # Tokenize; remove numeric tokens (as done for the training documents)
        # and words that are only one character
        tokens = [token for token in document.lower().split()
                  if not token.isnumeric() and len(token) > 1]

        # Lemmatize all words
        lemmas = [lemmatizer.lemmatize(token) for token in tokens]

        # Remove stopwords (after lemmatizing, as the stopword list contains lemmas)
        preprocessed.append([lemma for lemma in lemmas if lemma not in stopwords])

    return preprocessed

In [None]:
# Apply function to test documents: each result is a list with one token list per
# pooled document (district, month, district per month)
texts_district_pooling = preprocess(documents_district_pooling)
texts_month_pooling = preprocess(documents_month_pooling)
texts_district_per_month_pooling = preprocess(documents_district_per_month_pooling)

## Save Preprocessed Test Documents

In [None]:
# Pickle the preprocessed, tokenized test documents
test_exports = {
    "../outputs/tokenized_documents_district_pooling.p": texts_district_pooling,
    "../outputs/tokenized_documents_month_pooling.p": texts_month_pooling,
    "../outputs/tokenized_documents_district_per_month_pooling.p": texts_district_per_month_pooling,
}
for path, texts in test_exports.items():
    with open(path, "wb") as fp:
        pickle.dump(texts, fp)

In [None]:
# # Ignore this part! just example code!

# # map tokens to ids
# print(dictionary_no_pooling.token2id)
# print(dictionary_user_pooling.token2id)
# print(dictionary_hashtag_pooling.token2id)

In [None]:
# # Ignore this part! just example code!

# # convert new document to vector 
# new_doc = "Sagrada Familia is amazing"
# new_vec_no_pooling = dictionary_no_pooling.doc2bow(new_doc.lower().split())
# print(new_vec_no_pooling)

In [None]:
# # Ignore this part! not needed for dataset!

# # corpus streaming: one document at a time
# class MyCorpus(object):
#     def __iter__(self):
#         for line in open("corpus_no_pooling.txt"):
#             # assume there's one document per line, tokens separated by whitespace
#             yield dictionary.doc2bow(line.lower().split())
#             
# corpus_memory_friendly = MyCorpus()  # doesn't load the corpus into memory!
# print(corpus_memory_friendly)
# 
# for vector in corpus_memory_friendly:  # load one vector into memory at a time
#     print(vector)