## References

In [2]:
# https://radimrehurek.com/gensim/tut1.html
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

## Prepare Notebook

In [3]:
# import packages
from gensim import corpora
import pandas as pd
import logging
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Phrases
import io
import pickle



In [4]:
# download stopwords and lemmatizer from nltk package
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to C:\Users\Sebastian
[nltk_data]     Birk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Sebastian
[nltk_data]     Birk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# log events
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Load and Inspect Dataset

In [6]:
# read data with timestamp as index
tweets = pd.read_csv("tweets.csv", encoding="latin1", parse_dates=True, 
                     index_col="created", usecols=range(1,28))

In [8]:
# inspect dataframe
tweets.head()

Unnamed: 0_level_0,text,favoriteCount,replyToSN,truncated,replyToSID,replyToUID,statusSource,retweetCount,longitude,latitude,...,retweetCountOutlier,tweetcount,movement,language3,dayofweek,weeknumber,month,idBarrio_xy,idBarrio,user
created,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-11-28 22:44:07,I'm at El Raval in Barcelona https://t.co/bSGA...,0,,False,,,"<a href=""http://foursquare.com"" rel=""nofollow""...",0,2.168964,41.380936,...,0,1,1.0,OTHER,Tuesday,48,November,1,55,u03883
2017-11-22 19:48:53,<ed><U+00A0><U+00BC><ed><U+00B6><U+0098> @ O't...,0,,False,,,"<a href=""http://instagram.com"" rel=""nofollow"">...",0,2.16818,41.381031,...,0,2,1.0,OTHER,Wednesday,47,November,1,55,u02046
2017-11-21 21:58:48,Aquesta setmana publiquem una nova escapada al...,1,,False,,,"<a href=""http://instagram.com"" rel=""nofollow"">...",1,2.168721,41.380217,...,0,1,1.0,CATALAN,Tuesday,47,November,1,55,u03884
2017-11-20 11:15:10,I'm at El Raval in Barcelona https://t.co/xz2A...,0,,False,,,"<a href=""http://foursquare.com"" rel=""nofollow""...",0,2.168964,41.380936,...,0,4,1.0,OTHER,Monday,47,November,1,55,u00881
2017-11-20 10:08:51,Hablan catalán y es importante destacar que el...,0,,False,,,"<a href=""http://instagram.com"" rel=""nofollow"">...",0,2.16818,41.381031,...,0,2,1.0,SPANISH,Monday,47,November,1,55,u02047


In [9]:
# display dataframe info
tweets.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 23778 entries, 2017-11-28 22:44:07 to 2017-06-11 15:55:42
Data columns (total 26 columns):
text                    23778 non-null object
favoriteCount           23778 non-null int64
replyToSN               821 non-null object
truncated               23778 non-null bool
replyToSID              574 non-null float64
replyToUID              821 non-null float64
statusSource            23778 non-null object
retweetCount            23778 non-null int64
longitude               23778 non-null float64
latitude                23778 non-null float64
id_seccion              23778 non-null int64
horaPeticion            23778 non-null object
id_distrito             23778 non-null int64
grupoHora               23778 non-null object
id_seccion_xy           23778 non-null int64
favoriteCountOutlier    23778 non-null int64
retweetCountOutlier     23778 non-null int64
tweetcount              23778 non-null int64
movement                23778 non-null f

In [10]:
# describe dataframe 
tweets.describe()

Unnamed: 0,favoriteCount,replyToSID,replyToUID,retweetCount,longitude,latitude,id_seccion,id_distrito,id_seccion_xy,favoriteCountOutlier,retweetCountOutlier,tweetcount,movement,weeknumber,idBarrio_xy,idBarrio
count,23778.0,574.0,821.0,23778.0,23778.0,23778.0,23778.0,23778.0,23778.0,23778.0,23778.0,23778.0,23778.0,23778.0,23778.0,23778.0
mean,1.200774,9.015351e+17,4.078693e+16,0.285222,2.171064,41.395325,801904000.0,801903.985953,799947600.0,0.000673,0.000126,17.686349,0.690218,34.522962,18.136008,21.180503
std,20.403217,5.293076e+16,1.808261e+17,4.610914,0.02119,0.014936,3028.198,3.023254,39557360.0,0.025932,0.011232,62.044925,0.326073,7.046783,21.415965,22.107374
min,0.0,1.229677e+17,780290.0,0.0,2.059243,41.33258,801901000.0,801901.0,0.0,0.0,0.0,1.0,0.1,23.0,0.0,1.0
25%,0.0,8.862349e+17,119547900.0,0.0,2.15972,41.38278,801902000.0,801902.0,801902000.0,0.0,0.0,1.0,0.384615,28.0,6.0,6.0
50%,0.0,9.079774e+17,353792800.0,0.0,2.174778,41.39525,801902100.0,801902.0,801902100.0,0.0,0.0,3.0,0.75,34.0,7.0,9.0
75%,1.0,9.232741e+17,1028215000.0,0.0,2.176944,41.40408,801906000.0,801906.0,801905000.0,0.0,0.0,8.0,1.0,40.0,25.0,31.0
max,2449.0,9.354775e+17,9.290872e+17,567.0,2.22662,41.46559,801910200.0,801910.0,801910200.0,1.0,1.0,440.0,1.0,49.0,73.0,73.0


In [11]:
# divide dataset according to language: extract english language
english_tweets = tweets[tweets["language3"] == "ENGLISH"].copy()

## Data Cleaning and Preparation

### Preprocess twitter text

In [12]:
# remove links
english_tweets["text_clean"] = english_tweets["text"].str.replace(r"http\S+", "")
# remove emoticons
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"<.*>", "")
# remove @s
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"@", "")
# remove punctuation and special characters
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"&amp", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"\.", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"\,", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"\;", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"\-", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"\"", "")

In [13]:
# reorder columns
english_tweets.columns
cols = ['text', 'text_clean', 'favoriteCount', 'replyToSN', 'truncated', 'replyToSID',
       'replyToUID', 'statusSource', 'retweetCount', 'longitude', 'latitude',
       'id_seccion', 'horaPeticion', 'id_distrito', 'grupoHora',
       'id_seccion_xy', 'favoriteCountOutlier', 'retweetCountOutlier',
       'tweetcount', 'movement', 'language3', 'dayofweek', 'weeknumber',
       'month', 'idBarrio_xy', 'idBarrio', 'user']

english_tweets = english_tweets[cols]

#### In order to train the topic models, 3 different pooling methods for the creation of documents are used.

### LDA Training Documents Option 1 (No Pooling)

In [14]:
# treat every tweet as a document (no pooling)
documents = english_tweets["text_clean"].tolist()

### LDA Training Documents Option 2 (User Pooling)

In [15]:
# treat all tweets by one user as a document (user pooling)
user_combined = english_tweets[["text_clean","user"]].groupby("user")["text_clean"].apply(lambda x: "".join(x))
documents_user_pooling = user_combined.tolist()

### LDA Training Documents Option 3 (Hashtag Pooling)

In [16]:
# treat all tweets with same hashtags as a document (hashtag pooling)

# find all hashtags
english_tweets["hashtags"] = english_tweets["text_clean"].str.findall(r'#.*?(?=\s|$)')

# separate hashtags in columns
hashtags_tweets = pd.DataFrame(english_tweets["hashtags"].tolist(),
                               columns=["hashtag1", "hashtag2", "hashtag3", "hashtag4",
                                        "hashtag5", "hashtag6", "hashtag7", "hashtag8",
                                        "hashtag9", "hashtag10", "hashtag11", "hashtag12",
                                        "hashtag13"])

# join hashtags with tweet text
hashtags_tweets.index = english_tweets.index
hashtags_tweets = english_tweets.join(hashtags_tweets)

In [17]:
# create one dataframe with text for each hashtag column and save them in a dictionary
dict = {}
for index, item in enumerate(["hash1", "hash2", "hash3", "hash4", "hash5",
                              "hash6", "hash7", "hash8", "hash9", "hash10",
                              "hash11", "hash12", "hash13"]):
    dict[item] = hashtags_tweets[["hashtag" + str(index + 1), "text_clean"]].copy()
    dict[item].columns = ["hashtag", "text"]
    dict[item].dropna(inplace=True)

# concatenate all dataframes to one dataframe (the result is a dataframe
# where there is text for each hashtag found)
hashtags = pd.DataFrame()
for item in dict:
    hashtags = pd.concat([hashtags, dict[item]])

# combine text for each hashtag
hashtags_combined = hashtags.groupby("hashtag")["text"].apply(lambda x: "".join(x))

In [18]:
# remove some generic hashtags that cover a lot of different topics
hashtags_combined.drop(["#Barcelona", "#Catalunya", "#Spain", "#BCN", "#BARCELONA",
                        "#Espana", "#BarcelonaSpain"], inplace=True)

In [19]:
# create documents
documents_hashtag_pooling = hashtags_combined.tolist()

### Prepare Test Documents

#### The first objective of the research is to analyze the distribution of topics over the districts. For this purpose, district pooling is used to create the documents that will be tested.

In [20]:
# merge all tweets from each district (district pooling) and treat them as one document respectively
district_combined = english_tweets[["text_clean","id_distrito"]].groupby("id_distrito")["text_clean"].apply(lambda x: "".join(x))
documents_district_pooling = district_combined.tolist()

In [45]:
# check documents
district_combined

id_distrito
801901    #cure ( Betty Ford's in Barcelona) Sunday fund...
801902    Barcelona Plaza de España #usk #urbansketchers...
801903    Get up work [enjoy] sleep repeat #workroutine ...
801904    Last Tuesday hyde_club honored to play leocost...
801905    #poblenou #barcelona #gentrification #gentrifi...
801906    Saturday back to bocacocktaillounge from 23:00...
801907    Just posted a photo  Barri de Gracia Barcelona...
801908    Brew Pub to try a few of the 30 beers on offer...
801909    I love my school is on fire! Fuego! Mantenlo p...
801910    LAST DAY IN BARCELONA | Getting some sun some ...
Name: text_clean, dtype: object

#### The second objective is to look at the dynamic topic development over time. For this purpose, the dataset is divided according to time and documents are created on this basis.

#### Divide dataframe according to month

In [21]:
# sort index
sorted_tweets = english_tweets.sort_index()

In [22]:
# check first and last date
print(sorted_tweets.index[0]) # June 2017
print(sorted_tweets.index[-1]) # December 2017

2017-06-11 13:46:35
2017-12-04 21:20:25


In [23]:
# create column that contains the month of the tweets
sorted_tweets['month'] = sorted_tweets.index.month

In [24]:
# # ignore this part!

# # split dataframe according to month
# june = sorted_tweets.loc['2017-06-01':'2017-06-30']
# july = sorted_tweets.loc['2017-07-01':'2017-07-31']
# august = sorted_tweets.loc['2017-08-01':'2017-08-31']
# september = sorted_tweets.loc['2017-09-01':'2017-09-30']
# october = sorted_tweets.loc['2017-10-01':'2017-10-31']
# november = sorted_tweets.loc['2017-11-01':'2017-11-30']
# december = sorted_tweets.loc['2017-12-01':'2017-12-31']

In [25]:
# merge all tweets from each month and treat them as one document respectively
months_combined = english_tweets[["text_clean","month"]].groupby("month")["text_clean"].apply(lambda x: "".join(x))
documents_month_pooling = months_combined.tolist()

In [26]:
# check how many documents
len(documents_month_pooling) # should be 7

7

In [27]:
# merge all tweets from one district and one month and treat them as one document respectively
districts_per_month_combined = sorted_tweets[["text_clean","month","id_distrito"]].groupby(["month","id_distrito"])["text_clean"].apply(lambda x: "".join(x))
documents_district_per_month_pooling = districts_per_month_combined.tolist()

In [28]:
# check how many documents
len(documents_district_per_month_pooling)

62

In [29]:
# inspect dataframe
districts_per_month_combined

month  id_distrito
6      801901         Boy don't hurt your brain  Gothic Quarter Barc...
       801902         I was moved  inspired  My life will never be t...
       801903         CARAVANA #casualcomposition #waytosee #francba...
       801904         JoelJoanJuveMorning concepts #everisdesign #de...
       801905         Lots of walks some hikes sightseeing Sangria f...
       801906         I like the shadow and light in this #iphone pi...
       801907         Just posted a photo  Parc del Laberint d'Horta...
       801908         Ella y el Agua! #karla #bcn #AJ #love #rocknro...
       801909         Conmuting to work ( METRO Navas  tmb_barcelona...
       801910          seeing him live was a religious experience #k...
7      801901         More #beautifulmemories from #barcelona #artdi...
       801902         Frederic Amat La instalación Zoótropo #welcome...
       801903          Look for the noise hidden in silence the move...
       801904         Just posted a photo  Pa

### Save Training and Test Documents

In [30]:
with io.open('documents.txt', 'w', encoding='utf-8') as f:
    for item in documents:
        f.write(item)
with io.open('documents_user_pooling.txt', 'w', encoding='utf-8') as f:
    for item in documents_user_pooling:
        f.write(item)
with io.open('documents_hashtag_pooling.txt', 'w', encoding='utf-8') as f:
    for item in documents_hashtag_pooling:
        f.write(item)
with io.open('documents_district_pooling.txt', 'w', encoding='utf-8') as f:
    for item in documents_district_pooling:
        f.write(item)
with io.open('documents_month_pooling.txt', 'w', encoding='utf-8') as f:
    for item in documents_month_pooling:
        f.write(item)
with io.open('documents_district_per_month_pooling.txt', 'w', encoding='utf-8') as f:
    for item in documents_district_per_month_pooling:
        f.write(item)

### Tokenize Training Documents

In [31]:
texts_no_pooling = [[word for word in document.lower().split()]
          for document in documents]

texts_user_pooling = [[word for word in document.lower().split()]
          for document in documents_user_pooling]

texts_hashtag_pooling = [[word for word in document.lower().split()]
          for document in documents_hashtag_pooling]

### Further Preprocessing of Training Documents after Tokenization

In [32]:
# remove numbers, but not words that contain numbers.
texts_no_pooling = [[token for token in doc if not token.isnumeric()] for doc in texts_no_pooling]
texts_user_pooling = [[token for token in doc if not token.isnumeric()] for doc in texts_user_pooling]
texts_hashtag_pooling = [[token for token in doc if not token.isnumeric()] for doc in texts_hashtag_pooling]

In [33]:
# remove words that are only one character.
texts_no_pooling = [[token for token in doc if len(token) > 1] for doc in texts_no_pooling]
texts_user_pooling = [[token for token in doc if len(token) > 1] for doc in texts_user_pooling]
texts_hashtag_pooling = [[token for token in doc if len(token) > 1] for doc in texts_hashtag_pooling]

In [34]:
# lemmatize all words in all documents.
lemmatizer = WordNetLemmatizer()
texts_no_pooling = [[lemmatizer.lemmatize(token) for token in doc] for doc in texts_no_pooling]
texts_user_pooling = [[lemmatizer.lemmatize(token) for token in doc] for doc in texts_user_pooling]
texts_hashtag_pooling = [[lemmatizer.lemmatize(token) for token in doc] for doc in texts_hashtag_pooling]

In [35]:
# # ignore this part! computing bigrams did not improve models but made them worse!

# # compute bigrams
# # add bigrams and trigrams to docs (only ones that appear 5 times or more)
# bigram = Phrases(texts_no_pooling, min_count=10)
# for idx in range(len(texts_no_pooling)):
#     for token in bigram[texts_no_pooling[idx]]:
#         if '_' in token:
#             # Token is a bigram, add to document.
#             texts_no_pooling[idx].append(token)
#             
# bigram = Phrases(texts_user_pooling, min_count=10)
# for idx in range(len(texts_user_pooling)):
#     for token in bigram[texts_user_pooling[idx]]:
#         if '_' in token:
#             # Token is a bigram, add to document.
#             texts_user_pooling[idx].append(token)
#             
# bigram = Phrases(texts_hashtag_pooling, min_count=10)
# for idx in range(len(texts_hashtag_pooling)):
#     for token in bigram[texts_hashtag_pooling[idx]]:
#         if '_' in token:
#             # Token is a bigram, add to document.
#             texts_hashtag_pooling[idx].append(token)

## Save Tokenized Training Documents

In [36]:
with open('tokenized_documents_no_pooling.p', 'wb') as fp:
    pickle.dump(texts_no_pooling, fp)
    
with open('tokenized_documents_user_pooling.p', 'wb') as fp:
    pickle.dump(texts_user_pooling, fp)
    
with open('tokenized_documents_hashtag_pooling.p', 'wb') as fp:
    pickle.dump(texts_hashtag_pooling, fp)

## Refine and Vectorize Corpora

In [37]:
# define function to refine and vectorize corpus 
# (remove stopwords, very frequent and very infrequent words etc.)

# define stopwords
stpwords = 'for a of the and to in at by spain barcelona #barcelona #spain de la del en las "barcelona #bcn'.split()

def nltk_stopwords():
    return set(nltk.corpus.stopwords.words('english'))

def prep_corpus(docs, 
                additional_stopwords=set(stpwords),
                no_below=2, no_above=0.5,
                dictionary_name='tourism.dict', corpus_name='tourism.mm'):
    print('Building dictionary...')
    dictionary = corpora.Dictionary(docs)
    stopwords = nltk_stopwords().union(additional_stopwords)
    stopword_ids = map(dictionary.token2id.get, stopwords)
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    dictionary.compactify()
    dictionary.save(dictionary_name)  # store the dictionary, for future reference
    
    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    corpora.MmCorpus.serialize(corpus_name, corpus)  # store to disk, for later use
    
    return (corpus, dictionary)

In [38]:
# run function to vectorize corpora
corpus_no_pooling = prep_corpus(texts_no_pooling,
                                dictionary_name="tourism_no_pooling.dict",
                                corpus_name="tourism_no_pooling.mm")[0]
dictionary_no_pooling = prep_corpus(texts_no_pooling,
                                    dictionary_name="tourism_no_pooling.dict",
                                    corpus_name="tourism_no_pooling.mm")[1]

corpus_user_pooling = prep_corpus(texts_user_pooling,
                                    dictionary_name="tourism_user_pooling.dict",
                                    corpus_name="tourism_user_pooling.mm")[0]
dictionary_user_pooling = prep_corpus(texts_user_pooling,
                                    dictionary_name="tourism_user_pooling.dict",
                                    corpus_name="tourism_user_pooling.mm")[1]

corpus_hashtag_pooling = prep_corpus(texts_hashtag_pooling,
                                    dictionary_name="tourism_hashtag_pooling.dict",
                                    corpus_name="tourism_hashtag_pooling.mm")[0]
dictionary_hashtag_pooling = prep_corpus(texts_hashtag_pooling,
                                    dictionary_name="tourism_hashtag_pooling.dict",
                                    corpus_name="tourism_hashtag_pooling.mm")[1]

2018-09-26 16:37:35,653 : INFO : adding document #0 to Dictionary(0 unique tokens: [])


Building dictionary...


2018-09-26 16:37:35,854 : INFO : built Dictionary(16741 unique tokens: ['#olgodbarcelona', '#triathlontraining', '#ølgod', 'beer', 'brew']...) from 7633 documents (total 79537 corpus positions)
2018-09-26 16:37:36,032 : INFO : discarding 11562 tokens: [('#olgodbarcelona', 1), ('#triathlontraining', 1), ('quickie', 1), ('#hadthemostmusselsinmylife', 1), ('#tripoftheyear', 1), ('feast!', 1), ('freshest', 1), ('hail', 1), ('2h:27m:58s', 1), ('mallan', 1)]...
2018-09-26 16:37:36,033 : INFO : keeping 5035 tokens which were in no less than 2 and no more than 3816 (=50.0%) documents
2018-09-26 16:37:36,046 : INFO : resulting dictionary: Dictionary(5035 unique tokens: ['#ølgod', 'beer', 'brew', 'offer', 'pub']...)
2018-09-26 16:37:36,053 : INFO : saving Dictionary object under tourism_no_pooling.dict, separately None
2018-09-26 16:37:36,061 : INFO : saved tourism_no_pooling.dict
2018-09-26 16:37:36,232 : INFO : storing corpus in Matrix Market format to tourism_no_pooling.mm
2018-09-26 16:37:36

Building corpus...


2018-09-26 16:37:36,318 : INFO : PROGRESS: saving document #2000
2018-09-26 16:37:36,367 : INFO : PROGRESS: saving document #3000
2018-09-26 16:37:36,399 : INFO : PROGRESS: saving document #4000
2018-09-26 16:37:36,425 : INFO : PROGRESS: saving document #5000
2018-09-26 16:37:36,454 : INFO : PROGRESS: saving document #6000
2018-09-26 16:37:36,481 : INFO : PROGRESS: saving document #7000
2018-09-26 16:37:36,499 : INFO : saved 7633x5035 matrix, density=0.108% (41550/38432155)
2018-09-26 16:37:36,502 : INFO : saving MmCorpus index to tourism_no_pooling.mm.index
2018-09-26 16:37:36,507 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-09-26 16:37:36,693 : INFO : built Dictionary(16741 unique tokens: ['#olgodbarcelona', '#triathlontraining', '#ølgod', 'beer', 'brew']...) from 7633 documents (total 79537 corpus positions)


Building dictionary...


2018-09-26 16:37:36,746 : INFO : discarding 11562 tokens: [('#olgodbarcelona', 1), ('#triathlontraining', 1), ('quickie', 1), ('#hadthemostmusselsinmylife', 1), ('#tripoftheyear', 1), ('feast!', 1), ('freshest', 1), ('hail', 1), ('2h:27m:58s', 1), ('mallan', 1)]...
2018-09-26 16:37:36,749 : INFO : keeping 5035 tokens which were in no less than 2 and no more than 3816 (=50.0%) documents
2018-09-26 16:37:36,759 : INFO : resulting dictionary: Dictionary(5035 unique tokens: ['#ølgod', 'beer', 'brew', 'offer', 'pub']...)
2018-09-26 16:37:36,766 : INFO : saving Dictionary object under tourism_no_pooling.dict, separately None
2018-09-26 16:37:36,772 : INFO : saved tourism_no_pooling.dict
2018-09-26 16:37:36,887 : INFO : storing corpus in Matrix Market format to tourism_no_pooling.mm
2018-09-26 16:37:36,890 : INFO : saving sparse matrix to tourism_no_pooling.mm
2018-09-26 16:37:36,891 : INFO : PROGRESS: saving document #0
2018-09-26 16:37:36,910 : INFO : PROGRESS: saving document #1000
2018-09

Building corpus...


2018-09-26 16:37:37,027 : INFO : PROGRESS: saving document #6000
2018-09-26 16:37:37,047 : INFO : PROGRESS: saving document #7000
2018-09-26 16:37:37,067 : INFO : saved 7633x5035 matrix, density=0.108% (41550/38432155)
2018-09-26 16:37:37,071 : INFO : saving MmCorpus index to tourism_no_pooling.mm.index
2018-09-26 16:37:37,081 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-09-26 16:37:37,249 : INFO : built Dictionary(16761 unique tokens: ['castellano', 'catalangov:', 'cataluña', 'consecuencia', 'country']...) from 4424 documents (total 79383 corpus positions)


Building dictionary...


2018-09-26 16:37:37,308 : INFO : discarding 12432 tokens: [('castellano', 1), ('catalangov:', 1), ('consecuencia', 1), ('exclusion', 1), ('instil', 1), ('linguistica', 1), ('mala', 1), ('politica', 1), ('practica', 1), ('shameful', 1)]...
2018-09-26 16:37:37,310 : INFO : keeping 4185 tokens which were in no less than 2 and no more than 2212 (=50.0%) documents
2018-09-26 16:37:37,324 : INFO : resulting dictionary: Dictionary(4185 unique tokens: ['cataluña', 'country', 'e', 'fear', 'police']...)
2018-09-26 16:37:37,328 : INFO : saving Dictionary object under tourism_user_pooling.dict, separately None
2018-09-26 16:37:37,336 : INFO : saved tourism_user_pooling.dict
2018-09-26 16:37:37,428 : INFO : storing corpus in Matrix Market format to tourism_user_pooling.mm
2018-09-26 16:37:37,431 : INFO : saving sparse matrix to tourism_user_pooling.mm
2018-09-26 16:37:37,433 : INFO : PROGRESS: saving document #0
2018-09-26 16:37:37,476 : INFO : PROGRESS: saving document #1000
2018-09-26 16:37:37,50

Building corpus...
Building dictionary...


2018-09-26 16:37:37,739 : INFO : built Dictionary(16761 unique tokens: ['castellano', 'catalangov:', 'cataluña', 'consecuencia', 'country']...) from 4424 documents (total 79383 corpus positions)
2018-09-26 16:37:37,801 : INFO : discarding 12432 tokens: [('castellano', 1), ('catalangov:', 1), ('consecuencia', 1), ('exclusion', 1), ('instil', 1), ('linguistica', 1), ('mala', 1), ('politica', 1), ('practica', 1), ('shameful', 1)]...
2018-09-26 16:37:37,803 : INFO : keeping 4185 tokens which were in no less than 2 and no more than 2212 (=50.0%) documents
2018-09-26 16:37:37,812 : INFO : resulting dictionary: Dictionary(4185 unique tokens: ['cataluña', 'country', 'e', 'fear', 'police']...)
2018-09-26 16:37:37,817 : INFO : saving Dictionary object under tourism_user_pooling.dict, separately None
2018-09-26 16:37:37,822 : INFO : saved tourism_user_pooling.dict
2018-09-26 16:37:37,931 : INFO : storing corpus in Matrix Market format to tourism_user_pooling.mm
2018-09-26 16:37:37,936 : INFO : sa

Building corpus...


2018-09-26 16:37:38,089 : INFO : saved 4424x4185 matrix, density=0.182% (33668/18514440)
2018-09-26 16:37:38,092 : INFO : saving MmCorpus index to tourism_user_pooling.mm.index
2018-09-26 16:37:38,101 : INFO : adding document #0 to Dictionary(0 unique tokens: [])


Building dictionary...


2018-09-26 16:37:38,506 : INFO : built Dictionary(11970 unique tokens: ['#11setembre', '#2017', '#aj', '#aperitivo', '#architecture']...) from 6198 documents (total 148488 corpus positions)
2018-09-26 16:37:38,545 : INFO : discarding 1931 tokens: [('(out', 1), ('10k++)', 1), ("advisor's", 1), ('carot', 1), ('celebration', 1), ('forgotten', 1), ('garden!', 1), ('napa', 1), ('oriental', 1), ('skewer', 1)]...
2018-09-26 16:37:38,547 : INFO : keeping 9904 tokens which were in no less than 2 and no more than 3099 (=50.0%) documents
2018-09-26 16:37:38,563 : INFO : resulting dictionary: Dictionary(9904 unique tokens: ['#11setembre', '#2017', '#aj', '#aperitivo', '#architecture']...)
2018-09-26 16:37:38,576 : INFO : saving Dictionary object under tourism_hashtag_pooling.dict, separately None
2018-09-26 16:37:38,585 : INFO : saved tourism_hashtag_pooling.dict
2018-09-26 16:37:38,757 : INFO : storing corpus in Matrix Market format to tourism_hashtag_pooling.mm
2018-09-26 16:37:38,759 : INFO : s

Building corpus...


2018-09-26 16:37:38,851 : INFO : PROGRESS: saving document #2000
2018-09-26 16:37:38,900 : INFO : PROGRESS: saving document #3000
2018-09-26 16:37:38,941 : INFO : PROGRESS: saving document #4000
2018-09-26 16:37:38,980 : INFO : PROGRESS: saving document #5000
2018-09-26 16:37:39,019 : INFO : PROGRESS: saving document #6000
2018-09-26 16:37:39,025 : INFO : saved 6198x9904 matrix, density=0.138% (84677/61384992)
2018-09-26 16:37:39,027 : INFO : saving MmCorpus index to tourism_hashtag_pooling.mm.index
2018-09-26 16:37:39,039 : INFO : adding document #0 to Dictionary(0 unique tokens: [])


Building dictionary...


2018-09-26 16:37:39,272 : INFO : built Dictionary(11970 unique tokens: ['#11setembre', '#2017', '#aj', '#aperitivo', '#architecture']...) from 6198 documents (total 148488 corpus positions)
2018-09-26 16:37:39,317 : INFO : discarding 1931 tokens: [('(out', 1), ('10k++)', 1), ("advisor's", 1), ('carot', 1), ('celebration', 1), ('forgotten', 1), ('garden!', 1), ('napa', 1), ('oriental', 1), ('skewer', 1)]...
2018-09-26 16:37:39,319 : INFO : keeping 9904 tokens which were in no less than 2 and no more than 3099 (=50.0%) documents
2018-09-26 16:37:39,335 : INFO : resulting dictionary: Dictionary(9904 unique tokens: ['#11setembre', '#2017', '#aj', '#aperitivo', '#architecture']...)
2018-09-26 16:37:39,345 : INFO : saving Dictionary object under tourism_hashtag_pooling.dict, separately None
2018-09-26 16:37:39,355 : INFO : saved tourism_hashtag_pooling.dict
2018-09-26 16:37:39,524 : INFO : storing corpus in Matrix Market format to tourism_hashtag_pooling.mm
2018-09-26 16:37:39,527 : INFO : s

Building corpus...


2018-09-26 16:37:39,624 : INFO : PROGRESS: saving document #2000
2018-09-26 16:37:39,670 : INFO : PROGRESS: saving document #3000
2018-09-26 16:37:39,707 : INFO : PROGRESS: saving document #4000
2018-09-26 16:37:39,750 : INFO : PROGRESS: saving document #5000
2018-09-26 16:37:39,791 : INFO : PROGRESS: saving document #6000
2018-09-26 16:37:39,800 : INFO : saved 6198x9904 matrix, density=0.138% (84677/61384992)
2018-09-26 16:37:39,802 : INFO : saving MmCorpus index to tourism_hashtag_pooling.mm.index


## Apply Function to Preprocess Test Documents (Before Testing Them with LDA)

#### This function has to include all the same steps that were applied to the training documents!

In [39]:
# define function
def preprocess(docs):
    ''' Conduct all preprocessing steps that are conducted to train the LDA model'''
    
    # tokenize documents
    tokenized = [[word for word in document.lower().split()]
          for document in docs]
    
    # remove words that are only one character
    tokenized = [[token for token in doc if len(token) > 1] for doc in tokenized]
    
    # lemmatize all words
    lemmatizer = WordNetLemmatizer()
    lemmatized = [[lemmatizer.lemmatize(token) for token in doc] for doc in tokenized]
    
    # define stopwords
    stpwords = 'for a of the and to in at by spain barcelona #barcelona #spain de la del en las "barcelona #bcn'.split()
    
    # get stopwords from nltk
    def nltk_stopwords():
        return set(nltk.corpus.stopwords.words('english'))

    # combine stopwords
    stopwords = nltk_stopwords().union(stpwords)
    
    # remove stopwords
    preprocessed = [[token for token in document if token not in stopwords] for document in lemmatized]
    
    return preprocessed

In [40]:
# apply function to test documents
texts_district_pooling = preprocess(documents_district_pooling)
texts_month_pooling = preprocess(documents_month_pooling)
texts_district_per_month_pooling = preprocess(documents_district_per_month_pooling)

## Save Preprocessed Test Documents

In [41]:
with open('tokenized_documents_district_pooling.p', 'wb') as fp:
    pickle.dump(texts_district_pooling, fp)
    
with open('tokenized_documents_month_pooling.p', 'wb') as fp:
    pickle.dump(texts_month_pooling, fp)
    
with open('tokenized_documents_district_per_month_pooling.p', 'wb') as fp:
    pickle.dump(texts_district_per_month_pooling, fp)

In [42]:
# # ignore this part!

# # map tokens to ids
# print(dictionary_no_pooling.token2id)
# print(dictionary_user_pooling.token2id)
# print(dictionary_hashtag_pooling.token2id)

In [43]:
# # ignore this part!

# # convert new document to vector 
# new_doc = "Sagrada Familia is amazing"
# new_vec_no_pooling = dictionary_no_pooling.doc2bow(new_doc.lower().split())
# print(new_vec_no_pooling)

In [44]:
# # ignore this part! not needed for dataset!

# # corpus streaming: one document at a time
# class MyCorpus(object):
#     def __iter__(self):
#         for line in open("corpus_no_pooling.txt"):
#             # assume there's one document per line, tokens separated by whitespace
#             yield dictionary.doc2bow(line.lower().split())
#             
# corpus_memory_friendly = MyCorpus()  # doesn't load the corpus into memory!
# print(corpus_memory_friendly)
# 
# for vector in corpus_memory_friendly:  # load one vector into memory at a time
#     print(vector)