## References

In [1]:
# https://radimrehurek.com/gensim/tut1.html
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
# https://docs.python.org/2/library/re.html

## Prepare Notebook

In [3]:
# import packages
from gensim import corpora
import pandas as pd
import logging
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Phrases
import io
import pickle



In [3]:
# download stopwords and lemmatizer from nltk package
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to C:\Users\Sebastian
[nltk_data]     Birk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Sebastian
[nltk_data]     Birk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# log events
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Load and Inspect Dataset

In [5]:
# read data with timestamp as index
tweets = pd.read_csv("tweets.csv", encoding="latin1", parse_dates=True, 
                     index_col="created", usecols=range(1,28))

In [6]:
# inspect dataframe
tweets.head()

Unnamed: 0_level_0,text,favoriteCount,replyToSN,truncated,replyToSID,replyToUID,statusSource,retweetCount,longitude,latitude,...,retweetCountOutlier,tweetcount,movement,language3,dayofweek,weeknumber,month,idBarrio_xy,idBarrio,user
created,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-11-28 22:44:07,I'm at El Raval in Barcelona https://t.co/bSGA...,0,,False,,,"<a href=""http://foursquare.com"" rel=""nofollow""...",0,2.168964,41.380936,...,0,1,1.0,OTHER,Tuesday,48,November,1,55,u03883
2017-11-22 19:48:53,<ed><U+00A0><U+00BC><ed><U+00B6><U+0098> @ O't...,0,,False,,,"<a href=""http://instagram.com"" rel=""nofollow"">...",0,2.16818,41.381031,...,0,2,1.0,OTHER,Wednesday,47,November,1,55,u02046
2017-11-21 21:58:48,Aquesta setmana publiquem una nova escapada al...,1,,False,,,"<a href=""http://instagram.com"" rel=""nofollow"">...",1,2.168721,41.380217,...,0,1,1.0,CATALAN,Tuesday,47,November,1,55,u03884
2017-11-20 11:15:10,I'm at El Raval in Barcelona https://t.co/xz2A...,0,,False,,,"<a href=""http://foursquare.com"" rel=""nofollow""...",0,2.168964,41.380936,...,0,4,1.0,OTHER,Monday,47,November,1,55,u00881
2017-11-20 10:08:51,Hablan catalán y es importante destacar que el...,0,,False,,,"<a href=""http://instagram.com"" rel=""nofollow"">...",0,2.16818,41.381031,...,0,2,1.0,SPANISH,Monday,47,November,1,55,u02047


In [37]:
sorted_tweets

Unnamed: 0_level_0,text,text_clean,favoriteCount,replyToSN,truncated,replyToSID,replyToUID,statusSource,retweetCount,longitude,latitude,id_seccion,horaPeticion,id_distrito,grupoHora,id_seccion_xy,favoriteCountOutlier,retweetCountOutlier,tweetcount,movement,language3,dayofweek,weeknumber,month,idBarrio_xy,idBarrio,user,hashtags
created,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
2017-06-11 13:46:35,<ed><U+00A0><U+00BD><ed><U+00B2><U+0096> of my life @sampha <ed><U+00A0><U+00BD><ed><U+00B2><U+0096> seeing him live was a religious experience #korasings @ Primavera Sound https://t.co/JAyNchzx5t,seeing him live was a religious experience #korasings Primavera Sound,0,,False,,,"<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>",0,2.222070,41.410070,801910081,20170621_004813,801910,12-14,801910081,0,0,1,1.000000,ENGLISH,Sunday,23,6,70,70,u10841,[#korasings]
2017-06-11 14:33:40,I was moved &amp; inspired<ed><U+00A0><U+00BD><ed><U+00B9><U+008F>. My life will never be the same #reverence #holiness #belonghere https://t.co/ausgTyWbpv,I was moved inspired My life will never be the same #reverence #holiness #belonghere,0,,False,,,"<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>",0,2.174778,41.403840,801902049,20170621_004813,801902,14-16,801902049,0,0,1,1.000000,ENGLISH,Sunday,23,6,6,6,u10854,"[#reverence, #holiness, #belonghere]"
2017-06-11 15:11:03,"Lots of walks, some hikes, sightseeing, Sangria, fresh fish and lots of sun time over. Sad to be https://t.co/1q4dOwWEwD",Lots of walks some hikes sightseeing Sangria fresh fish and lots of sun time over Sad to be,0,,False,,,"<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>",0,2.131180,41.405820,801905031,20170621_004813,801905,14-16,801905031,0,0,1,1.000000,ENGLISH,Sunday,23,6,25,25,u10845,[]
2017-06-11 15:55:42,Another for the #Pokedex @ PokemonGo Barcelona https://t.co/IREN21n2x3,Another for the #Pokedex PokemonGo Barcelona,0,,False,,,"<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>",0,2.161560,41.398450,801902059,20170621_004813,801902,14-16,801906057,0,0,1,1.000000,ENGLISH,Sunday,23,6,31,7,u10859,[#Pokedex]
2017-06-11 16:25:17,I like the shadow and light in this #iphone pic #barcelona #gaudi #travelpics #nofilter @ Gaudi https://t.co/W2MUGe6Ce5,I like the shadow and light in this #iphone pic #barcelona #gaudi #travelpics #nofilter Gaudi,0,,False,,,"<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>",0,2.152162,41.413550,801906018,20170621_004813,801906,16-18,801906018,0,0,1,1.000000,ENGLISH,Sunday,23,6,30,30,u10837,"[#iphone, #barcelona, #gaudi, #travelpics, #nofilter]"
2017-06-11 16:30:29,Hexagonal mosaic designs #Gaudí #sagradafamilia #touristvibes https://t.co/Hf27O7oRoy,Hexagonal mosaic designs #Gaudí #sagradafamilia #touristvibes,0,,False,,,"<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>",0,2.174778,41.403840,801902049,20170621_004813,801902,16-18,801902049,0,0,2,1.000000,ENGLISH,Sunday,23,6,6,6,u03855,"[#Gaudí, #sagradafamilia, #touristvibes]"
2017-06-11 18:07:45,Home is where the heart is\n#Barcelona #barcelonaespoderosabarcelonatienepoder #MontjuicAlFondo https://t.co/E8XHhgihmj,Home is where the heart is#Barcelona #barcelonaespoderosabarcelonatienepoder #MontjuicAlFondo,0,,False,,,"<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>",0,2.150212,41.399240,801905070,20170621_004813,801905,18-20,801905070,0,0,1,1.000000,ENGLISH,Sunday,23,6,26,26,u10858,"[#Barcelona, #barcelonaespoderosabarcelonatienepoder, #MontjuicAlFondo]"
2017-06-11 20:55:20,Passage obligé <ed><U+00A0><U+00BD><ed><U+00B1><U+0080><ed><U+00A0><U+00BE><ed><U+00B4><U+0093> #catalan #Architecture #Gaudi #workinprogress #7yearslater #lookup #column https://t.co/Xcy0GgnkjE,Passage obligé #catalan #Architecture #Gaudi #workinprogress # yearslater #lookup #column,0,,False,,,"<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>",0,2.174778,41.403840,801902049,20170621_004813,801902,20-22,801902049,0,0,1,1.000000,ENGLISH,Sunday,23,6,6,6,u10848,"[#catalan, #Architecture, #Gaudi, #workinprogress, #, #lookup, #column]"
2017-06-11 22:11:18,Just posted a photo @ Sant Martí https://t.co/E5e63U1lKB,Just posted a photo Sant Martí,0,,False,,,"<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>",0,2.198681,41.417860,801910120,20170621_004813,801910,22-24,801910119,0,0,1,1.000000,ENGLISH,Sunday,24,6,72,72,u10836,[]
2017-06-12 00:29:22,"Primavera Sound ""Created in Barcelona""\n#music #live #festival #primaverasound https://t.co/nU1BGse9iO",Primavera Sound Created in Barcelona#music #live #festival #primaverasound,1,,False,,,"<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>",0,2.222070,41.410070,801910081,20170621_004813,801910,00-02,801910081,0,0,1,1.000000,ENGLISH,Monday,24,6,70,70,u10838,"[#music, #live, #festival, #primaverasound]"


In [7]:
# display dataframe info
tweets.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 23778 entries, 2017-11-28 22:44:07 to 2017-06-11 15:55:42
Data columns (total 26 columns):
text                    23778 non-null object
favoriteCount           23778 non-null int64
replyToSN               821 non-null object
truncated               23778 non-null bool
replyToSID              574 non-null float64
replyToUID              821 non-null float64
statusSource            23778 non-null object
retweetCount            23778 non-null int64
longitude               23778 non-null float64
latitude                23778 non-null float64
id_seccion              23778 non-null int64
horaPeticion            23778 non-null object
id_distrito             23778 non-null int64
grupoHora               23778 non-null object
id_seccion_xy           23778 non-null int64
favoriteCountOutlier    23778 non-null int64
retweetCountOutlier     23778 non-null int64
tweetcount              23778 non-null int64
movement                23778 non-null f

In [8]:
# describe dataframe 
tweets.describe()

Unnamed: 0,favoriteCount,replyToSID,replyToUID,retweetCount,longitude,latitude,id_seccion,id_distrito,id_seccion_xy,favoriteCountOutlier,retweetCountOutlier,tweetcount,movement,weeknumber,idBarrio_xy,idBarrio
count,23778.0,574.0,821.0,23778.0,23778.0,23778.0,23778.0,23778.0,23778.0,23778.0,23778.0,23778.0,23778.0,23778.0,23778.0,23778.0
mean,1.200774,9.015351e+17,4.078693e+16,0.285222,2.171064,41.395325,801904000.0,801903.985953,799947600.0,0.000673,0.000126,17.686349,0.690218,34.522962,18.136008,21.180503
std,20.403217,5.293076e+16,1.808261e+17,4.610914,0.02119,0.014936,3028.198,3.023254,39557360.0,0.025932,0.011232,62.044925,0.326073,7.046783,21.415965,22.107374
min,0.0,1.229677e+17,780290.0,0.0,2.059243,41.33258,801901000.0,801901.0,0.0,0.0,0.0,1.0,0.1,23.0,0.0,1.0
25%,0.0,8.862349e+17,119547900.0,0.0,2.15972,41.38278,801902000.0,801902.0,801902000.0,0.0,0.0,1.0,0.384615,28.0,6.0,6.0
50%,0.0,9.079774e+17,353792800.0,0.0,2.174778,41.39525,801902100.0,801902.0,801902100.0,0.0,0.0,3.0,0.75,34.0,7.0,9.0
75%,1.0,9.232741e+17,1028215000.0,0.0,2.176944,41.40408,801906000.0,801906.0,801905000.0,0.0,0.0,8.0,1.0,40.0,25.0,31.0
max,2449.0,9.354775e+17,9.290872e+17,567.0,2.22662,41.46559,801910200.0,801910.0,801910200.0,1.0,1.0,440.0,1.0,49.0,73.0,73.0


In [9]:
# divide dataset according to language: extract english language
english_tweets = tweets[tweets["language3"] == "ENGLISH"].copy()

## Data Cleaning and Preparation

### Preprocess Twitter Text

In [10]:
# display text
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', -1)

english_tweets["text"]

created
2017-11-18 19:12:28    Brew Pub to try a few of the 30 beers on offer <ed><U+00A0><U+00BD><ed><U+00B8><U+0080> #olgodbarcelona #ølgod #triathlontraining
 https://t.co/X3TK9OnJjs                                                                                                                                                                                                                                         
2017-11-27 10:46:20    "Art is coming face to face with yourself" <ed><U+00A0><U+00BD><ed><U+00B9><U+0083><ed><U+00A0><U+00BD><ed><U+00B9><U+0082> @ Parc de les Tres Xemeneies https://t.co/FedFTbDc6E                                                                                                                                                                                                                   
2017-11-25 20:44:52    Today's quickie @ Parc de les Tres Xemeneies https://t.co/ASKSgYfks8                                                                               

In [11]:
# remove links
english_tweets["text_clean"] = english_tweets["text"].str.replace(r"http\S+", "")
# remove emoticons
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"<.*>", "")
# remove punctuation, special characters etc.
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"&amp", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"\.", " ")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"\,", " ")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"\;", " ")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"\-", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"\"", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace("\\\\", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace("\/", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace("\*", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"@", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"\n", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"\|", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"W//", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"!", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"~", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r")", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"(", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"?", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r":", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"\+", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"\{", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"\}", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"_", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"h:[0-9]+m:[0-9]+s", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"[0-9]+", " ")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"w/", " ")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"[\x97]+", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"[\x96]+", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"[\x95]+", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"[\x94]+", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"[\x93]+", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"[\x92]+", "'")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"[\x91]+", "")
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r"[\x85]+", "")

# reduce white spaces to 1
english_tweets["text_clean"] = english_tweets["text_clean"].str.replace(r" +", " ")

In [12]:
# display cleaned text
english_tweets["text_clean"]

created
2017-11-18 19:12:28    Brew Pub to try a few of the beers on offer #olgodbarcelona #ølgod #triathlontraining                             
2017-11-27 10:46:20    Art is coming face to face with yourself Parc de les Tres Xemeneies                                               
2017-11-25 20:44:52    Today's quickie Parc de les Tres Xemeneies                                                                        
2017-11-23 10:47:18    All hail the freshest seafood in town Let's feast #hadthemostmusselsinmylife #tripoftheyear                       
2017-11-19 22:30:26    Dance to the beat of your own drum #urban #music #live #stage #graffiti #streetart                                
2017-11-19 10:37:25    I just finished walking km in h m s with #Endomondo #endorphins                                                   
2017-11-26 11:24:48    #blackfriday shopping #friends Then #bday party for Mallan great food margaritas and more                         
2017-11-25 18:52:42    Ano

In [13]:
# reorder columns
english_tweets.columns
cols = ['text', 'text_clean', 'favoriteCount', 'replyToSN', 'truncated', 'replyToSID',
       'replyToUID', 'statusSource', 'retweetCount', 'longitude', 'latitude',
       'id_seccion', 'horaPeticion', 'id_distrito', 'grupoHora',
       'id_seccion_xy', 'favoriteCountOutlier', 'retweetCountOutlier',
       'tweetcount', 'movement', 'language3', 'dayofweek', 'weeknumber',
       'month', 'idBarrio_xy', 'idBarrio', 'user']

english_tweets = english_tweets[cols]

#### In order to train the topic models, 3 different pooling methods for the creation of documents are used: No Pooling (1), User Pooling (2) and Hashtag Pooling (3).

### Training Documents Option 1 (No Pooling)

In [14]:
# treat every tweet as a different document (no pooling)
documents = english_tweets["text_clean"].tolist()

In [116]:
len(documents_hashtag_pooling)

6040

In [117]:
total = []
for item in documents_hashtag_pooling:
    total.append(len(item.split()))

In [118]:
sum = 0
for number in total:
    sum += number

In [119]:
sum

150793

In [120]:
sum/len(documents_hashtag_pooling)

24.96572847682119

### Training Documents Option 2 (User Pooling)

In [15]:
# treat all tweets by one user as one single document (user pooling)
user_combined = english_tweets[["text_clean","user"]].groupby("user")["text_clean"].apply(lambda x: "".join(x))
documents_user_pooling = user_combined.tolist()

### Training Documents Option 3 (Hashtag Pooling)

In [16]:
# treat all tweets with the same hashtag as one single document (hashtag pooling)

# find all hashtags
english_tweets["hashtags"] = english_tweets["text_clean"].str.findall(r'#.*?(?=\s|$)')

# separate hashtags in columns
hashtags_tweets = pd.DataFrame(english_tweets["hashtags"].tolist(),
                               columns=["hashtag1", "hashtag2", "hashtag3", "hashtag4",
                                        "hashtag5", "hashtag6", "hashtag7", "hashtag8",
                                        "hashtag9", "hashtag10", "hashtag11", "hashtag12",
                                        "hashtag13"])

# join hashtags with tweet text
hashtags_tweets.index = english_tweets.index
hashtags_tweets = english_tweets.join(hashtags_tweets)

In [17]:
# create one dataframe with text for each hashtag column and save them in a dictionary
dict = {}
for index, item in enumerate(["hash1", "hash2", "hash3", "hash4", "hash5",
                              "hash6", "hash7", "hash8", "hash9", "hash10",
                              "hash11", "hash12", "hash13"]):
    dict[item] = hashtags_tweets[["hashtag" + str(index + 1), "text_clean"]].copy()
    dict[item].columns = ["hashtag", "text"]
    dict[item].dropna(inplace=True)

# concatenate all dataframes to one dataframe (the result is a dataframe
# where there is text for each hashtag found)
hashtags = pd.DataFrame()
for item in dict:
    hashtags = pd.concat([hashtags, dict[item]])

# combine text for each hashtag
hashtags_combined = hashtags.groupby("hashtag")["text"].apply(lambda x: "".join(x))

In [18]:
# remove some generic hashtags that cover a lot of different topics
hashtags_combined.drop(["#Barcelona", "#Catalunya", "#Spain", "#BCN", "#BARCELONA",
                        "#Espana", "#BarcelonaSpain"], inplace=True)

In [19]:
# create documents
documents_hashtag_pooling = hashtags_combined.tolist()

### Prepare Test Documents

#### The trained topic models will then be used to determine the topics of test documents.

#### The first objective of the research is to analyze the distribution of topics over the districts. For this purpose, district pooling is used to create the documents that will be tested.

In [20]:
# merge all tweets from each district (district pooling) and treat them as one single document respectively
district_combined = english_tweets[["text_clean","id_distrito"]].groupby("id_distrito")["text_clean"].apply(lambda x: "".join(x))
documents_district_pooling = district_combined.tolist()

In [21]:
# check documents
district_combined

id_distrito
801901    #cure Betty Ford's in Barcelona Sunday funday wishing a could skate like a pro #iceskating #ice #winter #fun Winter Where en Black and White Christine The Benefit Concert Nov was a wrap Dreams come true if you work hard for it Mine came  Insider Tips on Tickets Bar by foodieling in Travel with Ling #TicketsBar #Barcelona #brodasbros #artistas #top #recomano anar #cracks #hiphop #tribute #jamesbrown #teatrecondal Plaça Espanya in #Barcelona #Spain #Europe #art #travel #instatravel #instatrip #tour What a night dondiablo #DonDiablo #fcbarcelona #opium #barcelona #spain #hexagon Los Martes Now #languageexchange #intercambiolinguistico #funeventsbcn #meetingpeople preview #kettal #luxury #outdoor #lifestyle #barcelona #made #dining #chair #sofa #chaise Croissant crisis in France #watercolor #winsorandnewton Art i Col lec Claudia Vives Fierro  La check list para gestionar #SocialMedia ¿añadiríais alguna tarea #RedesSociales Spanish trails #photography #barcelona #night

#### The second objective is to look at the dynamic topic development over time. For this purpose, the dataset is divided according to time and documents are created on this basis.

#### Divide dataframe according to month

In [24]:
# sort index
sorted_tweets = english_tweets.sort_index()

In [25]:
# check first and last date
print(sorted_tweets.index[0]) # June 2017
print(sorted_tweets.index[-1]) # December 2017 (very incomplete)

2017-06-11 13:46:35
2017-12-04 21:20:25


In [26]:
# create column that contains the month of the tweets
sorted_tweets['month'] = sorted_tweets.index.month

In [27]:
# # ignore this part! (not needed anymore!)

# # split dataframe according to month
june = sorted_tweets.loc['2017-06-01':'2017-06-30']
july = sorted_tweets.loc['2017-07-01':'2017-07-31']
august = sorted_tweets.loc['2017-08-01':'2017-08-31']
september = sorted_tweets.loc['2017-09-01':'2017-09-30']
october = sorted_tweets.loc['2017-10-01':'2017-10-31']
november = sorted_tweets.loc['2017-11-01':'2017-11-30']
december = sorted_tweets.loc['2017-12-01':'2017-12-31']

In [46]:
len(december["user"].unique())

29

In [47]:
# merge all tweets from each month and treat them as one document respectively
months_combined = english_tweets[["text_clean","month"]].groupby("month")["text_clean"].apply(lambda x: "".join(x))
documents_month_pooling = months_combined.tolist()

In [62]:
months_combined = english_tweets[["text_clean","month"]]

In [78]:
months_combined["month"].unique() # Nov, Dec, Oct, Sep, Aug, July, June

array(['November', 'December', 'October', 'September', 'August', 'July',
       'June'], dtype=object)

In [84]:
documents_month_pooling

["Tot be Parc del Laberint d'Horta I was at Park Güell in Barcelona Just posted a photo Hotel Barcelona Princess What do we think is happening here #barcelona #spain #signs Fòrum Under the solar array #barcelona #redhookcrit Fòrum  en El ClotAragó railway station EdieRodriguez There is no reason to discard any of the pleasures of life EdieRodriguez Cheers from Barcelona Seguimos de formación davines en Davines Exhibition Center #persianasbcn #persianasartisticas #artepersiana #barriodegraciabcn en Rabipelao Gracia Le Whenever my husband and I travel we love to sit outside in a local spot drink beers and play tbh no amount of photos could ever do this place justice absolutely incredible Basílica de la #photography #spain #barcelona #sagradafamilia #lasagradafamilia #colorful #contrast #family In the Sagrada Familia everything is providential Antoni Gaudi #barcelona #holidays Barcelona has been beautiful colourful and warm It was just amazing to spend the last four Just be careful en Sag

In [27]:
# check how many documents
len(documents_month_pooling) # should be 7

7

In [67]:
# merge all tweets from one district and one month and treat them as one document respectively
districts_per_month_combined = sorted_tweets[["text_clean","month","id_distrito"]].groupby(["month","id_distrito"])["text_clean"].apply(lambda x: "".join(x))
documents_district_per_month_pooling = districts_per_month_combined.tolist()

In [70]:
june[june["id_distrito"] == 801901]

Unnamed: 0_level_0,text,text_clean,favoriteCount,replyToSN,truncated,replyToSID,replyToUID,statusSource,retweetCount,longitude,latitude,id_seccion,horaPeticion,id_distrito,grupoHora,id_seccion_xy,favoriteCountOutlier,retweetCountOutlier,tweetcount,movement,language3,dayofweek,weeknumber,month,idBarrio_xy,idBarrio,user,hashtags
created,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
2017-06-12 00:57:35,"Boy, don't hurt your brain. @ Gothic Quarter, Barcelona https://t.co/VYsUlu2N7D",Boy don't hurt your brain Gothic Quarter Barcelona,0,,False,,,"<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>",0,2.176944,41.38278,801901025,20170621_105358,801901,00-02,801901025,0,0,2,1.0,ENGLISH,Monday,24,6,2,2,u03881,[]
2017-06-12 01:08:58,Hearbeat <ed><U+00A0><U+00BD><ed><U+00B8><U+008D><ed><U+00A0><U+00BD><ed><U+00B2><U+0096><ed><U+00A0><U+00BD><ed><U+00B9><U+008C><ed><U+00A0><U+00BC><ed><U+00BF><U+00BD><ed><U+00A0><U+00BD><ed><U+00B9><U+0088> #boyfriend #bf #bcn #love #spain #partners #europe #catala #adeu #gordito #cosita https://t.co/6m0J46pFWC,Hearbeat #boyfriend #bf #bcn #love #spain #partners #europe #catala #adeu #gordito #cosita,0,,False,,,"<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>",0,2.18814,41.37993,801901040,20170621_105358,801901,00-02,801901040,0,0,6,0.5,ENGLISH,Monday,24,6,3,3,u00625,"[#boyfriend, #bf, #bcn, #love, #spain, #partners, #europe, #catala, #adeu, #gordito, #cosita]"
2017-06-12 10:20:39,sensibistro #Barcelona #tteeturns30 #sydorkosinspain17 #pinxtosforpip One if the best places we https://t.co/TkZABwJNvN,sensibistro #Barcelona #tteeturns #sydorkosinspain #pinxtosforpip One if the best places we,0,,False,,,"<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>",0,2.176944,41.38278,801901025,20170621_105358,801901,10-12,801901025,0,0,2,1.0,ENGLISH,Monday,24,6,2,2,u03880,"[#Barcelona, #tteeturns, #sydorkosinspain, #pinxtosforpip]"
2017-06-12 10:34:37,Our last night in #Barcelona having dinner at fondaespanya #sydorkosinspain17 #pinxtosforpip https://t.co/dFUEnAedCp,Our last night in #Barcelona having dinner at fondaespanya #sydorkosinspain #pinxtosforpip,0,,False,,,"<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>",0,2.17282,41.37996,801901006,20170621_105358,801901,10-12,801901005,0,0,2,1.0,ENGLISH,Monday,24,6,1,1,u03880,"[#Barcelona, #sydorkosinspain, #pinxtosforpip]"
2017-06-12 10:47:05,feel like engine <ed><U+00A0><U+00BD><ed><U+00B4><U+00A5>\n\nengine // pentagon @ Moll de La Fusta https://t.co/mvLRwQ12dh,feel like engine engine pentagon Moll de La Fusta,0,,False,,,"<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>",0,2.180048,41.37765,801901030,20170621_105358,801901,10-12,801901030,0,0,1,1.0,ENGLISH,Monday,24,6,2,2,u10833,[]
2017-06-12 11:07:59,"Nights in Barcelona. #Sunkissed <ed><U+00A0><U+00BD><ed><U+00B2><U+008B><U+2600><U+FE0F><ed><U+00A0><U+00BD><ed><U+00B1><U+0091> @ Gothic Quarter, Barcelona https://t.co/6Pz6T59Epv",Nights in Barcelona #Sunkissed Gothic Quarter Barcelona,0,,False,,,"<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>",0,2.176944,41.38278,801901025,20170621_105358,801901,10-12,801901025,0,0,1,1.0,ENGLISH,Monday,24,6,2,2,u10827,[#Sunkissed]
2017-06-12 11:38:52,Always trust the experienced ladies ! They know where it's at for drinks !\n#holidays #apero https://t.co/zuwnFOYU9T,Always trust the experienced ladies They know where it's at for drinks #holidays #apero,0,,False,,,"<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>",0,2.176944,41.38278,801901025,20170621_105358,801901,10-12,801901025,0,0,2,0.5,ENGLISH,Monday,24,6,2,2,u03819,"[#holidays, #apero]"
2017-06-12 12:06:57,Lunchtime in the sun #goodafternoon #barcelona #craftbeer @blacklabbcn @ Playa De La Barceloneta https://t.co/SW1ePrmKw6,Lunchtime in the sun #goodafternoon #barcelona #craftbeer blacklabbcn Playa De La Barceloneta,0,,False,,,"<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>",0,2.18814,41.37993,801901040,20170621_105358,801901,12-14,801901040,0,0,1,1.0,ENGLISH,Monday,24,6,3,3,u10830,"[#goodafternoon, #barcelona, #craftbeer]"
2017-06-12 12:19:37,"Street photographer @ Gothic Quarter, Barcelona https://t.co/l3Q9y3OvAZ",Street photographer Gothic Quarter Barcelona,0,,False,,,"<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>",0,2.176944,41.38278,801901025,20170621_105358,801901,12-14,801901025,0,0,3,0.666667,ENGLISH,Monday,24,6,2,2,u01451,[]
2017-06-12 13:12:52,"Just posted a photo @ C/mònec, 14, Barcelona, Ramon Cuberta Atelier https://t.co/lhyKMFigG3",Just posted a photo Cmònec Barcelona Ramon Cuberta Atelier,0,,False,,,"<a href=""http://instagram.com"" rel=""nofollow"">Instagram</a>",0,2.177715,41.38863,801901052,20170621_220311,801901,12-14,801901051,0,0,16,0.1875,ENGLISH,Monday,24,6,4,4,u00104,[]


In [68]:
districts_per_month_combined[0]

"Boy don't hurt your brain Gothic Quarter Barcelona Hearbeat #boyfriend #bf #bcn #love #spain #partners #europe #catala #adeu #gordito #cosita sensibistro #Barcelona #tteeturns #sydorkosinspain #pinxtosforpip One if the best places we Our last night in #Barcelona having dinner at fondaespanya #sydorkosinspain #pinxtosforpip feel like engine engine pentagon Moll de La Fusta Nights in Barcelona #Sunkissed Gothic Quarter Barcelona Always trust the experienced ladies They know where it's at for drinks #holidays #apero Lunchtime in the sun #goodafternoon #barcelona #craftbeer blacklabbcn Playa De La Barceloneta Street photographer Gothic Quarter Barcelona Just posted a photo Cmònec Barcelona Ramon Cuberta Atelier So exited about my premiere ShokoBCN catch me there Thursday June #shoko #shokobarcelona LAST SUMMER #me #myself #mood #moodday #thisplace #missme #spain #espagne #barcelone The Sands of Time Gothic Quarter Barcelona Just posted a photo Gothic Quarter Barcelona Beer Heaven BCN #bcn

In [29]:
# check how many documents
len(documents_district_per_month_pooling)

62

In [30]:
# inspect dataframe
districts_per_month_combined

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


### Preprocess Documents for NMF Topic Modeling Method

In [31]:
# create copy of no pooling documents
nmf_documents = list(documents)

In [32]:
# transform to lower case
for doc_idx, doc in enumerate(nmf_documents):
    nmf_documents[doc_idx] = nmf_documents[doc_idx].lower()

In [33]:
# delete stopwords
for doc_idx, doc in enumerate(nmf_documents):
    nmf_documents[doc_idx] = doc.replace(" year ", " ").replace(" to ", " ").replace(" on ", " ").replace(" wa ", " ").replace(" #yourup ", " ").replace(" de ", " ").replace(" just ", " ").replace(" posted ", " ").replace(" photo ", " ").replace(" la ", " ").replace(" del ", " ").replace(" en ", " ").replace(" los ", " ").replace(" el ", " ").replace(" las ", " ").replace(" barcelona ", " ").replace(" #bcn ", " ").replace(" just ", " ").replace(" cada ", " ").replace(" nuestra ", " ").replace(" around ", " ").replace(" spanish ", " ").replace(" día ", " ").replace(" dia ", " ").replace(" #photo ", " ").replace(" first ", " ").replace(" thing ", " ").replace(" last ", " ").replace(" #spain ", " ").replace(" carrer ", " ").replace(" make ", " ").replace(" &lt ", " ").replace(" &gt ", " ").replace(" de ", " ").replace(" for ", " ").replace(" a ", " ").replace(" of ", " ").replace(" the ", " ").replace(" and ", " ").replace(" to ", " ").replace(" in ", " ").replace(" at ", " ").replace(" by ", " ").replace(" one ", " ").replace(" day ", " ").replace(" get ", " ").replace(" españa ", " ").replace(" #españa ", " ").replace(" #repost ", " ").replace(" since ", " ").replace(" still ", " ").replace(" never ", " ").replace(" thank ", " ").replace(" two ", " ").replace(" think ", " ").replace(" could ", " ").replace(" many ", " ").replace(" even ", " ").replace(" the ", " ").replace(" igers ", " ").replace(" que ", " ").replace(" many ", " ").replace(" con ", " ").replace(" un ", " ").replace(" wa ", " ").replace(" bcn ", " ").replace(" d'horta ", " ").replace(" ever ", " ").replace(" come ", " ").replace(" #ig ", " ").replace(" el ", " ").replace(" i'm ", " ").replace(" i've ", " ").replace(" always ", " ").replace(" le ", " ").replace(" what's ", " ").replace(" #barcelone ", " ").replace(" like ", " ").replace(" last ", " ").replace(" back ", " ").replace(" thanks ", " ").replace(" #barna ", " ").replace(" spain ", " ").replace(" yo ", " ").replace(" #yo ", " ").replace(" el ", " ").replace(" #el ", " ").replace(" barcelona ", " ").replace(" #barcelona ", " ")    

In [34]:
# display preprocessed documents
nmf_documents

['brew pub try few beers offer #olgodbarcelona #ølgod #triathlontraining ',
 'art is coming face face with yourself parc les tres xemeneies ',
 "today's quickie parc les tres xemeneies ",
 "all hail freshest seafood town let's feast #hadthemostmusselsinmylife #tripoftheyear ",
 'dance beat your own drum #urban #music #live #stage #graffiti #streetart ',
 'i finished walking km h m s with #endomondo #endorphins ',
 '#blackfriday shopping #friends then #bday party mallan great food margaritas more ',
 'another why not pastisseria ',
 'yesterday i got new hairstyle with best hair artist city i cut mychell ',
 'good night tienes don jesusisnard #ootd margot house ',
 'stay warm pic x jesusisnard #sunday margot house ',
 'there is nothing better than waking up this saturdays margothousebarcelona wearing ',
 'drinking pumpkin catalanbrewery cocovailbh ',
 'drinking leyenda cervezadougalls cocovailbh ',
 'barcelonas night clinicasden #event #night #night #clinicasden #sonrisas ',
 'brama cros

### Save Training and Test Documents

In [35]:
with io.open('documents.txt', 'w', encoding='utf-8') as f:
    for item in documents:
        f.write(item + "\n")
with io.open('documents_user_pooling.txt', 'w', encoding='utf-8') as f:
    for item in documents_user_pooling:
        f.write(item + "\n")
with io.open('documents_hashtag_pooling.txt', 'w', encoding='utf-8') as f:
    for item in documents_hashtag_pooling:
        f.write(item + "\n")
with io.open('documents_district_pooling.txt', 'w', encoding='utf-8') as f:
    for item in documents_district_pooling:
        f.write(item + "\n")
with io.open('documents_month_pooling.txt', 'w', encoding='utf-8') as f:
    for item in documents_month_pooling:
        f.write(item + "\n")
with io.open('documents_district_per_month_pooling.txt', 'w', encoding='utf-8') as f:
    for item in documents_district_per_month_pooling:
        f.write(item + "\n")
        
with open('documents_no_pooling.p', 'wb') as fp:
    pickle.dump(documents, fp)
    
with open('documents_user_pooling.p', 'wb') as fp:
    pickle.dump(documents_user_pooling, fp)
    
with open('documents_hashtag_pooling.p', 'wb') as fp:
    pickle.dump(documents_hashtag_pooling, fp)
    
with open('nmf_documents_no_pooling.p', 'wb') as fp:
    pickle.dump(nmf_documents, fp)

In [36]:
nmf_documents

['brew pub try few beers offer #olgodbarcelona #ølgod #triathlontraining ',
 'art is coming face face with yourself parc les tres xemeneies ',
 "today's quickie parc les tres xemeneies ",
 "all hail freshest seafood town let's feast #hadthemostmusselsinmylife #tripoftheyear ",
 'dance beat your own drum #urban #music #live #stage #graffiti #streetart ',
 'i finished walking km h m s with #endomondo #endorphins ',
 '#blackfriday shopping #friends then #bday party mallan great food margaritas more ',
 'another why not pastisseria ',
 'yesterday i got new hairstyle with best hair artist city i cut mychell ',
 'good night tienes don jesusisnard #ootd margot house ',
 'stay warm pic x jesusisnard #sunday margot house ',
 'there is nothing better than waking up this saturdays margothousebarcelona wearing ',
 'drinking pumpkin catalanbrewery cocovailbh ',
 'drinking leyenda cervezadougalls cocovailbh ',
 'barcelonas night clinicasden #event #night #night #clinicasden #sonrisas ',
 'brama cros

### Tokenize Training Documents

In [37]:
# we can simply tokenize by space thanks to the previous preprocessing
texts_no_pooling = [[word for word in document.lower().split()]
          for document in documents]

texts_user_pooling = [[word for word in document.lower().split()]
          for document in documents_user_pooling]

texts_hashtag_pooling = [[word for word in document.lower().split()]
          for document in documents_hashtag_pooling]

### Save Unpreprocessed Tokenized Training Documents

In [38]:
with open('tokenized_documents_no_pooling_unpp.p', 'wb') as fp:
    pickle.dump(texts_no_pooling, fp)
    
with open('tokenized_documents_user_pooling_unpp.p', 'wb') as fp:
    pickle.dump(texts_user_pooling, fp)
    
with open('tokenized_documents_hashtag_pooling_unpp.p', 'wb') as fp:
    pickle.dump(texts_hashtag_pooling, fp)

### Further Preprocessing of Training Documents after Tokenization

In [39]:
# remove numbers, but not words that contain numbers.
texts_no_pooling = [[token for token in doc if not token.isnumeric()] for doc in texts_no_pooling]
texts_user_pooling = [[token for token in doc if not token.isnumeric()] for doc in texts_user_pooling]
texts_hashtag_pooling = [[token for token in doc if not token.isnumeric()] for doc in texts_hashtag_pooling]

In [40]:
# remove words that are only one character.
texts_no_pooling = [[token for token in doc if len(token) > 1] for doc in texts_no_pooling]
texts_user_pooling = [[token for token in doc if len(token) > 1] for doc in texts_user_pooling]
texts_hashtag_pooling = [[token for token in doc if len(token) > 1] for doc in texts_hashtag_pooling]

In [41]:
# lemmatize all words in all documents.
lemmatizer = WordNetLemmatizer()
texts_no_pooling = [[lemmatizer.lemmatize(token) for token in doc] for doc in texts_no_pooling]
texts_user_pooling = [[lemmatizer.lemmatize(token) for token in doc] for doc in texts_user_pooling]
texts_hashtag_pooling = [[lemmatizer.lemmatize(token) for token in doc] for doc in texts_hashtag_pooling]

In [42]:
# # ignore this part! computing bigrams did not improve models but made them worse!

# # compute bigrams
# # add bigrams and trigrams to docs (only ones that appear 5 times or more)
# bigram = Phrases(texts_no_pooling, min_count=10)
# for idx in range(len(texts_no_pooling)):
#     for token in bigram[texts_no_pooling[idx]]:
#         if '_' in token:
#             # Token is a bigram, add to document.
#             texts_no_pooling[idx].append(token)
#             
# bigram = Phrases(texts_user_pooling, min_count=10)
# for idx in range(len(texts_user_pooling)):
#     for token in bigram[texts_user_pooling[idx]]:
#         if '_' in token:
#             # Token is a bigram, add to document.
#             texts_user_pooling[idx].append(token)
#             
# bigram = Phrases(texts_hashtag_pooling, min_count=10)
# for idx in range(len(texts_hashtag_pooling)):
#     for token in bigram[texts_hashtag_pooling[idx]]:
#         if '_' in token:
#             # Token is a bigram, add to document.
#             texts_hashtag_pooling[idx].append(token)

## Save Tokenized Training Documents

In [43]:
with open('tokenized_documents_no_pooling.p', 'wb') as fp:
    pickle.dump(texts_no_pooling, fp)
    
with open('tokenized_documents_user_pooling.p', 'wb') as fp:
    pickle.dump(texts_user_pooling, fp)
    
with open('tokenized_documents_hashtag_pooling.p', 'wb') as fp:
    pickle.dump(texts_hashtag_pooling, fp)

## Refine and Vectorize Corpora

In [44]:
# define function to refine and vectorize corpus 
# (remove stopwords, very frequent and very infrequent words etc.)

# define stopwords
stpwords = "for a of the and to in at by one #yo #el day get españa #yourup #españa #repost yo el since still never thank two think could many even ha igers th que con un wa bcn d'horta ever come #ig el i'm i've always le what's #barcelone like last back thanks #barna spain barcelona #barcelona cada nuestra around spanish día dia #photo first thing last #spain carrer make &lt &gt de la del en las barcelona #bcn just posted photo year wa".split()

def nltk_stopwords():
    return set(nltk.corpus.stopwords.words('english'))

def prep_corpus(docs, 
                additional_stopwords=set(stpwords),
                no_below=2, no_above=0.5,
                dictionary_name='tourism.dict', corpus_name='tourism.mm'):
    print('Building dictionary...')
    dictionary = corpora.Dictionary(docs)
    stopwords = nltk_stopwords().union(additional_stopwords)
    stopword_ids = map(dictionary.token2id.get, stopwords)
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    dictionary.compactify()
    dictionary.save(dictionary_name)  # store the dictionary, for future reference
    
    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    corpora.MmCorpus.serialize(corpus_name, corpus)  # store to disk, for later use
    
    return (corpus, dictionary)

In [45]:
# run function to vectorize corpora
corpus_no_pooling = prep_corpus(texts_no_pooling,
                                dictionary_name="tourism_no_pooling.dict",
                                corpus_name="tourism_no_pooling.mm")[0]
dictionary_no_pooling = prep_corpus(texts_no_pooling,
                                    dictionary_name="tourism_no_pooling.dict",
                                    corpus_name="tourism_no_pooling.mm")[1]

corpus_user_pooling = prep_corpus(texts_user_pooling,
                                    dictionary_name="tourism_user_pooling.dict",
                                    corpus_name="tourism_user_pooling.mm")[0]
dictionary_user_pooling = prep_corpus(texts_user_pooling,
                                    dictionary_name="tourism_user_pooling.dict",
                                    corpus_name="tourism_user_pooling.mm")[1]

corpus_hashtag_pooling = prep_corpus(texts_hashtag_pooling,
                                    dictionary_name="tourism_hashtag_pooling.dict",
                                    corpus_name="tourism_hashtag_pooling.mm")[0]
dictionary_hashtag_pooling = prep_corpus(texts_hashtag_pooling,
                                    dictionary_name="tourism_hashtag_pooling.dict",
                                    corpus_name="tourism_hashtag_pooling.mm")[1]

2018-10-02 22:38:00,812 : INFO : adding document #0 to Dictionary(0 unique tokens: [])


Building dictionary...


2018-10-02 22:38:01,024 : INFO : built Dictionary(15443 unique tokens: ['#olgodbarcelona', '#triathlontraining', '#ølgod', 'beer', 'brew']...) from 7633 documents (total 78801 corpus positions)
2018-10-02 22:38:01,076 : INFO : discarding 10451 tokens: [('#olgodbarcelona', 1), ('#triathlontraining', 1), ('quickie', 1), ('#hadthemostmusselsinmylife', 1), ('#tripoftheyear', 1), ('freshest', 1), ('hail', 1), ('mallan', 1), ('mychell', 1), ('tienes', 1)]...
2018-10-02 22:38:01,077 : INFO : keeping 4790 tokens which were in no less than 2 and no more than 3816 (=50.0%) documents
2018-10-02 22:38:01,090 : INFO : resulting dictionary: Dictionary(4790 unique tokens: ['#ølgod', 'beer', 'brew', 'offer', 'pub']...)
2018-10-02 22:38:01,104 : INFO : saving Dictionary object under tourism_no_pooling.dict, separately None
2018-10-02 22:38:01,109 : INFO : saved tourism_no_pooling.dict
2018-10-02 22:38:01,271 : INFO : storing corpus in Matrix Market format to tourism_no_pooling.mm
2018-10-02 22:38:01,27

Building corpus...


2018-10-02 22:38:01,349 : INFO : PROGRESS: saving document #2000
2018-10-02 22:38:01,378 : INFO : PROGRESS: saving document #3000
2018-10-02 22:38:01,396 : INFO : PROGRESS: saving document #4000
2018-10-02 22:38:01,415 : INFO : PROGRESS: saving document #5000
2018-10-02 22:38:01,451 : INFO : PROGRESS: saving document #6000
2018-10-02 22:38:01,475 : INFO : PROGRESS: saving document #7000
2018-10-02 22:38:01,497 : INFO : saved 7633x4790 matrix, density=0.101% (37016/36562070)
2018-10-02 22:38:01,499 : INFO : saving MmCorpus index to tourism_no_pooling.mm.index
2018-10-02 22:38:01,507 : INFO : adding document #0 to Dictionary(0 unique tokens: [])


Building dictionary...


2018-10-02 22:38:01,719 : INFO : built Dictionary(15443 unique tokens: ['#olgodbarcelona', '#triathlontraining', '#ølgod', 'beer', 'brew']...) from 7633 documents (total 78801 corpus positions)
2018-10-02 22:38:01,768 : INFO : discarding 10451 tokens: [('#olgodbarcelona', 1), ('#triathlontraining', 1), ('quickie', 1), ('#hadthemostmusselsinmylife', 1), ('#tripoftheyear', 1), ('freshest', 1), ('hail', 1), ('mallan', 1), ('mychell', 1), ('tienes', 1)]...
2018-10-02 22:38:01,771 : INFO : keeping 4790 tokens which were in no less than 2 and no more than 3816 (=50.0%) documents
2018-10-02 22:38:01,784 : INFO : resulting dictionary: Dictionary(4790 unique tokens: ['#ølgod', 'beer', 'brew', 'offer', 'pub']...)
2018-10-02 22:38:01,795 : INFO : saving Dictionary object under tourism_no_pooling.dict, separately None
2018-10-02 22:38:01,803 : INFO : saved tourism_no_pooling.dict
2018-10-02 22:38:01,927 : INFO : storing corpus in Matrix Market format to tourism_no_pooling.mm
2018-10-02 22:38:01,93

Building corpus...


2018-10-02 22:38:02,086 : INFO : PROGRESS: saving document #6000
2018-10-02 22:38:02,108 : INFO : PROGRESS: saving document #7000
2018-10-02 22:38:02,124 : INFO : saved 7633x4790 matrix, density=0.101% (37016/36562070)
2018-10-02 22:38:02,127 : INFO : saving MmCorpus index to tourism_no_pooling.mm.index
2018-10-02 22:38:02,136 : INFO : adding document #0 to Dictionary(0 unique tokens: [])


Building dictionary...


2018-10-02 22:38:02,395 : INFO : built Dictionary(15469 unique tokens: ['castellano', 'catalangov', 'cataluña', 'consecuencia', 'country']...) from 4424 documents (total 78666 corpus positions)
2018-10-02 22:38:02,459 : INFO : discarding 11278 tokens: [('castellano', 1), ('catalangov', 1), ('consecuencia', 1), ('exclusion', 1), ('instil', 1), ('linguistica', 1), ('mala', 1), ('politica', 1), ('practica', 1), ('shameful', 1)]...
2018-10-02 22:38:02,461 : INFO : keeping 3990 tokens which were in no less than 2 and no more than 2212 (=50.0%) documents
2018-10-02 22:38:02,479 : INFO : resulting dictionary: Dictionary(3990 unique tokens: ['cataluña', 'country', 'e', 'fear', 'police']...)
2018-10-02 22:38:02,488 : INFO : saving Dictionary object under tourism_user_pooling.dict, separately None
2018-10-02 22:38:02,494 : INFO : saved tourism_user_pooling.dict
2018-10-02 22:38:02,581 : INFO : storing corpus in Matrix Market format to tourism_user_pooling.mm
2018-10-02 22:38:02,584 : INFO : savi

Building corpus...
Building dictionary...

2018-10-02 22:38:02,745 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-10-02 22:38:02,914 : INFO : built Dictionary(15469 unique tokens: ['castellano', 'catalangov', 'cataluña', 'consecuencia', 'country']...) from 4424 documents (total 78666 corpus positions)
2018-10-02 22:38:02,964 : INFO : discarding 11278 tokens: [('castellano', 1), ('catalangov', 1), ('consecuencia', 1), ('exclusion', 1), ('instil', 1), ('linguistica', 1), ('mala', 1), ('politica', 1), ('practica', 1), ('shameful', 1)]...
2018-10-02 22:38:02,966 : INFO : keeping 3990 tokens which were in no less than 2 and no more than 2212 (=50.0%) documents
2018-10-02 22:38:02,982 : INFO : resulting dictionary: Dictionary(3990 unique tokens: ['cataluña', 'country', 'e', 'fear', 'police']...)
2018-10-02 22:38:02,989 : INFO : saving Dictionary object under tourism_user_pooling.dict, separately None
2018-10-02 22:38:02,995 : INFO : saved tourism_user_pooling.dict
2018-10-02 22:38:03,091 : INFO : storing corpus i


Building corpus...


2018-10-02 22:38:03,231 : INFO : PROGRESS: saving document #4000
2018-10-02 22:38:03,241 : INFO : saved 4424x3990 matrix, density=0.171% (30119/17651760)
2018-10-02 22:38:03,244 : INFO : saving MmCorpus index to tourism_user_pooling.mm.index
2018-10-02 22:38:03,252 : INFO : adding document #0 to Dictionary(0 unique tokens: [])


Building dictionary...


2018-10-02 22:38:03,511 : INFO : built Dictionary(11318 unique tokens: ['#aj', '#amoramargo', '#aperitivo', '#architecture', '#art']...) from 6040 documents (total 146413 corpus positions)
2018-10-02 22:38:03,559 : INFO : discarding 1797 tokens: [("advisor's", 1), ('armada', 1), ('asistidos', 1), ('barrica', 1), ('bcnshowroom', 1), ('cangilon', 1), ('carot', 1), ('comenzamos', 1), ('cryyou', 1), ('cumpliendo', 1)]...
2018-10-02 22:38:03,561 : INFO : keeping 9330 tokens which were in no less than 2 and no more than 3020 (=50.0%) documents
2018-10-02 22:38:03,574 : INFO : resulting dictionary: Dictionary(9330 unique tokens: ['#aj', '#amoramargo', '#aperitivo', '#architecture', '#art']...)
2018-10-02 22:38:03,580 : INFO : saving Dictionary object under tourism_hashtag_pooling.dict, separately None
2018-10-02 22:38:03,590 : INFO : saved tourism_hashtag_pooling.dict
2018-10-02 22:38:03,788 : INFO : storing corpus in Matrix Market format to tourism_hashtag_pooling.mm
2018-10-02 22:38:03,791 

Building corpus...


2018-10-02 22:38:03,827 : INFO : PROGRESS: saving document #1000
2018-10-02 22:38:03,881 : INFO : PROGRESS: saving document #2000
2018-10-02 22:38:03,929 : INFO : PROGRESS: saving document #3000
2018-10-02 22:38:03,967 : INFO : PROGRESS: saving document #4000
2018-10-02 22:38:04,014 : INFO : PROGRESS: saving document #5000
2018-10-02 22:38:04,066 : INFO : PROGRESS: saving document #6000
2018-10-02 22:38:04,070 : INFO : saved 6040x9330 matrix, density=0.140% (78970/56353200)
2018-10-02 22:38:04,072 : INFO : saving MmCorpus index to tourism_hashtag_pooling.mm.index
2018-10-02 22:38:04,081 : INFO : adding document #0 to Dictionary(0 unique tokens: [])


Building dictionary...


2018-10-02 22:38:04,412 : INFO : built Dictionary(11318 unique tokens: ['#aj', '#amoramargo', '#aperitivo', '#architecture', '#art']...) from 6040 documents (total 146413 corpus positions)
2018-10-02 22:38:04,444 : INFO : discarding 1797 tokens: [("advisor's", 1), ('armada', 1), ('asistidos', 1), ('barrica', 1), ('bcnshowroom', 1), ('cangilon', 1), ('carot', 1), ('comenzamos', 1), ('cryyou', 1), ('cumpliendo', 1)]...
2018-10-02 22:38:04,446 : INFO : keeping 9330 tokens which were in no less than 2 and no more than 3020 (=50.0%) documents
2018-10-02 22:38:04,461 : INFO : resulting dictionary: Dictionary(9330 unique tokens: ['#aj', '#amoramargo', '#aperitivo', '#architecture', '#art']...)
2018-10-02 22:38:04,469 : INFO : saving Dictionary object under tourism_hashtag_pooling.dict, separately None
2018-10-02 22:38:04,478 : INFO : saved tourism_hashtag_pooling.dict
2018-10-02 22:38:04,636 : INFO : storing corpus in Matrix Market format to tourism_hashtag_pooling.mm
2018-10-02 22:38:04,638 

Building corpus...


2018-10-02 22:38:04,774 : INFO : PROGRESS: saving document #3000
2018-10-02 22:38:04,840 : INFO : PROGRESS: saving document #4000
2018-10-02 22:38:04,890 : INFO : PROGRESS: saving document #5000
2018-10-02 22:38:04,936 : INFO : PROGRESS: saving document #6000
2018-10-02 22:38:04,938 : INFO : saved 6040x9330 matrix, density=0.140% (78970/56353200)
2018-10-02 22:38:04,941 : INFO : saving MmCorpus index to tourism_hashtag_pooling.mm.index


## Apply Function to Preprocess Test Documents (Before Testing Them with Topic Models)

#### This function has to include all the same steps that were applied to the training documents!

In [46]:
# define function
def preprocess(docs):
    ''' Conduct all preprocessing steps that are conducted to train the LDA model'''
    
    # tokenize documents
    tokenized = [[word for word in document.lower().split()]
          for document in docs]
    
    # remove words that are only one character
    tokenized = [[token for token in doc if len(token) > 1] for doc in tokenized]
    
    # lemmatize all words
    lemmatizer = WordNetLemmatizer()
    lemmatized = [[lemmatizer.lemmatize(token) for token in doc] for doc in tokenized]
    
    # define stopwords
    stpwords = "for a of the and to in at by one #yo #el day get #yourup españa #españa #repost yo el since still never thank two think could many even ha igers th que con un wa bcn d'horta ever come #ig el i'm i've always le what's #barcelone like last back thanks #barna spain barcelona #barcelona cada nuestra around spanish día dia #photo first thing last #spain carrer make &lt &gt de la del en las barcelona #bcn just posted photo year wa".split()
    
    # get stopwords from nltk
    def nltk_stopwords():
        return set(nltk.corpus.stopwords.words('english'))

    # combine stopwords
    stopwords = nltk_stopwords().union(stpwords)
    
    # remove stopwords
    preprocessed = [[token for token in document if token not in stopwords] for document in lemmatized]
    
    return preprocessed

In [48]:
# apply function to test documents
texts_district_pooling = preprocess(documents_district_pooling)
texts_month_pooling = preprocess(documents_month_pooling)
texts_district_per_month_pooling = preprocess(documents_district_per_month_pooling)

## Save Preprocessed Test Documents

In [49]:
with open('tokenized_documents_district_pooling.p', 'wb') as fp:
    pickle.dump(texts_district_pooling, fp)
    
with open('tokenized_documents_month_pooling.p', 'wb') as fp:
    pickle.dump(texts_month_pooling, fp)
    
with open('tokenized_documents_district_per_month_pooling.p', 'wb') as fp:
    pickle.dump(texts_district_per_month_pooling, fp)

In [50]:
# # ignore this part! just example code!

# # map tokens to ids
# print(dictionary_no_pooling.token2id)
# print(dictionary_user_pooling.token2id)
# print(dictionary_hashtag_pooling.token2id)

In [51]:
# # ignore this part! just example code!

# # convert new document to vector 
# new_doc = "Sagrada Familia is amazing"
# new_vec_no_pooling = dictionary_no_pooling.doc2bow(new_doc.lower().split())
# print(new_vec_no_pooling)

In [52]:
# # ignore this part! not needed for dataset!

# # corpus streaming: one document at a time
# class MyCorpus(object):
#     def __iter__(self):
#         for line in open("corpus_no_pooling.txt"):
#             # assume there's one document per line, tokens separated by whitespace
#             yield dictionary.doc2bow(line.lower().split())
#             
# corpus_memory_friendly = MyCorpus()  # doesn't load the corpus into memory!
# print(corpus_memory_friendly)
# 
# for vector in corpus_memory_friendly:  # load one vector into memory at a time
#     print(vector)