# West Ham Topic Modelling

In this notebook I try topic modelling (LDA via gensim) on the text of West Ham fan-forum posts.

In [None]:
import re
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import stopwords
import nltk

import pandas as pd
import numpy as np

In [None]:
# Download the Punkt tokenizer data (NLTK's word_tokenize depends on it).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\denni\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [None]:
# Download the stop-word corpus that stopwords.words('english') reads.
nltk.download('stopwords')

In [None]:
# Number of latent topics requested from the topic model.
NUM_TOPICS = 10

# NLTK's English stop words plus the extra token 'wrote'
# (presumably noise from "<user> wrote:" quote headers - confirm against the data).
# list.append() returns None, so the extra word is added by concatenation
# instead of assigning the result of append() to a name.
STOPWORDS = stopwords.words('english') + ['wrote']

def clean_text(text):
    """Tokenize one post and keep only the informative word tokens.

    The text is lower-cased and split with NLTK's ``word_tokenize``. A token
    is kept when it is not listed in ``STOPWORDS`` and begins with at least
    three characters drawn from ASCII letters and hyphens.

    Parameters
    ----------
    text : str
        Raw text of a single post. A missing value (``None`` or float NaN)
        is treated as an empty post.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # Missing content arrives from pandas as None or float NaN; NaN is the
    # only float that is not equal to itself. Calling .lower() on either
    # would raise, so a missing post simply yields no tokens.
    if text is None or (isinstance(text, float) and text != text):
        return []

    # Set membership is O(1) per token; scanning the list is O(len(STOPWORDS)).
    stop_set = set(STOPWORDS)

    # Raw string: '\-' inside a normal string literal is an invalid escape.
    # re.match anchors only at the start of the token, so a token such as
    # "abc123" is still accepted, exactly as before.
    token_pattern = r'[a-zA-Z\-][a-zA-Z\-]{2,}'

    return [
        tok
        for tok in word_tokenize(text.lower())
        if tok not in stop_set and re.match(token_pattern, tok)
    ]

In [None]:
# Inspect the stop-word list used by clean_text.
STOPWORDS

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [None]:
# Read the combined forum-post CSV, then report its size and preview the first rows
hammerchat = pd.read_csv('HC_combined_df.csv')
print(hammerchat.shape)
hammerchat.head()

(6100, 7)


Unnamed: 0.1,Unnamed: 0,author,content,link,post_date,thread_link,thread_title
0,2,redhammer,Bournemouth vlog,http://www.hammerschat.com/viewtopic.php?f=11&...,"r»Sun Feb 04, 2018 7:22 pm",http://www.hammerschat.com/viewtopic.php?f=11&...,"HC Videos - cup of tea, mike, vlog & more"
1,3,redhammer,"Mike on Monday added, cup of tea tomorrow nigh...",http://www.hammerschat.com/viewtopic.php?f=11&...,"r»Mon Feb 05, 2018 9:04 pm",http://www.hammerschat.com/viewtopic.php?f=11&...,Re: HC Videos
2,4,redhammer,cup of tea added if you have an hour to kill!!...,http://www.hammerschat.com/viewtopic.php?f=11&...,"r»Wed Feb 07, 2018 8:56 pm",http://www.hammerschat.com/viewtopic.php?f=11&...,Re: HC Videos
3,5,belfasthammer,Thanks Geo Looking forward to COT as always,http://www.hammerschat.com/viewtopic.php?f=11&...,"r»Thu Feb 08, 2018 4:24 pm",http://www.hammerschat.com/viewtopic.php?f=11&...,Re: HC Videos
4,6,Newmarket,Looks like a proper ....,http://www.hammerschat.com/viewtopic.php?f=11&...,"t»Thu Feb 15, 2018 9:03 pm",http://www.hammerschat.com/viewtopic.php?f=11&...,Re: The Debate #2


In [None]:
# Column dtypes and non-null counts; `content` has fewer non-null rows than the frame has entries.
hammerchat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6100 entries, 0 to 6099
Data columns (total 7 columns):
Unnamed: 0      6100 non-null int64
author          6093 non-null object
content         5725 non-null object
link            6100 non-null object
post_date       6097 non-null object
thread_link     6100 non-null object
thread_title    6093 non-null object
dtypes: int64(1), object(6)
memory usage: 333.7+ KB


In [None]:
# Keep only the posts whose `content` is present; rows with missing text are discarded
hammerchat = hammerchat[hammerchat['content'].notna()]
print(hammerchat.shape)

(5725, 7)


In [None]:
# For gensim we need each post as a list of tokens with stopwords filtered out.
# One clean_text() call per post; the enumerate() index was never used, so a
# plain comprehension over the column is enough.
tokenized_data = [clean_text(text) for text in hammerchat.content]

# Should equal the number of rows in hammerchat
print(len(tokenized_data))

5725


In [None]:
# Build a Dictionary - association word to numeric id
dictionary = corpora.Dictionary(tokenized_data)

# Transform each tokenized post into bag-of-words form: [(word_id, count), ...]
corpus = [dictionary.doc2bow(text) for text in tokenized_data]

# Have a look at one document (index 20, i.e. the 21st post)
print(corpus[20])

# LDA training is stochastic: without a fixed seed every run produces
# different topics, so the results cannot be reproduced after a kernel restart.
LDA_RANDOM_STATE = 42

# Build the LDA model
lda_model = models.LdaModel(
    corpus=corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    random_state=LDA_RANDOM_STATE,
)

# Alternative (disabled): build an LSI model on the same corpus
#lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)


[(40, 1), (47, 1), (57, 1), (65, 1), (84, 1), (106, 1), (121, 1), (143, 1), (191, 1), (247, 2), (248, 1), (249, 1), (250, 2), (251, 1), (252, 1), (253, 1), (254, 1), (255, 1), (256, 1), (257, 1), (258, 1), (259, 1), (260, 2), (261, 1), (262, 1), (263, 1), (264, 1), (265, 1)]


In [None]:
# Show each topic as a weighted sum of its 8 highest-weighted words.
lda_model.print_topics(num_words=8)

[(0,
  '0.009*"get" + 0.009*"would" + 0.009*"like" + 0.008*"got" + 0.006*"loan" + 0.005*"want" + 0.005*"buy" + 0.005*"never"'),
 (1,
  '0.010*"would" + 0.007*"one" + 0.007*"get" + 0.006*"fans" + 0.006*"say" + 0.005*"could" + 0.005*"think" + 0.005*"mate"'),
 (2,
  '0.009*"think" + 0.008*"one" + 0.007*"players" + 0.007*"transfer" + 0.006*"would" + 0.006*"palerider" + 0.005*"get" + 0.005*"deal"'),
 (3,
  '0.017*"genuinely" + 0.017*"amused" + 0.011*"play" + 0.007*"one" + 0.007*"mario" + 0.007*"know" + 0.006*"arnie" + 0.006*"back"'),
 (4,
  '0.012*"ham" + 0.010*"would" + 0.009*"player" + 0.008*"west" + 0.008*"like" + 0.008*"good" + 0.007*"team" + 0.007*"one"'),
 (5,
  '0.012*"get" + 0.011*"good" + 0.009*"game" + 0.008*"season" + 0.007*"would" + 0.007*"think" + 0.006*"one" + 0.006*"games"'),
 (6,
  '0.016*"ham" + 0.013*"west" + 0.010*"man" + 0.010*"would" + 0.008*"palace" + 0.008*"city" + 0.007*"one" + 0.006*"club"'),
 (7,
  '0.008*"would" + 0.008*"time" + 0.008*"cardiff" + 0.007*"linked" + 

In [None]:
# Keep the (topic_id, 'weight*"word" + ...') pairs so they can be exported.
topics = lda_model.print_topics(num_words=8)

In [None]:
# Tabulate the topic pairs: one row per topic, default integer column labels.
topics_df = pd.DataFrame(topics)

In [None]:
# Write the topic table to CSV in the working directory (the index is written too, as to_csv's default).
topics_df.to_csv('Hammerschat_topics.csv')