# West Ham Topic Modelling

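Here I will try topic modelling (LDA with gensim) on some of the West Ham text. Note: the data loaded below is a CSV of arXiv paper summaries (`../data/arxiv_papers_full_v2.csv`), not West Ham text.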

In [1]:
import re
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import stopwords
import nltk

import pandas as pd
import numpy as np

In [2]:
# Fetch the NLTK 'punkt' package; presumably what word_tokenize (used in clean_text) relies on.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dglover\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Fetch the NLTK 'stopwords' corpus that stopwords.words('english') reads below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dglover\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Number of topics the topic model is asked to find.
NUM_TOPICS = 10

# NLTK's English stop-word list plus the extra token 'wrote'.
# The extra word is added by concatenation: list.append() mutates in place and
# returns None, so its result must not be assigned to a name.
STOPWORDS = stopwords.words('english') + ['wrote']

# Matches tokens whose first three (or more) characters are ASCII letters or
# hyphens. Raw string so that `\-` reaches the regex engine without Python
# treating it as an (invalid) string escape.
_TOKEN_PATTERN = re.compile(r'[a-zA-Z\-][a-zA-Z\-]{2,}')


def clean_text(text):
    """Lower-case and tokenize ``text``, dropping stop words and non-word tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of tokens, in their original order, that are not in
        ``STOPWORDS`` and that start with at least three letters/hyphens.
        The pattern is only anchored at the start of the token, so a token
        such as ``word2vec`` is kept.
    """
    # Build the set at call time so later edits to STOPWORDS are respected,
    # while giving constant-time membership tests for every token.
    stopword_set = set(STOPWORDS)
    return [
        token
        for token in word_tokenize(text.lower())
        if token not in stopword_set and _TOKEN_PATTERN.match(token)
    ]

In [5]:
# Display the stop-word list built in the previous cell.
STOPWORDS

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [8]:
# Read the papers CSV into a DataFrame, report its (rows, columns) and preview the first rows
hammerchat = pd.read_csv('../data/arxiv_papers_full_v2.csv')
print(hammerchat.shape)
hammerchat.head()

(3558, 11)


Unnamed: 0.1,Unnamed: 0,Title,PDF URL,Author,DOI,Published Date,Summary,Journal Ref,Primary Category,Category,Entry ID
0,0,Natural Language Processing using Hadoop and K...,http://arxiv.org/pdf/1608.04434v1,"[arxiv.Result.Author('Emre Erturk'), arxiv.Res...",,2016-08-15 23:09:21+00:00,"Natural language processing, as a data analyti...",,cs.CL,['cs.CL'],http://arxiv.org/abs/1608.04434v1
1,1,Integrating AI Planning with Natural Language ...,http://arxiv.org/pdf/2202.07138v2,"[arxiv.Result.Author('Kebing Jin'), arxiv.Resu...",,2022-02-15 02:19:09+00:00,Natural language processing (NLP) aims at inve...,,cs.AI,"['cs.AI', 'cs.CL']",http://arxiv.org/abs/2202.07138v2
2,2,Simple Natural Language Processing Tools for D...,http://arxiv.org/pdf/1906.11608v2,[arxiv.Result.Author('Leon Derczynski')],,2019-06-27 13:15:12+00:00,This technical note describes a set of baselin...,,cs.CL,['cs.CL'],http://arxiv.org/abs/1906.11608v2
3,3,Towards the Study of Morphological Processing ...,http://arxiv.org/pdf/2006.16212v1,"[arxiv.Result.Author('Mirinso Shadang'), arxiv...",,2020-06-29 17:24:09+00:00,There is no or little work on natural language...,In proceeding of Regional International Confer...,cs.CL,['cs.CL'],http://arxiv.org/abs/2006.16212v1
4,4,Natural Language Understanding with Distribute...,http://arxiv.org/pdf/1511.07916v1,[arxiv.Result.Author('Kyunghyun Cho')],,2015-11-24 23:23:13+00:00,This is a lecture note for the course DS-GA 30...,,cs.CL,"['cs.CL', 'stat.ML']",http://arxiv.org/abs/1511.07916v1


In [9]:
# Summarise column names, non-null counts and dtypes.
hammerchat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3558 entries, 0 to 3557
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        3558 non-null   int64 
 1   Title             3558 non-null   object
 2   PDF URL           3558 non-null   object
 3   Author            3558 non-null   object
 4   DOI               403 non-null    object
 5   Published Date    3558 non-null   object
 6   Summary           3558 non-null   object
 7   Journal Ref       515 non-null    object
 8   Primary Category  3558 non-null   object
 9   Category          3558 non-null   object
 10  Entry ID          3558 non-null   object
dtypes: int64(1), object(10)
memory usage: 305.9+ KB


In [10]:
# Keep only the rows that actually have a Summary to tokenize
has_summary = hammerchat['Summary'].notna()
hammerchat = hammerchat[has_summary]
print(hammerchat.shape)

(3558, 11)


In [12]:
# For gensim each document must be a list of tokens with stop words removed.
# One clean_text() call per Summary; the result keeps the DataFrame's row order.
tokenized_data = [clean_text(summary) for summary in hammerchat['Summary']]

print(len(tokenized_data))

3558


In [14]:
# Map every distinct token to an integer id
dictionary = corpora.Dictionary(tokenized_data)

# Convert each tokenized document into its bag-of-words form
corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_data]

# Inspect the document at index 20: a list of (word_id, count) pairs
print(corpus[20])

# LDA training is stochastic; a fixed seed makes repeated runs reproducible.
LDA_RANDOM_STATE = 42

# Build the LDA model
lda_model = models.LdaModel(
    corpus=corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    random_state=LDA_RANDOM_STATE,
)

# Alternative: build an LSI model instead
#lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)


[(28, 4), (31, 1), (39, 1), (45, 1), (52, 1), (76, 1), (92, 1), (115, 3), (184, 2), (196, 1), (201, 1), (206, 2), (207, 1), (212, 2), (294, 1), (357, 1), (450, 1), (504, 2), (556, 1), (622, 3), (801, 1), (802, 1), (803, 1), (804, 1), (805, 1), (806, 2), (807, 1), (808, 1), (809, 1), (810, 1), (811, 1), (812, 1), (813, 1), (814, 1), (815, 1), (816, 1), (817, 1), (818, 1), (819, 1), (820, 1), (821, 1), (822, 1)]


In [15]:
# Show each topic as a weighted sum of its 8 top words.
lda_model.print_topics(num_words=8)

[(0,
  '0.014*"natural" + 0.014*"language" + 0.013*"nlp" + 0.009*"processing" + 0.008*"learning" + 0.007*"models" + 0.007*"research" + 0.007*"data"'),
 (1,
  '0.017*"language" + 0.011*"natural" + 0.008*"model" + 0.008*"research" + 0.007*"processing" + 0.006*"models" + 0.006*"speech" + 0.006*"nlp"'),
 (2,
  '0.026*"language" + 0.014*"natural" + 0.012*"models" + 0.009*"processing" + 0.009*"tasks" + 0.009*"nlp" + 0.007*"learning" + 0.007*"data"'),
 (3,
  '0.018*"language" + 0.015*"embeddings" + 0.012*"models" + 0.011*"word" + 0.011*"natural" + 0.010*"processing" + 0.008*"nlp" + 0.008*"task"'),
 (4,
  '0.024*"language" + 0.019*"models" + 0.014*"natural" + 0.011*"tasks" + 0.010*"model" + 0.008*"data" + 0.008*"nlp" + 0.008*"processing"'),
 (5,
  '0.023*"language" + 0.012*"processing" + 0.010*"natural" + 0.008*"nlp" + 0.007*"text" + 0.006*"data" + 0.006*"models" + 0.006*"word"'),
 (6,
  '0.032*"language" + 0.017*"models" + 0.013*"natural" + 0.009*"model" + 0.009*"processing" + 0.007*"paper" +

In [16]:
# Keep the (topic_id, weighted-words string) pairs so they can be tabulated.
topics = lda_model.print_topics(num_words=8)

In [17]:
# One row per topic: column 0 holds the topic id, column 1 the weighted-words string.
topics_df = pd.DataFrame(topics)

In [18]:
# Display the topics table.
topics_df

Unnamed: 0,0,1
0,0,"0.014*""natural"" + 0.014*""language"" + 0.013*""nl..."
1,1,"0.017*""language"" + 0.011*""natural"" + 0.008*""mo..."
2,2,"0.026*""language"" + 0.014*""natural"" + 0.012*""mo..."
3,3,"0.018*""language"" + 0.015*""embeddings"" + 0.012*..."
4,4,"0.024*""language"" + 0.019*""models"" + 0.014*""nat..."
5,5,"0.023*""language"" + 0.012*""processing"" + 0.010*..."
6,6,"0.032*""language"" + 0.017*""models"" + 0.013*""nat..."
7,7,"0.027*""language"" + 0.013*""natural"" + 0.010*""le..."
8,8,"0.030*""language"" + 0.013*""natural"" + 0.010*""mo..."
9,9,"0.023*""language"" + 0.015*""model"" + 0.013*""natu..."


In [None]:
#topics_df.to_csv('Hammerschat_topics.csv')