In [1]:
import xml.etree.ElementTree as ET
from itertools import chain
from os import listdir

import nltk
import numpy as np
import pandas as pd
from gensim import corpora, models
from gensim import matutils
from regex import search



## Grab and format the data

In [2]:
# Directory the article XML files are read from.
ARTICLES_DIR = '../data/articles'

# os.listdir returns names in arbitrary order, so sort them: everything derived
# from `files` (DataFrame row order, dictionary ids) then stays the same between
# runs. Only files whose name contains "_2016" are parsed.
files = [ET.parse('{}/{}'.format(ARTICLES_DIR, name))
         for name in sorted(listdir(ARTICLES_DIR))
         if "_2016" in name]

In [3]:
# Report how many article files were parsed ("{:,}" adds a thousands separator).
n_files = len(files)
print('Number of files retrieved: {:,}'.format(n_files))

Number of files retrieved: 464


In [4]:
# Pull the four fields of interest out of every parsed article. Each list is
# aligned with `files`, i.e. entry i of every list comes from files[i].
publications = [tree.find("publication").text for tree in files]
titles = [tree.find("title").text for tree in files]
dates = [tree.find("date_published").text for tree in files]
text = [tree.find("text").text for tree in files]

# One row per article; the column names are the dictionary keys.
df = pd.DataFrame({
    'publication': publications,
    'text': text,
    'date': dates,
    'title': titles,
})

In [5]:
# Preview the first five rows of the freshly built frame.
df.head(n=5)

Unnamed: 0,date,publication,text,title
0,2016-09-16T00:00:00Z,THE DAILY TELEGRAPH (LONDON),Importance of having a good scrum-half is best...,Why Fotuali'i has put Bath on cloud nine - AUS...
1,2016-09-16T00:00:00Z,THE GUARDIAN,Bringing characters back from the dead is a ti...,Neighbours' Madeleine West to return to Ramsay...
2,2016-09-16T00:00:00Z,THE DAILY TELEGRAPH (LONDON),Forty Autumns by Nina Willner\n\nA family's st...,bookshop
3,2016-09-16T00:00:00Z,THE GUARDIAN,Related: Party politics: why grime defines t...,"Youth in revolt: is 2016 a new dawn for young,..."
4,2016-09-16T00:00:00Z,THE GUARDIAN,They are avant-garde and often tricky to wear ...,Avant-garde Alexander Calder jewellery to go o...


In [20]:
def get_cleaned_tokens(sentence, stop_words=None):
    """Tokenize one sentence, dropping stopwords and punctuation/digit-only tokens.

    Parameters
    ----------
    sentence : str
        A single sentence. Stopword matching is an exact string comparison, so
        the caller decides the casing (``tokenize_text`` lowercases first).
    stop_words : collection of str, optional
        Tokens to discard. When omitted, NLTK's English stopword list is used.

    Returns
    -------
    list of str
        The tokens produced by ``nltk.word_tokenize`` in their original order,
        minus tokens that are stopwords and tokens made up solely of
        punctuation characters, digits or "|".
    """
    if stop_words is None:
        # Build the collection once per call instead of calling
        # stopwords.words() for every token; a set also makes each membership
        # test a hash lookup rather than a list scan.
        stop_words = set(nltk.corpus.stopwords.words('english'))

    # Inside a character class "|" is a literal pipe, not alternation, so this
    # matches tokens consisting only of punctuation (\p{P}), digits, or "|".
    junk_token = r"^[\p{P}|\d]+$"

    return [token for token in nltk.word_tokenize(sentence)
            if not search(junk_token, token) and token not in stop_words]

def tokenize_text(text):
    """Lowercase ``text`` and split it into sentences of cleaned tokens.

    Parameters
    ----------
    text : str or None or NaN
        Raw text. Missing values are accepted because an XML element without
        content yields ``None`` for its ``.text`` and pandas may carry NaN.

    Returns
    -------
    list of list of str
        One inner list per sentence found by ``nltk.sent_tokenize``, each
        produced by ``get_cleaned_tokens``. All tokens are lowercased.
        Missing or empty input yields an empty list.
    """
    # `text != text` is only true for NaN; together with the None and
    # empty-string checks this avoids calling .lower() on a missing value.
    if text is None or text != text or not text:
        return []

    lowered = text.lower()
    return [get_cleaned_tokens(sentence) for sentence in nltk.sent_tokenize(lowered)]

In [21]:
# Tokenize every title: each cell becomes a list of sentences, each sentence a
# list of cleaned, lowercased tokens.
df['title_tokens'] = df['title'].apply(tokenize_text)

In [22]:
# Preview the first five rows, now including the `title_tokens` column.
df.head(n=5)

Unnamed: 0,date,publication,text,title,title_tokens
0,2016-09-16T00:00:00Z,THE DAILY TELEGRAPH (LONDON),Importance of having a good scrum-half is best...,Why Fotuali'i has put Bath on cloud nine - AUS...,"[[fotuali'i, put, bath, cloud, nine, austin, h..."
1,2016-09-16T00:00:00Z,THE GUARDIAN,Bringing characters back from the dead is a ti...,Neighbours' Madeleine West to return to Ramsay...,"[[neighbours, madeleine, west, return, ramsay,..."
2,2016-09-16T00:00:00Z,THE DAILY TELEGRAPH (LONDON),Forty Autumns by Nina Willner\n\nA family's st...,bookshop,[[bookshop]]
3,2016-09-16T00:00:00Z,THE GUARDIAN,Related: Party politics: why grime defines t...,"Youth in revolt: is 2016 a new dawn for young,...","[[youth, revolt, new, dawn, young, politicised..."
4,2016-09-16T00:00:00Z,THE GUARDIAN,They are avant-garde and often tricky to wear ...,Avant-garde Alexander Calder jewellery to go o...,"[[avant-garde, alexander, calder, jewellery, g..."


In [23]:
# Inspect the tokenized title of the first article.
df['title_tokens'].iloc[0]

[["fotuali'i", 'put', 'bath', 'cloud', 'nine', 'austin', 'healey']]

## Do some modeling

In [24]:
# Flatten each title's per-sentence token lists into a single token list per
# document. Materialising real lists once lets the same documents feed both the
# dictionary and the bag-of-words conversion, instead of rebuilding one-shot
# `chain` iterators for each pass.
title_docs = [list(chain.from_iterable(sentences)) for sentences in df['title_tokens']]

# Token <-> integer id mapping built from all titles.
dictionary = corpora.Dictionary(title_docs)

# Bag-of-words representation of every title, in the same order as `df`.
corpus = [dictionary.doc2bow(doc) for doc in title_docs]

In [25]:
# NOTE: token2id maps each token to its integer id, so the pairs printed below
# are (token, id) even though the message calls them "counts".
vocabulary = list(dictionary.token2id.items())
print('Num words in dictionary: {:,}\nSome examples and counts: {}'.format(len(vocabulary), vocabulary[0:10]))

Num words in dictionary: 3,359
Some examples and counts: [("fotuali'i", 0), ('put', 1), ('bath', 2), ('cloud', 3), ('nine', 4), ('austin', 5), ('healey', 6), ('neighbours', 7), ('madeleine', 8), ('west', 9)]


In [26]:
# Fit a TF-IDF model on the bag-of-words corpus of titles built above.
tfidf = models.TfidfModel(corpus)
# Apply the fitted model to that same corpus; `corpus_tfidf` is what the LSI
# model below is trained on.
corpus_tfidf = tfidf[corpus]

In [27]:
# Train a two-topic LSI model on the TF-IDF weighted titles; `id2word` lets the
# model report topics with tokens instead of integer ids.
lsi = models.LsiModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=2)

# Show both topics as weighted token combinations.
lsi.print_topics(num_topics=2)

[(0,
  '0.242*"trump" + 0.231*"love" + 0.199*"\'s" + 0.198*"us" + 0.144*"still" + 0.139*"first" + 0.138*"clinton" + 0.132*"team" + 0.129*"hillary" + 0.128*"could"'),
 (1,
  '0.354*"love" + 0.184*"asperger" + 0.184*"syndrome" + 0.184*"missile" + 0.184*"appeal" + 0.184*"hacking" + 0.184*"extradition" + 0.184*"defence" + 0.184*"99-year" + 0.184*"computer"')]

## Now what if we try to find the number of topics in a more interesting way...

In [29]:
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt

In [None]:
# Elbow method: fit k-means for k = 1..9 and record, for each k, the mean
# Euclidean distance from every sample to its nearest cluster centre.

# NOTE(review): `X` is not defined in any visible cell (the execution counter
# skips In [28], so the defining cell was probably deleted). It is rebuilt here
# as a dense documents x terms TF-IDF matrix, which is presumably what was
# intended -- confirm before relying on the plot.
X = matutils.corpus2dense(corpus_tfidf, num_terms=len(dictionary)).T

distortions = []
K = range(1, 10)
for k in K:
    # Fit once per k (the model was previously fitted twice) with a fixed seed
    # so that re-running the cell reproduces the same curve.
    kmeanModel = KMeans(n_clusters=k, random_state=0).fit(X)
    nearest_centre_dist = cdist(X, kmeanModel.cluster_centers_, 'euclidean').min(axis=1)
    distortions.append(nearest_centre_dist.mean())

# Plot the elbow
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()