# Topic modelling
## Unsupervised learning of topics in a text
### using Latent Dirichlet Allocation (via sklearn)
Topic modelling can be thought of as dimensionality reduction:  
Documents are represented as sets of topics  
Each topic has a weight

In [17]:
import re
import pandas as pd
import sklearn
import csv
import nltk
import string
from nltk.stem.snowball import SnowballStemmer
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# use CountVectorizer to turn the docs into vectors
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [3]:
# create the stemmer
# One shared English Snowball stemmer; get_stopwords and tokenize_and_stem both
# read this module-level name, so this cell must run before they are called.
stemmer = SnowballStemmer('english')

In [5]:
# if you're on colab upload the data files
try:
    from google.colab import files
except ImportError:
    # Not running on Colab: skip the upload prompt so "Run All" still works.
    # bbc-text.csv and stopwords.csv must then already be in the working directory.
    uploaded = {}
else:
    uploaded = files.upload()

Saving bbc-text.csv to bbc-text.csv
Saving stopwords.csv to stopwords.csv


In [6]:
# helper functions
# Default stop-word list location (relative to the working directory); it is
# bound as the default argument of get_stopwords when that function is defined.
stopwords_file_path = "stopwords.csv"

def read_in_csv(csv_file):
    """Read a UTF-8, comma-delimited, double-quote-quoted CSV file into a list of rows.

    Args:
        csv_file: Path of the CSV file to read.

    Returns:
        A list with one entry per row, in file order; each entry is the list of
        that row's field strings. A blank line yields an empty list.
    """
    # newline='' hands raw line endings to the csv module, so line breaks
    # embedded in quoted fields are returned unchanged instead of being
    # rewritten by the file object's universal-newline translation.
    with open(csv_file, 'r', encoding='utf-8', newline='') as fp:
        return list(csv.reader(fp, delimiter=',', quotechar='"'))

def get_stopwords(path=stopwords_file_path):
    """Load stop words from a CSV file and append their stemmed forms.

    Only the first field of each row is used. Rows with no fields (blank lines
    in the file) are skipped rather than raising IndexError.

    Args:
        path: CSV file to read; defaults to the module-level stopwords_file_path
            as it was when this function was defined.

    Returns:
        A list containing every stop word in file order, followed by the stem
        of each of those words in the same order. Duplicates are not removed.
    """
    rows = read_in_csv(path)
    # A blank line parses to an empty row, which has no first field to take.
    words = [row[0] for row in rows if row]
    # Tokens are compared against this list both before and after stemming,
    # so keep the surface forms and the stems together.
    stemmed_words = [stemmer.stem(word) for word in words]
    return words + stemmed_words

def tokenize_and_stem(sentence):
    """Split ``sentence`` into tokens, drop unwanted ones and stem the rest.

    A token is discarded when it is in the module-level ``stopwords`` list,
    when it occurs in ``string.punctuation`` (a substring test, because
    ``string.punctuation`` is a str), or when it contains no ASCII letter.
    Survivors are stemmed with the module-level ``stemmer``.

    Args:
        sentence: Text to tokenise with ``nltk.word_tokenize``.

    Returns:
        The list of stems, in token order.
    """
    stems = []
    for token in nltk.word_tokenize(sentence):
        if token in stopwords:
            continue
        if token in string.punctuation:
            continue
        if not re.search('[a-zA-Z]', token):
            continue
        stems.append(stemmer.stem(token))
    return stems

### We’ll use a public dataset from the BBC comprised of 2,225 articles  
Each labeled under one of 5 categories: business, entertainment, politics, sport or tech

In [7]:
# build the stop-word list and record where the article dataset lives
# (the articles themselves are loaded later with pd.read_csv)
stopwords_file_path = "stopwords.csv"
# module-level list read by tokenize_and_stem and create_count_vectorizer
stopwords = get_stopwords(stopwords_file_path)
bbc_dataset = "bbc-text.csv"

In [8]:
# turn the documents into vectors
def create_count_vectorizer(documents):
    """Fit a CountVectorizer on ``documents`` and return it with the transformed data.

    The vectorizer tokenises with ``tokenize_and_stem``, is given the
    module-level ``stopwords`` list as its stop words, and is configured with
    ``max_features=1500``.

    Args:
        documents: The texts to vectorise.

    Returns:
        A ``(vectorizer, data)`` tuple: the fitted CountVectorizer and the
        result of ``fit_transform(documents)``.
    """
    vectorizer_options = dict(
        stop_words=stopwords,
        tokenizer=tokenize_and_stem,
        max_features=1500,
    )
    bow_vectorizer = CountVectorizer(**vectorizer_options)
    counts = bow_vectorizer.fit_transform(documents)
    return bow_vectorizer, counts

In [9]:
# remove unwanted characters (keep just words and spaces)
def clean_data(df):
    """Return a copy of ``df`` whose ``text`` column has punctuation and digits removed.

    Every character that is neither a word character nor whitespace is replaced
    by one space, then every digit is deleted. Underscores are word characters
    and are therefore kept.

    The frame passed in is not modified, so the cell that calls this can be
    re-run safely and earlier displays of the raw frame stay valid. Missing
    values in ``text`` are left as missing instead of raising.

    Args:
        df: DataFrame with a string ``text`` column.

    Returns:
        A new DataFrame with the same index and columns as ``df`` and the
        cleaned ``text`` column.
    """
    cleaned_text = (
        df['text']
        .str.replace(r'[^\w\s]', ' ', regex=True)
        .str.replace(r'\d', '', regex=True)
    )
    # assign() builds a new frame; the caller's object keeps its original text.
    return df.assign(text=cleaned_text)

In [10]:
# create the LDA model (note that usually num_topics is unknown)
def create_and_fit_lda(data, num_topics, random_state=42):
    """Create a Latent Dirichlet Allocation model and fit it to ``data``.

    Args:
        data: The vectorised documents to fit on (as returned by
            ``create_count_vectorizer``).
        num_topics: Number of topics, passed to the model as ``n_components``.
        random_state: Seed forwarded to the model so repeated runs of the
            notebook produce comparable topics. Pass ``None`` for the
            library's unseeded behaviour.

    Returns:
        The fitted model.
    """
    # n_jobs=-1 asks the library to use all available processors.
    lda = LDA(n_components=num_topics, n_jobs=-1, random_state=random_state)
    lda.fit(data)
    return lda

In [11]:
# identify & print the most common topic words
def get_most_common_words_for_topics(model, vectorizer, n_top_words):
    """Map each topic index to its ``n_top_words`` highest-weighted words.

    Args:
        model: Fitted topic model exposing ``components_`` (one row of word
            weights per topic).
        vectorizer: Fitted vectorizer whose feature names line up with the
            columns of ``model.components_``.
        n_top_words: How many words to keep per topic.

    Returns:
        dict mapping topic index to a list of plain ``str`` words, ordered from
        highest to lowest weight.
    """
    # get_feature_names() no longer exists in newer scikit-learn releases;
    # prefer its replacement and fall back for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        words = vectorizer.get_feature_names_out()
    else:
        words = vectorizer.get_feature_names()
    word_dict = {}
    for topic_index, topic in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        top_indices = topic.argsort()[::-1][:n_top_words]
        # str() keeps the printed lists free of NumPy string wrappers.
        word_dict[topic_index] = [str(words[i]) for i in top_indices]
    return word_dict

def print_topic_words(word_dict):
    """Print every topic as a ``Topic <key>`` line followed by its word list.

    Args:
        word_dict: Mapping of topic key to that topic's words.
    """
    for topic_id, top_words in word_dict.items():
        print(f"Topic {topic_id}")
        print("\t", top_words)

In [12]:
# load the BBC articles and clean their text in one step
df = clean_data(pd.read_csv(bbc_dataset))
# the cleaned article text is what gets vectorised
documents = df['text']

# number of topics for LDA to learn (note that usually this is unknown)
number_topics = 5

### Step 1: Extract one of the categories  
Select a particular category from the dataframe, e.g. tech

In [13]:
# NOTE: this rebinds df to the filtered rows, so the cell cannot simply be
# re-run with another category; re-run the "read in the data" cell first to
# restore all rows.
df = df.loc[df['category']=='tech']

In [14]:
# refresh documents so it holds only the text of the selected category
documents = df['text']

In [15]:
# check the output: only rows whose category is 'tech' should remain
df

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
19,tech,games maker fights for survival one of britain...
20,tech,security warning over fbi virus the us feder...
21,tech,halo heralds traffic explosion the growing po...
24,tech,mobile audio enters new dimension as mobile ph...
...,...,...
2204,tech,argonaut founder rebuilds empire jez san the ...
2207,tech,california sets fines for spyware the makers o...
2213,tech,progress on new internet domains by early the...
2215,tech,junk e mails on relentless rise spam traffic i...


In [18]:
# vectorise the selected documents, then fit LDA on the resulting counts
vectorizer, data = create_count_vectorizer(documents)
lda = create_and_fit_lda(data=data, num_topics=number_topics)



### Step 2: Inspect the results  
Are they coherent? Do they seem to be different topics?

In [19]:
# inspect the contents of the topics: the 10 highest-weighted words of each one
topic_words = get_most_common_words_for_topics(lda, vectorizer, 10)
print_topic_words(topic_words)

Topic 0
	 ['tv', 'broadband', 'servic', 'peopl', 'digit', 'high', 'uk', 'content', 'technolog', 'bt']
Topic 1
	 ['softwar', 'secur', 'virus', 'mail', 'site', 'user', 'firm', 'use', 'comput', 'program']
Topic 2
	 ['music', 'technolog', 'gadget', 'player', 'game', 'soni', 'digit', 'devic', 'market', 'year']
Topic 3
	 ['mobil', 'phone', 'use', 'peopl', 'servic', 'search', 'user', 'technolog', 'net', 'get']
Topic 4
	 ['game', 'use', 'comput', 'develop', 'peopl', 'play', 'time', 'make', 'new', 'year']




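### Step 3: Try a different category  
Select a different category from the dataframe, e.g. sport. Step 1 overwrote `df` with the tech-only rows, so re-run the "read in the data" cell first, then repeat Step 1 with the new category.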

### Step 4: Inspect the results  
Are they coherent? Do they seem to be different topics?

### Step 5: Try different values of N  
Change `number_topics` (set in the "read in the data" cell), then return to Step 1 and repeat the process with the new number of topics