In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import glob
import nltk
import sklearn
import pandas


In [None]:
import os

In [None]:
base_dir= '/content/drive/MyDrive/Topic_modeling/BBC/bbc'

In [None]:
business_file_list = glob.glob(os.path.join(os.getcwd(), base_dir, "business", "*.txt"))
entertainment_file_list = glob.glob(os.path.join(os.getcwd(), base_dir, "entertainment", "*.txt"))
politics_file_list = glob.glob(os.path.join(os.getcwd(), base_dir, "politics", "*.txt"))
sport_file_list = glob.glob(os.path.join(os.getcwd(), base_dir, "sport", "*.txt"))
tech_file_list = glob.glob(os.path.join(os.getcwd(), base_dir, "tech", "*.txt"))

In [None]:
labels = []
corpus = []
for file_list in [
    business_file_list, entertainment_file_list, politics_file_list, sport_file_list, tech_file_list
]:
    for file_path in file_list:
        with open(file_path, encoding="utf8", errors='ignore') as f_input:
            corpus.append((f_input.read()))
            labels.append(file_path.split('/')[-2])

In [None]:
import nltk
import re
nltk.download('punkt')

regex = re.compile('[^a-zA-Z]')

def tokenize(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            clean_word = regex.sub('', word)
            tokens.append(clean_word.lower())
    return tokens

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
tokenized = tokenize(corpus[500])
tokenized[:10]

['ailing',
 'eurodisney',
 'vows',
 'turnaround',
 'eurodisney',
 '',
 'the',
 'european',
 'home',
 'of']

In [None]:
from nltk.corpus import stopwords as sw
nltk.download ('stopwords')
stopwords = sw.words('english')
cleaned = [word for word in tokenized if word not in stopwords and word is not '']
cleaned[:10]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['ailing',
 'eurodisney',
 'vows',
 'turnaround',
 'eurodisney',
 'european',
 'home',
 'mickey',
 'mouse',
 'friends']

In [None]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")

def stem(word):
    return stemmer.stem(word).strip()

In [None]:
stemed = [stem(word) for word in cleaned]
stemed[:10]

['ail',
 'eurodisney',
 'vow',
 'turnaround',
 'eurodisney',
 'european',
 'home',
 'mickey',
 'mous',
 'friend']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000,
                                 min_df=0.05, stop_words=stopwords,
                                 use_idf=True, tokenizer=tokenize,
                                  lowercase=True, preprocessor=stem)

In [None]:
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus);

  'stop_words.' % sorted(inconsistent))


In [None]:
from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(tfidf_matrix);

In [None]:
for i,topic in enumerate(lda.components_):
    print(f'Topic #{i}:')
    print([tfidf_vectorizer.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print('\n')

Topic #0:
['people', 'election', 'government', 'blair', 'party', 'best', 'would', 'labour', 'film', 'mr']


Topic #1:
['economy', 'sales', 'bank', 'growth', 'firm', 'market', 'year', 'company', 'us', 'bn']


Topic #2:
['people', 'million', 'players', 'sold', 'phone', 'digital', 'video', 'music', 'games', 'mobile']


Topic #3:
['first', 'injury', 'players', 'match', 'cup', 'club', 'win', 'nt', 'england', 'game']


Topic #4:
['information', 'online', 'internet', 'use', 'net', 'computer', 'technology', 'software', 'users', 'people']




In [None]:
topic_values = lda.transform(tfidf_matrix)
doc_num, topic_num = topic_values.shape

In [None]:
import pandas as pd
df = pd.DataFrame({'document': corpus, 'label': labels, 'lda': topic_values.argmax(axis=1)})
df.groupby(['label', 'lda']).count().unstack()

Unnamed: 0_level_0,document,document,document,document,document
lda,0,1,2,3,4
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
business,20.0,484.0,,2.0,4.0
entertainment,357.0,13.0,,12.0,4.0
politics,399.0,11.0,,1.0,6.0
sport,14.0,4.0,,493.0,
tech,24.0,23.0,28.0,21.0,305.0


In [None]:
prob_matrix = lda.transform(tfidf_matrix)
prob_matrix

array([[0.03070895, 0.87843451, 0.03021758, 0.03035699, 0.03028196],
       [0.03548819, 0.85675877, 0.03587479, 0.03665912, 0.03521913],
       [0.38406674, 0.53037729, 0.03021028, 0.02736865, 0.02797704],
       ...,
       [0.03592823, 0.0372588 , 0.41482038, 0.03560272, 0.47638988],
       [0.02924388, 0.03011032, 0.029197  , 0.02878534, 0.88266346],
       [0.02534389, 0.15885096, 0.43616339, 0.02509567, 0.35454609]])