## Unsupervised Text Classification

### The data files were taken from https://www.kaggle.com/rounakbanik/ted-talks

In [1]:
from __future__ import print_function
import nltk, re, pickle, os
import pandas as pd
import numpy as np
from time import time

from nltk.tokenize import sent_tokenize, word_tokenize, wordpunct_tokenize, MWETokenizer
from nltk.stem import porter, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.util import ngrams

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, NMF
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

## Data Preprocessing

In [2]:
ted_main = pd.read_csv('data/ted_main.csv')
ted_trans = pd.read_csv('data/transcripts.csv')    
ted_all = pd.merge(ted_trans,right=ted_main,on='url')

In [3]:
ted_all['id'] = ted_all.index
print(len(ted_all))
talks = ted_all['transcript']
print(len(talks))

2467
2467


In [4]:
def clean_text(text):

    lemmizer = WordNetLemmatizer()

    stop = stopwords.words('english')
    stop += ['.', ',',':','...','!"','?"', "'", '"',' - ',' — ',',"','."','!', ';','♫♫','♫','ca','em','mr','000','yes'\
             '.\'"','[',']','—',".\'", 'ok','okay','yeah','ya','stuff', ' 000 ',' em ',"ll","didn",'bg','looking'\
             ' oh ','thank','thanks','la','was','wa','?','like','go',' le ',' ca ',' I '," ? ","s", " t ","ve","re"]

    cleaned_text = []
    
    for post in text:
        cleaned_words = []
        
        # remove parentheticals
        clean_parens = re.sub(r'\([^)]*\)', ' ', post)
        
        for word  in wordpunct_tokenize(clean_parens):  
            
            if word.lower() not in stop:
                low_word = lemmizer.lemmatize(word)    

                if low_word.lower() not in stop: 

                    cleaned_words.append(low_word.lower())
   
        cleaned_text.append(' '.join(cleaned_words))
    
    return cleaned_text

In [5]:
t0 = time()
cleaned_talks = clean_text(talks)
print("Cleaned data in %0.3fs." % (time() - t0))

Cleaned data in 65.835s.


## LDA

In [6]:
def topic_mod_lda(data,topics=5,iters=10,ngram_min=1, ngram_max=3, max_df=0.6, max_feats=5000):
    
    vectorizer = CountVectorizer(ngram_range=(ngram_min,ngram_max), 
                             stop_words='english', 
                             max_df = max_df, 
                             max_features=max_feats)

    vect_data = vectorizer.fit_transform(data)
    lda = LatentDirichletAllocation(n_components=topics,
                                    max_iter=iters,
                                    random_state=42,
                                    learning_method='online',
                                    n_jobs=-1)
    
    lda_dat = lda.fit_transform(vect_data)
    
    def display_topics(model, feature_names, no_top_words):
        for ix, topic in enumerate(model.components_):
            print("Topic ", ix)
            print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
    
    display_topics(lda, vectorizer.get_feature_names(),20)
    
    return vectorizer, vect_data, lda, lda_dat

In [7]:
t0 = time()
vect_mod, vect_data, lda_mod, lda_data = topic_mod_lda(cleaned_talks,
                                                       topics=20,
                                                       iters=50,
                                                       ngram_min=1, 
                                                       ngram_max=2, 
                                                       max_df=0.5, 
                                                       max_feats=2000)
print("\nLDA done in %0.3fs." % (time() - t0))

Topic  0
water ocean fish sea coral plastic boat animal whale light marine blue bottle oil ship swim specie area deep river
Topic  1
word language book god religion story culture century knowledge compassion english history religious believe course read consciousness identity self tradition
Topic  2
gene dna disease virus genome molecule genetic bacteria cell vaccine organism million hiv biology probably case technology malaria protein understand
Topic  3
story love guy old man wanted feel friend told took home night oh moment later happened away remember knew week
Topic  4
art sound music play sort image piece object hand eye body artist video experience create color looking film real mind
Topic  5
kid school student game teacher education learning child learn high play percent teach class video number answer course getting high school
Topic  6
earth planet foot ice mars water air space fly surface mile cloud meter atmosphere shark sun satellite temperature moon mountain
Topic  7
city

## NMF

In [8]:
def topic_mod_nmf(data, topics=5,iters=10,ngram_min=1, ngram_max=3, max_df=0.6, max_feats=5000):
      
    vectorizer = CountVectorizer(ngram_range=(ngram_min,ngram_max), 
                             stop_words='english', 
                             max_df = max_df, 
                             max_features=max_feats)
    
    vect_data = vectorizer.fit_transform(data)
    nmf = NMF(n_components=topics, max_iter=iters, random_state=42)
    nmf_dat = nmf.fit_transform(vect_data)
    
    def display_topics(model_, feature_names, no_top_words):
        for ix, topic in enumerate(model_.components_):
            print("Topic ", ix)
            print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
    
    display_topics(nmf, vectorizer.get_feature_names(),20)
    
    return vectorizer, vect_data, nmf, nmf_dat

In [9]:
t0 = time()
vect_mod, vect_data, nmf_mod, nmf_data  = topic_mod_nmf(cleaned_talks,
                                                             topics=20,
                                                             iters=50,
                                                             ngram_min=1, 
                                                             ngram_max=2, 
                                                             max_df=0.5, 
                                                             max_feats=2000)
print("\nNMF done in %0.3fs." % (time() - t0))

Topic  0
love feel person guy man experience god friend old word hand wanted moment mind sort believe maybe looking thinking feeling
Topic  1
brain neuron body memory area animal sleep mind region ability control fly study behavior activity light turn consciousness understand arm
Topic  2
woman men girl man gender sex boy female young black male violence mother heart women sexual issue men woman daughter job
Topic  3
country government china africa global percent india economic state political united war economy growth social society money states united states democracy
Topic  4
water ocean planet earth animal sea fish specie mars surface area ice tree forest coral climate shark percent foot plant
Topic  5
cell stem stem cell organ body disease dna gene light tissue drug molecule bacteria animal structure technology patient bone virus material
Topic  6
school kid student teacher education high class money learning teach girl classroom community high school college old learn dollar pare

## LSA

In [10]:
def topic_mod_lsa(data, topics=5,ngram_min=1, ngram_max=3, max_df=0.6, max_feats=5000):
    
    vectorizer = CountVectorizer(ngram_range=(ngram_min,ngram_max), 
                             stop_words='english', 
                             max_df = max_df, 
                             max_features=max_feats)
   
    vect_data = vectorizer.fit_transform(data)
    lsa = TruncatedSVD(n_components=topics,random_state=42)
    lsa_dat = lsa.fit_transform(vect_data)
    
    def display_topics(model_, feature_names, no_top_words):
        for ix, topic in enumerate(model_.components_):
            print("Topic ", ix)
            print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
    
    display_topics(lsa, vectorizer.get_feature_names(),20)
    
    return vectorizer, vect_data, lsa, lsa_dat

In [11]:
t0 = time()
vect_mod, vect_data, lsa_mod, lsa_data  = topic_mod_lsa(cleaned_talks,
                                                        topics=20,
                                                        ngram_min=1, 
                                                        ngram_max=2, 
                                                        max_df=0.5, 
                                                        max_feats=2000)
print("\nLSA done in %0.3fs." % (time() - t0))

Topic  0
country woman story percent child brain technology school example million course kid number city old data feel sort love maybe
Topic  1
brain cell light technology body computer animal cancer neuron planet space earth data robot information universe water machine using sort
Topic  2
brain woman cell men cancer child body girl love patient story disease neuron man mother feel baby heart boy memory
Topic  3
country cell cancer percent disease brain health patient drug africa data government dollar china growth global blood food market care
Topic  4
woman water planet cancer earth cell men universe space ocean light energy black star mars galaxy body specie solar sun
Topic  5
brain country woman planet neuron energy earth men china water universe global africa climate area region power ocean billion economic
Topic  6
child school kid water food city brain family planet animal ocean earth parent fish mother area old percent plant teacher
Topic  7
city building brain car design wom

## LSA + TfIdf

In [12]:
def topic_mod_lsa_t(data, topics=5,ngram_min=1, ngram_max=3, max_df=0.6, max_feats=5000):
    
    vectorizer = TfidfVectorizer(ngram_range=(ngram_min,ngram_max), 
                             stop_words='english', 
                             max_df = max_df, 
                             max_features=max_feats)
    
    vect_data = vectorizer.fit_transform(data)
    stdScale = Normalizer()
    vect_scale = stdScale.fit_transform(vect_data)
    lsa_t = TruncatedSVD(n_components=topics,random_state=42)
    lsa_t_dat = lsa_t.fit_transform(vect_scale)
    
    def display_topics(model_, feature_names, no_top_words):
        for ix, topic in enumerate(model_.components_):
            print("Topic ", ix)
            print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
    
    display_topics(lsa_t, vectorizer.get_feature_names(),20)
    
    return vectorizer, vect_data, lsa_t, lsa_t_dat

In [13]:
t0 = time()
vect_mod, vect_data, lsa_t_mod, lsa_t_data  = topic_mod_lsa_t(cleaned_talks,
                                                              topics=20,
                                                              ngram_min=1, 
                                                              ngram_max=2, 
                                                              max_df=0.5, 
                                                              max_feats=2000)
print("\nLSA_T done in %0.3fs." % (time() - t0))

Topic  0
woman child story country city brain technology kid percent school love data water feel family design community word old sort
Topic  1
woman men girl child school country family story kid mother boy father young man war violence parent education love community
Topic  2
city country government percent global africa economy dollar climate water energy china company building market money business economic billion oil
Topic  3
cancer cell patient brain disease drug health percent country doctor gene woman data tumor treatment medical medicine care blood hiv
Topic  4
woman water ocean planet earth animal sea specie fish men plant ice forest mars tree coral universe energy girl light
Topic  5
city kid school cancer building design patient child cell teacher community neighborhood car architecture disease doctor health project student food
Topic  6
kid school child teacher student education ocean food learning classroom parent water fish planet africa teach teaching percent learn gra

# Comparing the above topics and the words they are comprised of, we can observe that LDA performs the best.

## Label the topics

In [10]:
topic_ind = np.argmax(lda_data, axis=1)
topic_labels = pd.DataFrame(topic_ind)
topic_names = topic_labels
topic_names[topic_names==0] = "marine"
topic_names[topic_names==1] = "writing"
topic_names[topic_names==2] = "disease biology"
topic_names[topic_names==3] = "story"
topic_names[topic_names==4] = "human body"
topic_names[topic_names==5] = "education"
topic_names[topic_names==6] = "nature"
topic_names[topic_names==7] = "architecture"
topic_names[topic_names==8] = "technology, medium"
topic_names[topic_names==9] = "family"
topic_names[topic_names==10] = "build"
topic_names[topic_names==11] = "medicine"
topic_names[topic_names==12] = "global economy"
topic_names[topic_names==13] = "technology, energy"
topic_names[topic_names==14] = "politics"
topic_names[topic_names==15] = "medicine"  
topic_names[topic_names==16] = "social"
topic_names[topic_names==17] = "animal"
topic_names[topic_names==18] = "market"
topic_names[topic_names==19] = "space"

## Retrieve similar documents using NN

In [8]:
def get_simi(first_article,num_of_recs,topics,ted_data, model, vectorizer, training_vectors):
    
    new_vec = model.transform(
        vectorizer.transform([first_article]))
    
    nn = NearestNeighbors(n_neighbors=num_of_recs, metric='cosine', algorithm='brute')
    nn.fit(training_vectors)
    
    results = nn.kneighbors(new_vec)
    
    recommend_list = results[1][0]
    scores = results[0]
                       
    ss = np.array(scores).flat       
    for i, resp in enumerate(recommend_list):
        print('\nID: ', + resp)
        print('Cosine Distance: ', + ss[i])  
        print('Topics: ' + topics.iloc[resp,0])
        print('URL: ' + ted_data.iloc[resp,1])
        print("TED's original tags: ")
        print(ted_data.iloc[resp,-3])
        print("\n------------------------")
        
    return recommend_list, ss  

In [11]:
rec_list, scores = get_simi(cleaned_talks[804],10, topic_names, ted_all,
                                       lda_mod, vect_mod, lda_data)


ID:  804
Cosine Distance:  2.220446049250313e-16
Topics: human body
URL: https://www.ted.com/talks/charles_limb_your_brain_on_improv

TED's original tags: 
Your brain on improv

------------------------

ID:  1907
Cosine Distance:  0.029006092710062714
Topics: medicine
URL: https://www.ted.com/talks/laura_schulz_the_surprisingly_logical_minds_of_babies

TED's original tags: 
The surprisingly logical minds of babies

------------------------

ID:  602
Cosine Distance:  0.046728506250262036
Topics: medicine
URL: https://www.ted.com/talks/pawan_sinha_on_how_brains_learn_to_see

TED's original tags: 
How brains learn to see

------------------------

ID:  1488
Cosine Distance:  0.05830870117550302
Topics: human body
URL: https://www.ted.com/talks/steve_ramirez_and_xu_liu_a_mouse_a_laser_beam_a_manipulated_memory

TED's original tags: 
A mouse. A laser beam. A manipulated memory.

------------------------

ID:  1752
Cosine Distance:  0.06126162343640251
Topics: medicine
URL: https://ww