In [28]:
import numpy as np
# import pandas
import pandas as pd

# import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity

from nltk.corpus import stopwords
import string

In [29]:
data = '/Users/patrickokwir/Desktop/Git_Projects/Ted-Talks-Recommender-System/Data_output/talks.csv'
df = pd.read_csv(data, index_col=0)
df.head()

Unnamed: 0,author,talk,description,likes,views
0,Conor Russomanno,a powerful new neurotech tool for augmenting y...,in an astonishing talk and tech demo neurotech...,4700,157930
1,Peter Singer,a modern argument for the rights of animals,why do we prioritize human rights over those o...,7600,254482
2,Sahar Zand,why iranians are cutting their hair for woman ...,filmmaker sahar zand vividly explores the ongo...,1100,393882
3,Shannon Odell,are solar panels worth it,today in many countries solar is the cheapest ...,3700,126251
4,Angus Hervey,why are we so bad at reporting good news,why is good news so rare in a special broadcas...,1200,415329


In [30]:
# --------------------------------------------------Imports--------------------------------------------------#
import re
import nltk
import spacy
import gensim
import pyLDAvis
import nlp
nltk.download('stopwords')


# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

#--------------------------------------------------Things to Note--------------------------------------------------#
#The following are key factors to obtaining good segregation topics:**

#The quality of text processing.
#The variety of topics the text talks about.
#The choice of topic modeling algorithm.
#The number of topics fed to the algorithm.
#The algorithms tuning parameters.

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/patrickokwir/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
#___________________________________________________Data Processing__________________________________________________#

# Convert to list
data = df['description'].values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', i) for i in data]

# Remove new line characters in data
data = [re.sub('\s+', ' ', i) for i in data]

# Remove distracting single quotes
data = [re.sub("\'", "", i) for i in data]

# import pprint
from pprint import pprint

  data = [re.sub('\S*@\S*\s?', '', i) for i in data]
  data = [re.sub('\s+', ' ', i) for i in data]


In [32]:
#break down each sentence into a list of words through tokenization
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])

[['in', 'an', 'astonishing', 'talk', 'and', 'tech', 'demo', 'conor', 'russomanno', 'shares', 'his', 'work', 'building', 'braincomputer', 'interfaces', 'that', 'could', 'enable', 'us', 'to', 'control', 'the', 'external', 'world', 'with', 'our', 'minds', 'he', 'discusses', 'the', 'quickly', 'advancing', 'possibilities', 'of', 'this', 'field', 'including', 'the', 'promise', 'of', 'closedloop', 'system', 'that', 'could', 'both', 'record', 'and', 'stimulate', 'brain', 'activity', 'and', 'invites', 'neurohacker', 'christian', 'bayerlein', 'onto', 'the', 'ted', 'stage', 'to', 'fly', 'mindcontrolled', 'drone', 'by', 'using', 'biosensing', 'headset']]


In [33]:
def strip_newline(series):
    return [review.replace('\n','') for review in series]

In [34]:
from nltk.corpus import stopwords


stop_words = stopwords.words('english')
stop_words.extend(['music', 'stephen', 'detail', 'way', 'shares', 'need', 'us', 'come','order','try','go','get','make','drink','plate','dish','restaurant','place', 'many', 'day', 'explains', 'even', 'part',
                  'would','really','like','great','service','came','got', 'talk', 'directed', 'ted', 'narrated', 'new', 'one', 'using', 'addison', 'anderson', 'says', "addison_anderson", "years", "first", 
                  'know', 'actually', 'worlds', 'could', 'details', 'studio', 'help', 'music', 'life', 'shows', 'world', 'good', 'think'])

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def bigrams(words, bi_min=10, tri_min=7):
    bigram = gensim.models.Phrases(words, min_count = bi_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return bigram_mod

# def get_corpus(df):
#     df['text'] = df['review']
#     df['text'] = strip_newline(df.text)
#     words = list(sent_to_words(df.text))
#     words = remove_stopwords(words)
#     bigram_mod = bigrams(words)
#     bigram = [bigram_mod[review] for review in words]
#     id2word = gensim.corpora.Dictionary(bigram)
#     id2word.filter_extremes(no_below=10, no_above=0.35)
#     id2word.compactify()
#     corpus = [id2word.doc2bow(text) for text in bigram]
#     return corpus, id2word, bigram

In [35]:
words = remove_stopwords(data_words)
bigram = bigrams(words)
bigram = [bigram[review] for review in words]
id2word = gensim.corpora.Dictionary(bigram)
id2word.filter_extremes(no_below=5, no_above=0.1)
id2word.compactify()
corpus = [id2word.doc2bow(text) for text in bigram]


In [36]:
train_corpus4 = corpus
train_id2word4 = id2word
bigram_train4 = bigram

In [37]:
import warnings

In [38]:
# import logging
# logging.basicConfig(filename='lda_model.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    lda_train4 = gensim.models.ldamulticore.LdaMulticore(
                           corpus=train_corpus4,
                           num_topics=8,
                           id2word=train_id2word4,
                           chunksize=100,
                           workers=7, # Num. Processing Cores - 1
                           passes=50,
                           eval_every = 1,
                           per_word_topics=True)
    # lda_train4.save('lda_train4.model')

In [39]:
coherence_model_lda = CoherenceModel(model=lda_train4, texts=bigram_train4, dictionary=train_id2word4, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
coherence_lda

0.36194914273489537

In [40]:
lda_train4.print_topics(10,num_words=5)[:10]

[(0,
  '0.008*"women" + 0.008*"stories" + 0.008*"love" + 0.007*"story" + 0.006*"powerful"'),
 (1,
  '0.007*"art" + 0.006*"human" + 0.005*"see" + 0.005*"work" + 0.005*"design"'),
 (2,
  '0.012*"research" + 0.010*"disease" + 0.008*"cancer" + 0.007*"brain" + 0.007*"health"'),
 (3,
  '0.006*"planet" + 0.005*"water" + 0.005*"humanity" + 0.005*"future" + 0.005*"ocean"'),
 (4,
  '0.007*"global" + 0.006*"work" + 0.006*"business" + 0.005*"create" + 0.005*"social"'),
 (5,
  '0.009*"food" + 0.004*"water" + 0.004*"amazing" + 0.004*"theres" + 0.004*"system"'),
 (6,
  '0.008*"city" + 0.005*"global" + 0.005*"history" + 0.005*"ancient" + 0.005*"earth"'),
 (7,
  '0.009*"time" + 0.007*"work" + 0.004*"human" + 0.004*"future" + 0.004*"better"')]

In [41]:
# exract topics from the model
topics = lda_train4.show_topics(formatted=False)
for topic in topics:
    print("Topic", topic[0])
    print("Words:", [word[0] for word in topic[1]])
    print("Weights:", [word[1] for word in topic[1]])
    print("\n")

Topic 0
Words: ['women', 'stories', 'love', 'story', 'powerful', 'personal', 'tells', 'children', 'two', 'family']
Weights: [0.00786605, 0.0078099687, 0.00770874, 0.0071252896, 0.0058093467, 0.005576617, 0.0048611667, 0.0047532325, 0.0044897012, 0.004318806]


Topic 1
Words: ['art', 'human', 'see', 'work', 'design', 'science', 'space', 'technology', 'artist', 'language']
Weights: [0.0065333364, 0.005576193, 0.0054030404, 0.005280138, 0.0052148825, 0.0051552043, 0.004904514, 0.0042758016, 0.0041206265, 0.0040687593]


Topic 2
Words: ['research', 'disease', 'cancer', 'brain', 'health', 'medical', 'de', 'better', 'science', 'diseases']
Weights: [0.012373014, 0.009794811, 0.007653066, 0.006860262, 0.006642337, 0.0064943205, 0.0057874816, 0.005661902, 0.004217067, 0.0042109233]


Topic 3
Words: ['planet', 'water', 'humanity', 'future', 'ocean', 'global', 'ways', 'protect', 'plastic', 'may']
Weights: [0.006154424, 0.005258608, 0.004967137, 0.00472105, 0.0047130347, 0.004705911, 0.0043611135,

In [43]:
import nltk

# Download necessary resources for tokenization
nltk.download('punkt')

# Assuming you have already trained your LDA model and have it stored in the 'lda_model' variable

# Create a list to store the extracted topics
extracted_topics = []

# Iterate over each row in the dataframe
for index, row in df.iterrows():
    # Extract the text from the appropriate column in your dataframe
    text = row['description']
    
    # Tokenize the text into individual tokens
    tokens = nltk.word_tokenize(text)
    
    # Preprocess the tokens if necessary (e.g., removing stopwords, stemming, etc.)
    
    # Convert the tokens to a bag-of-words representation
    bow_vector = lda_train4.id2word.doc2bow(tokens)
    
    # Get the topic distribution for the document
    topic_distribution = lda_train4.get_document_topics(bow_vector)
    
    # Sort the topics by their probability in descending order
    sorted_topics = sorted(topic_distribution, key=lambda x: x[1], reverse=True)
    
    # Extract the most probable topic (assuming the topics are represented as tuples (topic_id, probability))
    most_probable_topic = sorted_topics[0][0]
    if most_probable_topic == 0:
        most_probable_topic = "Women's Rights"
    elif most_probable_topic == 1:
        most_probable_topic = "Art, Science, and Technology"
    elif most_probable_topic == 2:
        most_probable_topic = "Medicine and Health"
    elif most_probable_topic == 3:
        most_probable_topic = "Oceans and Clean Water"
    elif most_probable_topic == 4:
        most_probable_topic = "Bussiness and Economics"
    elif most_probable_topic == 5:
        most_probable_topic = "Food, Agriculture, and Water"
    elif most_probable_topic == 6:
        most_probable_topic = "History and Culture"
    elif most_probable_topic == 7:
        most_probable_topic = "Social Life"
    # Append the most probable topic to the extracted_topics list
    extracted_topics.append(most_probable_topic)
df['Topic'] = extracted_topics

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/patrickokwir/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [45]:
df.head()

Unnamed: 0,author,talk,description,likes,views,Topic
0,Conor Russomanno,a powerful new neurotech tool for augmenting y...,in an astonishing talk and tech demo neurotech...,4700,157930,Social Life
1,Peter Singer,a modern argument for the rights of animals,why do we prioritize human rights over those o...,7600,254482,Medicine and Health
2,Sahar Zand,why iranians are cutting their hair for woman ...,filmmaker sahar zand vividly explores the ongo...,1100,393882,Women's Rights
3,Shannon Odell,are solar panels worth it,today in many countries solar is the cheapest ...,3700,126251,Oceans and Clean Water
4,Angus Hervey,why are we so bad at reporting good news,why is good news so rare in a special broadcas...,1200,415329,Women's Rights
