In [1]:
import os
import pickle
import re
import numpy as np
import nltk
from gensim import corpora, models, similarities, matutils
from collections import defaultdict
import operator
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

Using TensorFlow backend.


# Question 1- Popular Topics

In [2]:
def chat_log_data(path):
    '''Function to extract chat data and user names and pickle them'''
    
    chat_data = []
    user_names = []
    
    #Extracting and storing the chat data and the user names
    
    for file in os.listdir(path):
        filepath = os.path.join(path, file)
        with open(filepath,'r') as f:
            for line in f:
                user_names.append(line.rstrip().split('\t')[1:3])
                chat_data.append(line.rstrip().split('\t')[-1])
     
    # Finding the unique user names to remove them from the chat data
    
    user_names_all = [re.sub('[^A-Za-z .-]+', ' ', j) for i in user_names for j in i]
    unique_user_names = list(set(user_names_all))
    
    # Cleaning the chat data to remove some special characters

    chat_data_cleaned = [re.sub('[^A-Za-z .-]+', ' ', i) for i in chat_data]
    
    #Storing pickles in local directory
    with open('comments.pkl', 'wb') as f:
        pickle.dump(chat_data_cleaned, f)
    
    with open('users.pkl', 'wb') as f:
        pickle.dump(unique_user_names, f)
    
    return chat_data_cleaned, unique_user_names

In [3]:
def noun_extraction(chat_data_cleaned):
    '''Function to tokenize and extract nouns from each sentence'''
    NOUNS = ['NN', 'NNS', 'NNP', 'NNPS']
    
    sentences = [nltk.word_tokenize(sent) for sent in chat_data_cleaned]
    sentences = [[w for w in sent if nltk.pos_tag([w])[0][1] in NOUNS]
                  for sent in sentences]
    
    with open('sentence_noun.pkl','wb') as f:
        pickle.dump(sentences, f)
        
    return sentences

In [4]:
chat_data_cleaned, unique_user_names = chat_log_data('data')

In [24]:
sentences = noun_extraction(chat_data_cleaned)

Creating a dictionary of the count of the nouns and sorting the dictionary in descending order to get most popular topics

In [19]:
words = [j for i in sentences for j in i]
words_count = defaultdict(int)
for word in words:
    words_count[word] += 1
    
words_count_dict = sorted(words_count.items(), key=operator.itemgetter(1),reverse=True)

In [57]:
words_count_dict[:11]

[('i', 343024),
 ('t', 152628),
 ('ubuntu', 133114),
 ('s', 116190),
 ('install', 101829),
 ('use', 91235),
 ('help', 63944),
 ('anyone', 61792),
 ('m', 60585),
 ('need', 59955),
 ('don', 52827)]

But the count of words gives only the words that are used more often, it hard to always find the topic with one word

It makes more sense to evaluate bi-grams- find the pairs of words that appear the most

In [21]:
#Obtain the bigrams from each sentence
bigram_list = []
for i in sentences:
    bigram_list.append(list(nltk.bigrams(i)))

#Flatten the bigram list for easy traversal to create bigram occurence dictionary
bigrams = [j for i in bigram_list for j in i] 

#Counting the occurence of the bigram
bigram_count = defaultdict(int)
for i in bigrams:
    bigram_count[i] += 1
bigram_count_dict = sorted(bigram_count.items(), key=operator.itemgetter(1),reverse=True)

In [58]:
bigram_count_dict[:11]

[(('don', 't'), 52174),
 (('doesn', 't'), 21141),
 (('i', 'm'), 18106),
 (('i', 'need'), 14077),
 (('sudo', 'apt-get'), 12323),
 (('apt-get', 'install'), 11996),
 (('i', 'want'), 11043),
 (('isn', 't'), 10483),
 (('i', 'think'), 9318),
 (('anyone', 'help'), 9292),
 (('i', 'use'), 9284)]

# Question 2- Topic Detector

In [64]:
#Extracting data from a given file
def single_file_chat_log(file_number):
    '''Extracting nouns of chat data from the input file'''
    
    chat_data = []

    path = 'data'
    filepath = os.path.join(path, str(file_number)+'.tsv')
    with open(filepath,'r') as f:
        for line in f:
            chat_data.append(line.rstrip().split('\t')[-1])

    chat_data_single_file = [re.sub('[^A-Za-z .-]+', ' ', i) for i in chat_data]

    # Extracting only the nouns from the sentences
    single_file_sentences = noun_extraction(chat_data_single_file)
                            
    return single_file_sentences

In [44]:
# Adding the user names to the stopwords matrix to remove occurences in the chat data
new_stopwords = new_stopwords = stopwords.words('english') + unique_user_names[1:]

In [59]:
def find_topic(texts, n_topics,thr=1e-2, **kwargs):
    """Return keywords of topics 
    """
    # Vectorizing the word matrix- using TFIDF Vectorizer
    vectorizer = TfidfVectorizer(tokenizer=lambda doc: doc,stop_words=new_stopwords, lowercase=False)
    text_vec = vectorizer.fit_transform(texts)
    words = np.array(vectorizer.get_feature_names())
    
    # Applying NMF to obtain topics
    topicfinder = NMF(n_topics, **kwargs).fit(text_vec)
    topic_dists = topicfinder.components_ 
    topic_dists /= topic_dists.max(axis = 1).reshape((-1, 1))   
    
    #finding the keywords for the topics
    def _topic_keywords(topic_dist):
        keywords_index = np.abs(topic_dist) >= thr
        keywords_prefix = np.where(np.sign(topic_dist) > 0, "", "^")[keywords_index]
        keywords = " | ".join(map(lambda x: "".join(x), zip(keywords_prefix, words[keywords_index])))
        return keywords
    
    topic_keywords = map(_topic_keywords, topic_dists)
    return "\n".join("Topic %i: %s" % (i, t) for i, t in enumerate(topic_keywords))

In [60]:
# Change this number to any desired file number
file_number = 11

In [61]:
print('Topics of file number:', file_number)
print(find_topic(single_file_chat_log(file_number),1))

Topics of file number: 11
Topic 0: changes | config | grandr | panel | permanent | put | right | stops | sure | want
