In [2]:
#!git clone https://github.com/Leah-u/Automatic-Topic-Classification-Using-NLP-Topic-Models

Cloning into 'Automatic-Topic-Classification-Using-NLP-Topic-Models'...
remote: Enumerating objects: 41, done.[K
remote: Counting objects: 100% (41/41), done.[K
remote: Compressing objects: 100% (31/31), done.[K
remote: Total 41 (delta 13), reused 34 (delta 10), pack-reused 0[K
Unpacking objects: 100% (41/41), done.


In [3]:
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
import re
import gensim
import gensim.corpora as corpora
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import pyLDAvis
from pyLDAvis import sklearn as sklearn_lda

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/patsnap/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/patsnap/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
#Load data file
dataset = pd.read_csv(r'/Users/patsnap/Desktop/Neo4J_and_other_codes/Automatic-Topic-Classification-Using-NLP-Topic-Models/Research Paper Topic Classification using the NLP Topic Model NMF/Data/research_papers.csv', encoding='ISO-8859–1')
dataset.head()

Unnamed: 0,Id,Reference,Authors,Title,Year,Conference/ Journal,Codes,Abstract,Conclusion
0,1,"Ludi, S., & Spencer, M. (2017). Design Consid...","Ludi, S., & Spencer, M.",Design Considerations to Increase Block-based ...,2017.0,Journal of Visual Languages and Sentient Sys...,"BBL, DES, SR, KN, CE, CN, DT, CC",Block-based programming languages are a popula...,The initial version of accessible Blockly shou...
1,2,"Ludi, S., Simpson, J., & Merchant, W. (2016, O...","Ludi, S., Simpson, J., & Merchant, W.",Exploration of the use of auditory cues in cod...,2016.0,ACM SIGACCESS Conference on Computers and Acce...,"AP, CN, CC, BBL, DES, DT, ACUE, SM",Visual programming languages are commonplace i...,
2,3,"Ludi, S. (2015, October). Position paper: Towa...","Ludi, S.",Towards making block-based programming accessi...,2015.0,IEEE Blocks and Beyond Workshop. IEEE.,"BBL, DES, DT, WAPP, PC",Block-based programming environments are not a...,
3,4,"Milne, L. R., & Ladner, R. E. (2018, April). B...","Milne, L. R., & Ladner, R. E.\r",Blocks4All: overcoming accessibility barriers ...,2018.0,CHI Conference on Human Factors in Computing S...,"BBL, CH, CN, CC, SR, TS, SM, KDG, MSH",Blocks-based programming environments are a po...,We conducted an evaluation of current blocks-b...
4,5,"Caraco, L. B., Deibel, S., Ma, Y., & Milne, L....","Caraco, L. B., Deibel, S., Ma, Y., & Milne, L....",Making the Blockly Library Accessible via Touc...,2019.0,In The 21st International ACM SIGACCESS Confer...,"BBL, TS, SR, DES, CN, CE",Block-based programming environments are a pop...,We present two interfaces which were designed ...


In [5]:
#Remove the unecessary columns
dataset = dataset.drop(columns=['Id', 'Reference', 'Codes', 'Authors', 'Year', 'Conference/ Journal'], axis=1)
#Fill in the empty cells
dataset = dataset.fillna('No conclusion')
#Merge abstract and conclusion
dataset['Paper_Text'] = dataset["Abstract"] + dataset["Conclusion"]
#show first 5 records
dataset.head()

Unnamed: 0,Title,Abstract,Conclusion,Paper_Text
0,Design Considerations to Increase Block-based ...,Block-based programming languages are a popula...,The initial version of accessible Blockly shou...,Block-based programming languages are a popula...
1,Exploration of the use of auditory cues in cod...,Visual programming languages are commonplace i...,No conclusion,Visual programming languages are commonplace i...
2,Towards making block-based programming accessi...,Block-based programming environments are not a...,No conclusion,Block-based programming environments are not a...
3,Blocks4All: overcoming accessibility barriers ...,Blocks-based programming environments are a po...,We conducted an evaluation of current blocks-b...,Blocks-based programming environments are a po...
4,Making the Blockly Library Accessible via Touc...,Block-based programming environments are a pop...,We present two interfaces which were designed ...,Block-based programming environments are a pop...


In [6]:
#function for lemmatization
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
# tokenization
tokenized_data = dataset['Paper_Text'].apply(lambda x: x.split())
# Remove punctuation
tokenized_data = tokenized_data.apply(lambda x: [re.sub('[-,()\\!?]', '', item) for item in x])
tokenized_data = tokenized_data.apply(lambda x: [re.sub('[.]', ' ', item) for item in x])
# turn characters to lowercase
tokenized_data = tokenized_data.apply(lambda x: [item.lower() for item in x])
# remove stop-words
stop_words = stopwords.words('english')
stop_words.extend(['from','use', 'using','uses','user', 'users', 'well', 'study', 'survey', 'think'])
# remove words of length less than 3
tokenized_data = tokenized_data.apply(lambda x: [item for item in x if item not in stop_words and len(item)>3])
# lemmatize by calling lemmatization function
tokenized_data= tokenized_data.apply(lambda x: [get_lemma(item) for item in x])

In [7]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(tokenized_data, min_count=5, threshold=10) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[tokenized_data], threshold=10)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
# Define functions for creating bigrams and trigrams.
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]
def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]
# Form Bigrams
tokenized_data_bigrams = make_bigrams(tokenized_data)
# Form Trigrams
tokenized_data_trigrams = make_trigrams(tokenized_data)

In [8]:
# de-tokenization, combine tokens together
detokenized_data = []
for i in range(len(dataset)):
    t = ' '.join(tokenized_data_trigrams[i])
    detokenized_data.append(t)
dataset['clean_text']= detokenized_data
documents = dataset['clean_text']

In [9]:
#Set variable number of terms
no_terms = 1000
# NMF uses the tf-idf count vectorizer
# Initialise the count vectorizer with the English stop words
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, max_features=no_terms, stop_words='english')
# Fit and transform the text
document_matrix = vectorizer.fit_transform(documents)
#get features
feature_names = vectorizer.get_feature_names()

In [13]:
#Set variables umber of topics and top words.
no_topics = 10
no_top_words = 10
# Function for displaying topics
def display_topic(model, feature_names, num_topics, no_top_words):
    print("Model Result:")
    word_dict = {}
    for i in range(num_topics):
      #for each topic, obtain the largest values, and add the words they map to into the dictionary.
        words_ids = model.components_[i].argsort()[:-no_top_words - 1:-1]
        words = [feature_names[key] for key in words_ids]
        word_dict['Topic # ' + '{:02d}'.format(i)] = words
    dict = pd.DataFrame(word_dict)
    #dict.to_csv('%s.csv' % model_name)
    return dict
# Apply NMF topic model to document-term matrix
nmf_model = NMF(n_components=no_topics, random_state=42, alpha=.1, l1_ratio=.5, init='nndsvd').fit(document_matrix)

In [14]:
display_topic(nmf_model, feature_names, no_topics, no_top_words)

Model Result:


Unnamed: 0,Topic # 00,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06,Topic # 07,Topic # 08,Topic # 09
0,program,child,blind_learner,blind_student,blockbased_programming,blind_developer,structjumper,story,sight_programmer,solution
1,auditory,learning,audio_programming_language,barrier,blockly,challenge,skimming,block,blind_programmer,blind
2,auditory_cue,torino,apl,computer_science,computing,developer,participant,accessible,blind,development
3,nonsighted,design,skill,project,environment,face,structure,blockbased_programming_language,area,aid
4,computer,computational,novice_blind_learner,structjumper,library,software_developer,blind_programmer,tangible,reading,description_language
5,research,inclusive,solve_problem,interview,impaired,software,complete,student,prioritize,graphic_interface
6,sound,physical,interact,information,design,difficulty,class,game,method,regard
7,blind,vision,motivating,create,visually,ides,eclipse,output,difference,usage
8,cue,teacher,help,goal,outreach,blind,navigate,audio,suggest,graphical_interface
9,comprehension,nonspecialist,write_program,dissertation,foster,need,nest,novice_programmer,screen_reader,problem


In [11]:
#Use NMF model to assign topic to papers in corpus
nmf_topic_values = nmf_model.transform(document_matrix)
dataset['NMF Topic'] = nmf_topic_values.argmax(axis=1)
#Save dataframe to csv file
dataset.to_csv('final_results.csv')
dataset.head(10)

Unnamed: 0,Title,Abstract,Conclusion,Paper_Text,clean_text,NMF Topic
0,Design Considerations to Increase Block-based ...,Block-based programming languages are a popula...,The initial version of accessible Blockly shou...,Block-based programming languages are a popula...,blockbased_programming_language popular means ...,4
1,Exploration of the use of auditory cues in cod...,Visual programming languages are commonplace i...,No conclusion,Visual programming languages are commonplace i...,visual_programming_language commonplace engage...,0
2,Towards making block-based programming accessi...,Block-based programming environments are not a...,No conclusion,Block-based programming environments are not a...,blockbased_programming environment accessible ...,4
3,Blocks4All: overcoming accessibility barriers ...,Blocks-based programming environments are a po...,We conducted an evaluation of current blocks-b...,Blocks-based programming environments are a po...,blocksbased_programming environment popular to...,1
4,Making the Blockly Library Accessible via Touc...,Block-based programming environments are a pop...,We present two interfaces which were designed ...,Block-based programming environments are a pop...,blockbased_programming environment popular lea...,4
5,Expanding Blocks4All with Variables and Funct...,Blocks-based programming environments are ofte...,Our enhancements to Blocks4All have improved i...,Blocks-based programming environments are ofte...,blocksbased_programming environment often inac...,1
6,An Accessible Blocks Language: Work in Progress.,Block languages are extensively used to introd...,No conclusion,Block languages are extensively used to introd...,block language extensively use introduce progr...,7
7,Nonvisual Visual Programming.,Visual programming systems are widely used to ...,"Nonvisual visual programming is possible, and ...",Visual programming systems are widely used to ...,visual programming system widely use introduce...,1
8,Accessible AST-Based Programming for Visually-...,Most programmers rely on visual tools (block-b...,While this evaluation focused exclusively on u...,Most programmers rely on visual tools (block-b...,programmer rely visual tool blockbased editor ...,6
9,Programming microworlds for visually impaired...,The paper describes our research aimed at veri...,Our environments are physical micro-worlds [11...,The paper describes our research aimed at veri...,paper_describe research aim verification suita...,1
