Date: 15/09/2019

Version: 1.0

Environment: Python 3 and Jupyter notebook


## Task 1: Generating a sparse represtation for Paper Bodies

### Importing libraries 

In [1]:
# !pip install pdfminer.six
#installing pdfminer.six 


import pdfminer
import io
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

import re

import pandas as pd

from urllib import request

import requests

import os 

import nltk

from nltk import ngrams

from nltk.probability import FreqDist

from nltk.stem import PorterStemmer

## 1. Downloading PDF files from given URLs

#### Note: An empty pdf folder was manually created, into which the pdf files were downloaded. An empty pdf folder must exist for the following code to work.

In [2]:
'''
Defining a function to convert pdf to text to perform processing on the text
'''

def convert_pdf_to_txt(path):
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = open(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()

    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                  password=password,
                                  caching=caching,
                                  check_extractable=True):
        interpreter.process_page(page)

    text = retstr.getvalue()

    fp.close()
    device.close()
    retstr.close()
    return text

In [3]:
links =  convert_pdf_to_txt("./Group058.pdf")
#converting given pdf to a text

### Extracting the URLs from the text

In [4]:
links = re.sub(r'\n', '    ', links)
links = re.sub('PP[0-9]{4}.pdf', '', links)
links = re.sub("filename", '', links)
links = re.sub(r"url", '', links)
links = re.sub(r"\x0c", '', links)
#using python's regular expression to extract URLs from the text

links = links.split()
links

['https://drive.google.com/uc?export=download&id=1en9aeKBC5kxIJ5-40Hng1ONLl-s-3-Wz',
 'https://drive.google.com/uc?export=download&id=1h1ZApxxSZ_0HlGiBmqnFQvbWysAti-UR',
 'https://drive.google.com/uc?export=download&id=1u2TPAS9i-gRoQAYRVbIaDdY_2ktejzRL',
 'https://drive.google.com/uc?export=download&id=1rdknW_Zh7pWWxk920Lq0sgxXPYVa9HdV',
 'https://drive.google.com/uc?export=download&id=1VBdqTsR-1bWuHUUAGVHBxnselpxe4jlQ',
 'https://drive.google.com/uc?export=download&id=1ie4Jz3CplWE3jSyXQ0tzXWq6n_BNfh4d',
 'https://drive.google.com/uc?export=download&id=1hYyivtsETqrkYvI-Il6T9MvEHBtVdmXA',
 'https://drive.google.com/uc?export=download&id=1Y6ISUUQY9eb7ClcqijYMVWEH6GLmyszx',
 'https://drive.google.com/uc?export=download&id=1SGdLJfnrL2qZq0jaLzBo4Xu_msD5x-Cv',
 'https://drive.google.com/uc?export=download&id=10jdrmb06Eo4eM0sH9zF5EtN7MCIOQxcZ',
 'https://drive.google.com/uc?export=download&id=1tPOQS-G-lPREZpcAFyh2SC2-v8AE6EC6',
 'https://drive.google.com/uc?export=download&id=1CZenT8GPErdPM_t

### Downloading files from each URL

In [5]:
#Downloading files by iterating through the Link list and assigning names to them using a counter named "i"

i = 1
for item in links:
    i = str(i)
    url = item
    pdfName = str(r"./PDF/" + i + r".pdf")
    theFile = requests.get(url)
    open(pdfName, 'wb').write(theFile.content)
    i = int(i)
    i += 1

## 2. Reading the PDF files into text and extracting the required entities

### a. Extracting paper bodies from each of downloaded PDFs

In [6]:
#Extracting the Paper bodies from each pdf and making a list out of 200 bodies

bodies = [''] * 200
#initialising list to store paper bodies frome each pdf

for i in range(1,201):
    i = str(i)
    fileName = "./PDF/" + i + ".pdf"
    fileContents = convert_pdf_to_txt(fileName)
    fileContents = re.sub('\n', ' ', fileContents)
    start = re.search('1 Paper Body', fileContents).start()
    end = re.search('2 References', fileContents).end()
    fileContents = fileContents[start:end-14]
    fileContents = re.sub('1 Paper Body', '', fileContents)
    i = int(i)
    bodies[i-1] = fileContents

### b. Sentence Segmentation 

In [7]:
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
#loading packages for sentence detection

segmenting = bodies

for i in range(len(bodies)):
    segmenting[i] = sent_detector.tokenize(bodies[i].strip())

### c.  first word of each Sentence to lower case

In [8]:
lowerCasing = segmenting

for i in range(len(segmenting)):
    for j in range(len(segmenting[i])):
        lowerCasing[i][j] = segmenting[i][j][0].lower() + segmenting[i][j][1:]

### d. Word Tokenisation

In [9]:
#Splitting sentences into tokens based on the given regular expression

tokens = [''] * len(lowerCasing)

for i in range(len(lowerCasing)):
    tokens[i] = str(lowerCasing[i])
    tokens[i] = re.findall(r"[A-Za-z]\w+(?:[-'?]\w+)?", tokens[i])

###  e. Finding Bigrams

In [10]:
bigramTokens = [[] for i in range(len(tokens))]

token = [''] * len(tokens)

for i in range(len(tokens)):
    token[i] = str(tokens[i])
    token[i] = re.sub(r"\[", '', token[i])
    token[i] = re.sub(r"\]", '', token[i])
    token[i] = re.sub(r"\'", '', token[i])
    token[i] = re.sub(r"\'", '', token[i])
    token[i] = re.sub(r"\,", '', token[i])
    
    n = 2
    twograms = ngrams(token[i].split(), n)
    #using ngrams function to find bigrams
    for grams in twograms:
        bigramTokens[i].append(grams)

        
uniqueBigramTokens = [''] * len(bigramTokens)

for i in range(len(bigramTokens)):
    uniqueBigramTokens[i] = list(set(bigramTokens[i]))
 
    #Converting unique tokens into x_y format for calculating IDFs

for i in range(len(uniqueBigramTokens)):
    for j in range(len(uniqueBigramTokens[i])):
        uniqueBigramTokens[i][j] = uniqueBigramTokens[i][j][0] + "_" + uniqueBigramTokens[i][j][1]


In [11]:
bigramTk = bigramTokens

In [12]:
#Creating a single list of Bigrams
bigramCombined = []

for i in range(len(bigramTokens)):
    for item in bigramTokens[i]:
        bigramCombined.append(item)

## f. Filtering Bigrams

### Removing context independent stop words and word with length less than 3 from the Bigrams list

In [13]:
#reading the given context independent stop word list
stopWords = open('stopwords_en.txt', 'r').read()
stopWords = stopWords.split('\n')

bigramTokens_NoStopWords = []

for i in range(len(bigramCombined)):
    x,y = bigramCombined[i]
    if x.lower() in stopWords or y.lower() in stopWords or len(x) < 3 or len(y) < 3:
        pass
    else:
        bigramTokens_NoStopWords.append(x + "_" + y)

### Removing context dependent stop words and rare tokens

In [14]:
#Calculating frequency of bigrams and taking top 4000 (we need only 200, taking top 4000 to reduce computation time)

bigramFreqs = FreqDist(bigramTokens_NoStopWords).most_common(4000)
#bigramFreqs

In [15]:
#Calculating IDF for the bigrams

bigramidf = {}
#initialising a dictionary to store IDF values for each bigram

for item in bigramFreqs:
    x,y = item
    count = 0
    for i in range(len(uniqueBigramTokens)):
        if x in uniqueBigramTokens[i]:
            count += 1
        bigramidf[item] = count

#bigramidf

In [16]:
#Filtering bigrams and removing bigrams with rare tokens (threshold = 3%) and context-independent tokens (threshold = 95%) 
bigramIDF = {}

upperThreshold = 0.95 * len(tokens)
lowerThreshold = .03 * len(tokens)

for key in bigramidf:
    if bigramidf[key] <= upperThreshold and bigramidf[key] > lowerThreshold:
        bigramIDF[key] = bigramidf[key]
        
#bigramIDF

In [17]:
#Taking only bigram and frequence and then sorting them

topBigrams = []

for key in bigramIDF:
    x,y = key
    tup = (x,y)
    topBigrams.append(tup)

topBigrams = sorted(topBigrams, key = lambda tup: (-tup[1], tup[0]))
topBigrams = topBigrams[:200]
#topBigrams

In [18]:
bigrams = [] 

for i in range(len(topBigrams)):
        bigrams.append(topBigrams[i][0])
#bigrams    

## g. Finding Unigrams

### Removing Stop Words and Words with a length of less than 3

In [19]:
tokens2 = [[]for i in range(len(tokens))]
tokens3 = [[]for i in range(len(tokens))]

for i in range(len(tokens)):
    for item in tokens[i]:
        if item not in stopWords:
            tokens2[i].append(item)

for i in range(len(tokens2)):
    for item in tokens2[i]:
        if len(item) > 3:
            tokens3[i].append(item)            

### Stemming tokens using PorterStemmer

In [20]:
stemmer = PorterStemmer()
#loading stemmer function 

stemmedTokens = [[] for i in range(len(tokens))]
#creating a list to store stemmed tokens

for i in range(len(tokens3)):
    for item in tokens3[i]:
        if item is item.lower():
            item = stemmer.stem(item)
            stemmedTokens[i].append(item)
        else:
            stemmedTokens[i].append(item)
#for loop to stem each word in tokens2

### Adding all unigrams into a single list

In [21]:
stemmedTokensCombined = []
#crearting a list to store all unigrams

for i in range(len(stemmedTokens)):
    for items in stemmedTokens[i]:
        stemmedTokensCombined.append(items)


In [22]:
#creating a set of tokens for each pdf file

uniqueStemmedTokens = [''] * len(stemmedTokens) 
#creating an empty list to store unique tokens from each pdf

for i in range(len(stemmedTokens)):
    uniqueStemmedTokens[i] = list(set(stemmedTokens[i]))

### Filtering out Rare tokens and context dependent stop words from Unigrams

In [23]:
#calculating frequency of unigrams and taking top 4000 (we need only 200, taking top 4000 to reduce computation time)

unigramFreqs = FreqDist(stemmedTokensCombined).most_common()
# unigramFreqs

### Calculating IDF for unigrams

In [24]:
unigramidf = {}
#initialising a dictionary to store IDF values for each unigram

for x,y in unigramFreqs:
    count = 0
    for items in uniqueStemmedTokens:
        if x in items:
            count += 1
    unigramidf[(x,y)] = count
    

In [25]:
#Filtering unigrams and removing bigrams with rare tokens (threshold = 3%) and context-independent tokens (threshold = 95%) 

unigramIDF = {}
#creating dictionary to store filtered unigrams

upperThreshold = 0.95 * len(tokens)
lowerThreshold = .03 * len(tokens)

for key in unigramidf:
    if unigramidf[key] > upperThreshold or unigramidf[key] < lowerThreshold:
        pass
    else:
        unigramIDF[key] = unigramidf[key]

# unigramIDF

In [26]:
unigramIdf = []

for key in unigramIDF:
    unigramIdf.append(key)

In [27]:
unigrams = []

for i in range(len(unigramIdf)):
    unigrams.append(unigramIdf[i][0])


## h. Creating vocabulary (unigrams and bigrams combined)

In [28]:
vocab = unigrams + bigrams
#combining words in unigrams and bigrams and storing it in vocab

vocab = sorted(vocab)
#sorting words in vocab

In [29]:
vocabDict = {}
#creating a dictionary to store words in vocan with a count

i = 0
for item in vocab:
    vocabDict[item] = i
    i+= 1

### Creating a list with ID's assigned to the words 

In [30]:
#initialising tokensID to store  token ids for independent bodies
tokensID = [[] for i in range(len(tokens))]

#Iterating through items in tokens and appending the same using their ID's
for i in range(len(tokens)):
    for item in tokens[i]:
        if item in unigrams:
            tokensID[i].append(vocabDict[item])
# tokensID

##### Substituting words with ID for bigrams

In [31]:
#Creating a list of bigram tokens with the IDs assigned to them
bigTokens = bigramTk
bigramTokens = [[] for i in range(len(bigTokens))]

for i in range(len(bigTokens)):
    for x,y in bigTokens[i]:
        #Converting them to the format saved in the dictionary
        item = x + "_" + y
        bigramTokens[i].append(item)
# bigramTokens

In [32]:
bigramID = [[] for i in range(len(bigramTokens))]

#Storing them as IDs 
for i in range(len(bigramTokens)):
    for item in bigramTokens[i]:
        if item in bigrams:
            bigramID[i].append(vocabDict[item])

# bigramID

##### Unigram final counts

In [33]:
#Taking frequency count of the unigrams from tokensID
unigramFinalCount = [[] for i in range(len(tokensID))]

for i in range(len(tokensID)):
    unigramFinalCount[i] = FreqDist(tokensID[i]).most_common()

##### Bigram final count

In [34]:
#Taking frequency count of the bigrams from tokensID
bigramFinalCount = [[] for i in range(len(bigramID))]

for i in range(len(bigramID)):
    bigramFinalCount[i] = FreqDist(bigramID[i]).most_common()

In [35]:
#Retrieving the unique file codes to create the required vector
fileID =  convert_pdf_to_txt("./Group058.pdf")

fileID = re.sub('https(.*?)\\n', '', fileID)
fileID = re.sub('filename\\n\\nurl\\n\\n', '', fileID)
fileID = re.sub('\\n', '', fileID)
fileID = re.sub('\\x0c', '', fileID)
fileID = re.sub('\.pdf', '', fileID)
fileID = fileID.split()

# fileID

In [36]:
#Storing the vector contents into a variable called vector (Adding contents from Unigram and Bigram)
vector = ''

for i in range(len(fileID)):
    vector = vector + fileID[i] + ","
    for x,y in unigramFinalCount[i]:
        vector = vector + str(x) + ":" + str(y) + ","
    for x,y in bigramFinalCount[i]:
        vector = vector + str(x) + ":" + str(y) + ","
    vector = vector + "\n"
    

In [37]:
#Line before new file starts
vector = re.sub(',\n', '\n', vector)

In [38]:
#Opening file and writing contents of variable vector into it
vectorFile = open("./Group058_count_vectors.txt","w+")
vectorFile.write(vector)

826729

In [39]:
# vocabDict

In [40]:
#Writing the vocabulary (The dictionary with words-[Unigrams and Bigrams] and unique ID)
vocabStr = ''
voc = open("./Group058_vocab.txt", "w+", encoding= "utf-8")

for key in vocabDict:
    vocabssss = str(key) + ":" + str(vocabDict[key])  
    voc.write(vocabssss + '\n')
    


# Task 2: Generating CSV file containing top 10 most frequent words in titles, authors and abstracts

## 1. Finding Titles

### a. Extracting Titles from pdf files

In [41]:
#Extracting the Titles from each pdf 

titles = [''] * 200
#initialising list to store titles from each pdf file

for i in range(1,201):
    i = str(i)
    fileName = "./PDF/" + i + ".pdf"
    #reading each pdf in the PDF folder
    
    fileContents = convert_pdf_to_txt(fileName)
    #coverting pdf to file
    
    fileContents = re.sub('\n', ' ', fileContents)
    end = re.search('Authored by:', fileContents).end()
    title = fileContents[0:end-14]
    #finding start and end index of titles in each pdf and then extracting the title
        
    i = int(i)
    titles[i-1] = title
    #storing titles

## b. Performing Tokenization on Titles

In [42]:
title_tokens = [''] * len(titles)
#initialising list to store title tokens

for i in range(len(titles)):
    title_tokens[i] = str(titles[i])
    title_tokens[i] = re.sub('-\s','',title_tokens[i])
    title_tokens[i] = re.findall(r"[A-Za-z]\w+(?:[-'?]\w+)?", title_tokens[i])


## c. Normalising title tokens to lower case

In [43]:
for i in range(len(title_tokens)):
    for j in range(len(title_tokens[i])):
        title_tokens[i][j] =  title_tokens[i][j].lower()

## d. Removing context-independent stop words

In [44]:
title_tokens2 = [[]for i in range(len(title_tokens))]
#initialising list to store filtered tokens

for i in range(len(title_tokens)):
    for words in title_tokens[i]:
        if words not in stopWords:
            title_tokens2[i].append(words)
#filtering out stopwords

## e. Creating a word count on Unique Tokens

In [45]:
tkn = []
#initialising list to store unique tokens

for i in range(len(title_tokens2)):
    for word in title_tokens2[i]:
        tkn.append(word)
#storing all tokens from each pdf into a single variable         

tkn_set = list(set(tkn))
#storing unique tokens in tkn_set

In [46]:
l = len(tkn_set)
val = [0]*l
#creating list of values to be zipped into a dictionary

title_dict = dict(zip(tkn_set,val))
#creating a dictionary for title tokens

for word in tkn:
    if word in tkn_set:
        title_dict[word] +=1
#generating count for each word in the dictionary

## f. Creating a data frame with top 10 most frequently used words in Titles

In [47]:
df = pd.DataFrame.from_records([title_dict])
df = pd.melt(df)
#creating data frame for title tokens


In [48]:
df = df.sort_values(['value','variable'], ascending = [False,True])
#sorting words in dataframe in decreasing order of frequency and alphabetically 

df = df.drop_duplicates(subset=['value'], keep='first').reset_index().drop(columns=['index'])
#removing duplicates for each frequences 

df = df.reset_index().drop(df[df.index>9].index).drop(columns=['index'])
#getting top ten most frequently used tokens in titles

df

Unnamed: 0,variable,value
0,learning,50
1,models,16
2,analysis,15
3,model,14
4,inference,13
5,networks,12
6,optimization,10
7,neural,9
8,bayesian,8
9,clustering,7


## 2. Extracting Authors

### a. Extracting Authors from each PDF file

In [49]:
#Extracting the Authors from each pdf 

authors = [''] * 200
#initialising list to store authors from each pdf file

for i in range(1,201):
    i = str(i)
    fileName = "./PDF/" + i + ".pdf"
    #reading each pdf in the PDF folder
    
    fileContents = convert_pdf_to_txt(fileName)
    #coverting pdf to file
    
    start = re.search('Authored by:', fileContents).start()
    end = re.search('\nAbstract', fileContents).end()
    author = fileContents[start+14:end-10]
    #finding start and end index of authors in each pdf and then extracting the authors
    
    i = int(i)
    authors[i-1] = author


## b. Tokenizing Individual Authors

In [50]:
authors1 = [''] * 200
#initialising list to store indivual author names

for i in range(len(authors)):
    authors1[i] = authors[i].split('\n')
    authors1[i] = list(filter(None, authors1[i]))


## c. Creating a Word Count on Unique Author Names

In [51]:
all_authors = []
#intialising list to store unique author names

for i in range(len(authors1)):
    for word1 in authors1[i]:
        all_authors.append(word1)
        
#creating a single list containg author names from all pdfs

author_set = list(set(all_authors))
#creating a list of unique authors

In [52]:
l = len(author_set)
val = [0]*l
#initialising list of values to be zipped with unique author names

author_dict = dict(zip(author_set,val))
#creating an author dictionary

for word in all_authors:
    if word in author_set:
        author_dict[word] +=1
#generating a count for each author name from all pdfs

## f. Creating a data frame with top 10 most frequent authors

In [53]:
authors_df = pd.DataFrame.from_records([author_dict])
authors_df = pd.melt(authors_df)
#creating data frame for authors

authors_df.head()

Unnamed: 0,variable,value
0,Aapo Hyv?rinen,1
1,Aaron Dennis,1
2,Aaron Roth,1
3,Abbas Bazzi,1
4,Abhinav Gupta,1


In [54]:
authors_df = authors_df.sort_values(['value','variable'], ascending = [False,True]).reset_index().drop(columns=['index'])
#sorting words in dataframe in decreasing order of frequency and alphabetically 

authors_df = authors_df.reset_index().drop(authors_df[authors_df.index>9].index).drop(columns=['index'])
#getting top ten most frequently used authors

authors_df

Unnamed: 0,variable,value
0,Christopher R?,4
1,Zoubin Ghahramani,4
2,Inderjit S. Dhillon,3
3,Joshua B. Tenenbaum,3
4,Lawrence Carin,3
5,Masashi Sugiyama,3
6,Michael W. Mahoney,3
7,Ryan P. Adams,3
8,Shinichi Nakajima,3
9,Stefano Ermon,3


## 3. Extracting Abstracts

### a. Extracting Abstracts from each PDF file

In [55]:
#Extracting the Abstracts from each pdf 

abstracts = [''] * 200
#initialising list to store individual abstracts

for i in range(1,201):
    i = str(i)
    fileName = "./PDF/" + i + ".pdf"
    #reading each pdf in the PDF folder
    
    fileContents = convert_pdf_to_txt(fileName)
    #coverting pdf to file
    
    fileContents = re.sub('\n', ' ', fileContents)
    start = re.search('Abstract', fileContents).start()
    end = re.search('1 Paper Body', fileContents).end()
    abstract = fileContents[start+8:end-12]
    #finding start and end index of abstracts in each pdf and then extracting the abstracts
    
    i = int(i)
    abstracts[i-1] = abstract.strip()


## b. Performing Sentence Segmentation for Abstracts

In [56]:
abstract_tk1 = abstracts
#creating duplicate list

for i in range(len(abstract_tk1)):
    abstract_tk1[i] = sent_detector.tokenize(abstract_tk1[i].strip())
#Sentence segmentation on individual

## c. Normalising first letter of each sentence to lower-case

In [57]:
for i in range(len(abstract_tk1)):
    for j in range(len(abstract_tk1[i])):
        abstract_tk1[i][j] = abstract_tk1[i][j][0].lower() + abstract_tk1[i][j][1:]
#normalising the first word of each line to lower case

## d. Performing Word Tokenisation on Abstracts

In [58]:
abstract_tokens = [[] for i in range(len(abstract_tk1))]
#initialising a list to store abstract tokens

for i in range(len(titles)):
    abstract_tokens[i] = str(abstract_tk1[i])
    abstract_tokens[i] = re.sub('-\s','',abstract_tokens[i])
    abstract_tokens[i] = re.findall(r"[A-Za-z]\w+(?:[-'?]\w+)?", abstract_tokens[i])


## e. Removing context independent Stop Words

In [59]:
abstract_tokens2 = [[]for i in range(len(abstract_tokens))]
#initialising a list to store filtered tokens

for i in range(len(abstract_tokens)):
    for words in abstract_tokens[i]:
        if words.lower() not in stopWords:
            abstract_tokens2[i].append(words)
#filtering out stopwords

## f. Creating a Word Count on unique words

In [60]:
all_words = []
#initialising a list to store unique words

for i in range(len(abstract_tokens2)):
    for word in abstract_tokens2[i]:
        all_words.append(word)
#appending all words from each pdf into a single variable

word_set = list(set(all_words))
#getting unique set of words

In [61]:
l = len(word_set)
val = [0]*l
#initialising list to be zippef with word set

word_dict = dict(zip(word_set,val))
#creating a dictionary for unique words

for word in all_words:
    if word in word_set:
        word_dict[word] +=1
#generating word count for each word in the dictionary

## g. Creating a data frame with top 10 most frequently used words in Abstracts

In [68]:
word_df = pd.DataFrame.from_records([word_dict])
word_df = pd.melt(word_df)
#creating a dataframe for all words in abstracts

word_df.head()

Unnamed: 0,variable,value
0,ADMM,3
1,ADMiRA,1
2,AMP,1
3,Abstract,1
4,Action,1


In [69]:
word_df2 = word_df.sort_values(['value','variable'], ascending = [False,True])
#sorting words in dataframe in decreasing order of frequency and alphabetically 

word_df2 = word_df2.drop_duplicates(subset=['value'], keep='first').reset_index().drop(columns=['index'])
#removing duplicates for each frequences

word_df2 = word_df2.reset_index().drop(word_df2[word_df2.index>9].index).drop(columns=['index'])
#getting top ten most frequently used words in abstracts

word_df2

Unnamed: 0,variable,value
0,learning,182
1,model,176
2,algorithm,134
3,data,131
4,problem,119
5,algorithms,111
6,show,109
7,models,94
8,methods,92
9,approach,81


## 4. Generating a CSV file containing top 10 most frequently used terms in titles, frequent authors, and frequently used terms in abstracts

In [70]:
df2 = df.reset_index().drop(columns=['value'])
authors_df2 = authors_df.reset_index().drop(columns=['value'])
word_df2 = word_df.reset_index().drop(columns=['value'])
#dropping the count from each of the dataframes

stats_df = pd.merge(word_df2, df2, on='index')
stats_df2 = pd.merge(stats_df, authors_df2, on='index').drop(columns=['index']).rename(columns ={'variable_x':'top10_terms_in_abstracts','variable_y':'top10_terms_in_titles','variable':'top10_authors'}).set_index('top10_terms_in_abstracts')
#merging stats into a single dataframe

stats_df2


Unnamed: 0_level_0,top10_terms_in_titles,top10_authors
top10_terms_in_abstracts,Unnamed: 1_level_1,Unnamed: 2_level_1
ADMM,learning,Christopher R?
ADMiRA,models,Zoubin Ghahramani
AMP,analysis,Inderjit S. Dhillon
Abstract,model,Joshua B. Tenenbaum
Action,inference,Lawrence Carin
Adaptive,networks,Masashi Sugiyama
Al-Sc,optimization,Michael W. Mahoney
Alan,neural,Ryan P. Adams
Allocation,bayesian,Shinichi Nakajima
Analysis,clustering,Stefano Ermon


In [72]:
stats_df2.to_csv('./Group058_stats.csv')
#generating csv file