# Assignment2 - Text Analytics

Purpose: Implement Bag of Words on wikilit webpage and get count the occurrences of all words in the documents. Construct a bag-of-words matrix.

In [1]:
# Import required libraries
# Import urllib.request library for opening URLs
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
import numpy as np

# 1) Data Preparation & Analysis
Get and read the wikilit web page and show the first 2000 characters to make sure everything is working correctly.

In [2]:
webpage = 'http://wikilit.referata.com/wiki/Special:Ask/-5B-5BCategory:Publications-5D-5D/-3FHas-20author%3DAuthor(s)/-3FYear/-3FPublished-20in/-3FAbstract/-3FHas-20topic%3DTopic(s)/-3FHas-20domain%3DDomain(s)/_format%3D-20csv/limit%3D-20100/offset%3D0'
webcontent = urlopen(webpage).read()

print(webcontent[:2000])

b'<!DOCTYPE html>\n<html lang="en" dir="ltr" class="client-nojs" version="HTML+RDFa 1.0">\n<head>\n<meta charset="UTF-8"/>\n<title>[[Category:Publications]]</title>\n<script>document.documentElement.className = document.documentElement.className.replace( /(^|\\s)client-nojs(\\s|$)/, "$1client-js$2" );</script>\n<script>window.RLQ = window.RLQ || []; window.RLQ.push( function () {\nmw.config.set({"wgCanonicalNamespace":"Special","wgCanonicalSpecialPageName":"Ask","wgNamespaceNumber":-1,"wgPageName":"Special:Ask/-5B-5BCategory:Publications-5D-5D/-3FHas-20author=Author(s)/-3FYear/-3FPublished-20in/-3FAbstract/-3FHas-20topic=Topic(s)/-3FHas-20domain=Domain(s)/_format=-20csv/limit=-20100/offset=0","wgTitle":"Ask/-5B-5BCategory:Publications-5D-5D/-3FHas-20author=Author(s)/-3FYear/-3FPublished-20in/-3FAbstract/-3FHas-20topic=Topic(s)/-3FHas-20domain=Domain(s)/ format=-20csv/limit=-20100/offset=0","wgCurRevisionId":0,"wgRevisionId":0,"wgArticleId":0,"wgIsArticle":!1,"wgIsRedirect":!1,"wgAction

## 1.1) Pseudo code
1) First look at the html version of the WikiLit page  
    * The table content is under the table class "sortable wikitable smwtable". Under this table, we have table heading th, table body tbody, table rows trs with classes row-odd and row-even  
    * Under each 'tr' class, we have table division td clases as follows:  
        - URL of the Title
        - Author(s)	
        - Year	
        - Published in	
        - Abstract	
        - Topic(s)	
        - Domain(s)
2) Read the html via BeautifulSoup  
3) Extract all info under the table class "sortable wikitable smwtable". This will be one single list.  
4) Now, read this list and extract info corresponding to the different td classed and export to a csv.  

In [3]:
resp = requests.get(webpage)
soup = BeautifulSoup(resp.content, 'lxml')

## 1.2) HTML Structure
<img src="html_Wikilit.png" width=1000px>

In [36]:
print(type(soup))
#print(soup)

<class 'bs4.BeautifulSoup'>


## 1.3) Selecting all data for the tag with class =  sortable wikitable smwtable& creating wikitable
Wikitable is part of the result and has only the tags under table. The other tags are
    - thead - table header
    - tr - table row
    - td - cell having the classes Author(s), Year, Published in, Abstract, Topic(s), Domain(s)

In [37]:
table_classes = {"class": ["sortable wikitable smwtable"]}
wikitable = soup.findAll("table", table_classes)
#wikitable

In [6]:
type(wikitable) 

bs4.element.ResultSet

In [7]:
column_headers = [th.getText() for th in soup.findAll('th')]
column_headers

['\xa0',
 'Author(s)',
 'Year',
 'Published in',
 'Abstract',
 'Topic(s)',
 'Domain(s)']

# 2) Text Data Cleaning
    - Read the contents of the wikitable
    - get all the rows with the tag tr
    - within each row class tr, get the cells corresponding to tags td and th. Note that th class has the table header and td has all the non-header data
    - within each cell, get just the text data
    - clean the text data stripping unwanted spaces
    - Write this text data(list) to a csv file.    

In [8]:
def process_text(cell):
    """ This function reads the cell from wikitable, extracts the text data
    cleans the text and returns the list of text that was extracted from each cell td class"""
    text_list = []
    for data in cell:
        text = data.findAll(text=True)
        clean_text = (
            ''.join(text) 
            .strip()
        )
        text_list += [clean_text]
    return text_list            

In [9]:
## Create the code/logic and write to the csv file

with open('wikilit_fin.csv', 'w', newline= '', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',',
                            quoting=csv.QUOTE_ALL)
    # Each cell consists of <th> or <td> tag
    for index, table in enumerate(wikitable):

        for row in table.findAll('tr'):
            cell = row.findAll(['th', 'td'])
            if cell:
                wiki_text = process_text(cell)
                csvwriter.writerow(wiki_text)


In [38]:
## Create a new dataframe from the csv file having wikilit information

wiki_df = pd.read_csv('wikilit_fin.csv', encoding='utf-8')
wiki_df.head()

Unnamed: 0.1,Unnamed: 0,Author(s),Year,Published in,Abstract,Topic(s),Domain(s)
0,"'Wikipedia, the free encyclopedia' as a role m...",Gordon Müller-SeitzGuido Reger,2010,International Journal of Technology Management,Accounts of open source software (OSS) develop...,Contributor motivationPolicies and governanceS...,Information systems
1,A 'resource review' of Wikipedia,Cormac Lawler,2006,Counselling & Psychotherapy Research,"The article offers information on Wikipedia, a...",Miscellaneous topics,Information systems
2,A Persian web page classifier applying a combi...,Mojgan FarhoodiAlireza YariMaryam Mahmoudi,2009,International Journal of Information Studies,There are many automatic classification method...,Text classification,Computer science
3,A Wikipedia literature review,Owen S. Martin,2010,ArXiv,This paper was originally designed as a litera...,Literature review,Mathematics
4,A Wikipedia matching approach to contextual ad...,Alexander N. PakChin-Wan Chung,2010,World Wide Web,Contextual advertising is an important part of...,Other information retrieval topics,Computer science


# 3) Count the occurrences of all words in the documents.

I have 2 scenarios to count the occurrences of all words in the documents:  
    * Create a wikilitcorpus with the values from all documents irrespective of fields. Avoid the common words by using stop words and create a bag of words matrix. Count the occurrences by not including the stopwords as well  
    * Create a corpus with just the abstract information from the dataframe  

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

## 3.1 Creating wikilit corpus with all words from the documents 

In [12]:
wikilitcorpus = wiki_df.values.tolist()
## Note that this is a list of lists and hence I need another logic to create a single list from this corpus
len(wikilitcorpus)

100

In [13]:
nw = []
for i in range(len(wikilitcorpus)):
    nw += wikilitcorpus[i]

In [14]:
# Using newwiki in countvectorizer results in error as it has non-string
# Converting a list to a string using map
newwiki = [''.join(map(str, nw))]

### 3.1.1 Without using stopwords

In [15]:
wvec = CountVectorizer()
wikilit_cnt = wvec.fit_transform(newwiki)

In [16]:
# Summarize
print(wikilit_cnt.shape)

(1, 3677)


In [17]:
# Bag of Words matrix
print(wikilit_cnt.toarray())

[[3 2 1 ... 1 1 1]]


In [39]:
#print(wvec.vocabulary_) ## This lists all the words and the corresponding indices. 

In [19]:
print(wvec.get_feature_names()[100:300])## This lists the list of words aka bag of words in alphabetical order.

['absence', 'abstract', 'abstractions', 'abstracts', 'academic', 'academics', 'accelerating', 'acceleration', 'acceptance', 'access', 'accessible', 'accommodation', 'according', 'account', 'accretion', 'accumulated', 'accuracy', 'accurate', 'accurately', 'achieve', 'achieved', 'achieves', 'achieving', 'acknowledged', 'acm', 'acquaintances', 'acquisition', 'across', 'act', 'acting', 'action', 'actions', 'activate', 'active', 'activities', 'activity', 'actual', 'actually', 'acute', 'adapt', 'adaptable', 'adaptation', 'adapted', 'adaptive', 'adaptively', 'add', 'added', 'addictive', 'adding', 'addition', 'additional', 'address', 'addressed', 'adhocracies', 'adhocracy', 'adjusting', 'adlerkrishnendu', 'adlerluca', 'administrative', 'admission', 'adolescent', 'adopt', 'adopted', 'adopting', 'ads', 'adult', 'advance', 'advanced', 'advances', 'advantage', 'advantages', 'advent', 'adversary', 'advertisers', 'advertising', 'advertisingalexander', 'aesthetic', 'affect', 'affected', 'affecting', 

In [20]:
freq_df = pd.DataFrame(wikilit_cnt.toarray(), columns=wvec.get_feature_names())
freq_df.head()

Unnamed: 0,000,0001,0004,04this,05,07,08,09,0robert,10,...,york,you,zeal,zealots,zengmaher,zhangweining,zhourong,zwol2009wsdm,årup,œplainâ
0,3,2,1,1,4,6,2,3,1,5,...,2,5,1,1,1,1,1,1,1,1


In [21]:
sumdf = freq_df.sum(axis=0)
pd.DataFrame({'Vocab': sumdf.index, 'Frequency': sumdf.values}).sort_values(by='Frequency', ascending=False).head()

Unnamed: 0,Vocab,Frequency
3282,the,938
2278,of,812
246,and,558
3325,to,439
1664,in,342


#### We can see that there are many common occuring words above. We need to filter them out.

### 3.1.2 Filter out common words by using stopwords 

In [22]:
wvecs = CountVectorizer(stop_words='english')
wikilit_stop_cnt = wvecs.fit_transform(newwiki)

In [23]:
print(wikilit_stop_cnt.shape)

(1, 3468)


#### Bag of words matrix

In [24]:
print(wikilit_stop_cnt.toarray())

[[3 2 1 ... 1 1 1]]


In [40]:
#print(wvecs.vocabulary_) ## This lists all the words and the corresponding indices. 

In [26]:
print(wvecs.get_feature_names()[100:300])## This lists the list of words aka bag of words in alphabetical order.

['abstractions', 'abstracts', 'academic', 'academics', 'accelerating', 'acceleration', 'acceptance', 'access', 'accessible', 'accommodation', 'according', 'account', 'accretion', 'accumulated', 'accuracy', 'accurate', 'accurately', 'achieve', 'achieved', 'achieves', 'achieving', 'acknowledged', 'acm', 'acquaintances', 'acquisition', 'act', 'acting', 'action', 'actions', 'activate', 'active', 'activities', 'activity', 'actual', 'actually', 'acute', 'adapt', 'adaptable', 'adaptation', 'adapted', 'adaptive', 'adaptively', 'add', 'added', 'addictive', 'adding', 'addition', 'additional', 'address', 'addressed', 'adhocracies', 'adhocracy', 'adjusting', 'adlerkrishnendu', 'adlerluca', 'administrative', 'admission', 'adolescent', 'adopt', 'adopted', 'adopting', 'ads', 'adult', 'advance', 'advanced', 'advances', 'advantage', 'advantages', 'advent', 'adversary', 'advertisers', 'advertising', 'advertisingalexander', 'aesthetic', 'affect', 'affected', 'affecting', 'affection', 'affective', 'afford

In [27]:
freq_df = pd.DataFrame(wikilit_stop_cnt.toarray(), columns=wvecs.get_feature_names())
freq_df.head()

Unnamed: 0,000,0001,0004,04this,05,07,08,09,0robert,10,...,yields,york,zeal,zealots,zengmaher,zhangweining,zhourong,zwol2009wsdm,årup,œplainâ
0,3,2,1,1,4,6,2,3,1,5,...,1,2,1,1,1,1,1,1,1,1


In [28]:
sumdf = freq_df.sum(axis=0)
pd.DataFrame({'Vocab': sumdf.index, 'Frequency': sumdf.values}).sort_values(by='Frequency', ascending=False).head()

Unnamed: 0,Vocab,Frequency
3371,wikipedia,261
1773,knowledge,95
1615,information,82
398,based,82
3347,web,72


## 3.2 Creating wikilit corpus with all words from only the abstract and filtering common words 

In [29]:
cvec = CountVectorizer(stop_words='english')
wikilit_abs_cnt = cvec.fit_transform(wiki_df['Abstract'])

In [30]:
# Bag of Words matrix
wikilit_abs_cnt.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [31]:
# Summarize
print(wikilit_abs_cnt.shape)

(100, 2748)


In [41]:
#print(cvec.vocabulary_) ## This lists all the words and the corresponding indices.

In [33]:
print(cvec.get_feature_names()[100:300]) ## This lists the list of words aka bag of words in alphabetical order.

['achieves', 'achieving', 'acknowledged', 'acquaintances', 'acquisition', 'act', 'acting', 'action', 'actions', 'activate', 'active', 'activities', 'activity', 'actual', 'actually', 'acute', 'adapt', 'adaptable', 'adaptation', 'adapted', 'adaptive', 'adaptively', 'add', 'added', 'addictive', 'adding', 'addition', 'additional', 'address', 'addressed', 'adhocracies', 'adhocracy', 'adjusting', 'administrative', 'admission', 'adopt', 'adopted', 'adopting', 'ads', 'advance', 'advanced', 'advances', 'advantage', 'advantages', 'advent', 'adversary', 'advertisers', 'advertising', 'aesthetic', 'affect', 'affected', 'affecting', 'affection', 'affective', 'afford', 'affordances', 'afforded', 'age', 'agencies', 'aggregate', 'aggregated', 'aggregation', 'ago', 'agreed', 'aim', 'aims', 'akin', 'algorithm', 'algorithms', 'allegedly', 'allow', 'allowed', 'allows', 'altered', 'alternative', 'alternatively', 'altruism', 'ambiguities', 'ambiguity', 'american', 'amid', 'amorphous', 'amounts', 'analogous',

#### Frequency table

In [34]:
freq_df = pd.DataFrame(wikilit_abs_cnt.toarray(), columns=cvec.get_feature_names())
freq_df.head()

Unnamed: 0,000,0001,0004,10,102,108,11,110,115,12,...,yahoo,yankees,year,years,yield,yields,york,zeal,zealots,œplainâ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
sumdf = freq_df.sum(axis=0)
pd.DataFrame({'Vocab': sumdf.index, 'Frequency': sumdf.values}).sort_values(by='Frequency', ascending=False).head()

Unnamed: 0,Vocab,Frequency
2704,wikipedia,237
1494,knowledge,74
331,based,70
246,article,62
2693,web,61


In [None]:
## From occurences to frequencies
from scikitlearn.