In [1]:
import pandas as pd 
import tensorflow
from bs4 import BeautifulSoup
import re
import timeit
import requests
import wikipedia

In [4]:
def crawl_year(year, yearContent, df):
    """ Crawl the different years of the wikipedia's archieved deletion discussions page and store the content
        in a Data Frame. 

        Args: 
            year: the year in which the archived articles were flagged for deletion
            yearContent: the html content containing the year (a h2 tag)
            df: the data frame where the data are stored in the format year | month | title | Id | Gender

        Returns:
            a data frame
    """
    for monthContent in yearContent.find_next_siblings(limit=24):
        if monthContent.name == "h2":
            # Crawl only this year. If the year doesn't yet have 12 months(e.g. 2019), don't go for more.
            break
        elif monthContent.name == "h3":
            month = monthContent.get_text().split(str(year)+" ")[1].split("[")[0]
            print("Month",month)
        elif monthContent.name == "ul":
            # Go through the list of days
            for dayRelative in monthContent.find_all("a"):
                print(dayRelative['href'])
                dayPageLink = "https://en.wikipedia.org/"+dayRelative['href']
                try :
                    dayPage = requests.get(dayPageLink)
                except requests.exceptions.RequestException as e:
                    continue
                soupPage = BeautifulSoup(dayPage.content, "html.parser")

                if dayPage.status_code == 200:
                    
                    # Get the number of articles in a particular day
                    # From the beginning till the june 2006 wikipedia has a different HTML code on this
                    if (int(year) < 2006 | ((int(year) == 2006) & (month in ['June', 'May', 'April', 'March', 'February', 
                                                                          'January']))):
                        try:
                            articlesLength = float(soupPage.find_all("li", {"class": "toclevel-2"})[-1].get_text().split(" ")[0])
                        except Exception:
                            continue
                        nrLength = len(str(articlesLength).split(".")[1])
                        if nrLength == 2:
                            articlesLength = round(articlesLength%1 * 100,2)
                        elif nrLength == 3:
                            articlesLength = round(articlesLength%1 * 1000,3)
                    else:
                        try:
                            articlesLength = float(soupPage.find_all("ul")[2].find_all("li")[-1].get_text().split(" ")[0])
                        except ValueError:
                            try:
                                articlesLength = float(soupPage.find_all("ul")[0].find_all("li")[-1].get_text().split(" ")[0])
                            except ValueError:
                                continue
                        numberDec = round(articlesLength % 1 * 10, 2)
                        if int(numberDec) != numberDec:
                            numberDec *= 10
                        articlesLength = int(articlesLength) + numberDec
                    
                    print("Articles to be crawled in this page: ",articlesLength)

                    # Every article is located in an <h3> tag
                    for article in soupPage.find_all("h3", limit = articlesLength):
                        try:
                            # Don't read deleted articles
                            if article.find("a")['title'].find("(page does") == -1:
                                articleTitle = article.get_text().split("[")[0]
                                pageLink = "https://en.wikipedia.org"+article.find("a")['href']
                                df = crawl_article(year, month, articleTitle, pageLink, df)
                        except Exception as e:
                            continue
    return df


In [5]:
def crawl_article(year, month, title, pageLink, df):
    """ Crawl the content of the corresponding dbpedia page of a wikipedia article in order to get its id and gender.
        Store an entry in the dataframe.

        Args:
            year: the year in which the current article was flagged for deletion
            month: the month in which the current article was flagged for deletion
            articleTitle: the title of the article flagged for deletion
            pageLink: the wikipedia link of the article
            df: the data frame where the data are stored in the format year | month | title | Id | gender

        Returns:
            A data frame
    """
    url = "http://dbpedia.org/page/"+pageLink.split("/wiki/")[1]
    try :
        dbpediaPage = requests.get(url)
    except requests.exceptions.RequestException as e:
        return df
    soup = BeautifulSoup(dbpediaPage.content, "html.parser")
    wikiIdTag = soup.find("span", {"property":"dbo:wikiPageID"})
    genderTag = soup.find("span", {"property":"foaf:gender"})
    if genderTag == None:
        # Not a person
        return df
    dic = {"Year":year, "Month":month, "Tile":title, "Id": wikiIdTag.contents[0]
           , "Gender":genderTag.contents[0]}
    if df.empty:
        df = pd.DataFrame(data=dic, index=[0])
    else:
        df_temp = pd.DataFrame(data=dic, index=[0])
        df = pd.concat([df, df_temp])
    return df

In [6]:
startTime = timeit.default_timer()

seedURL = "https://en.wikipedia.org/wiki/Wikipedia:Archived_deletion_discussions#Deletion_discussions/"
archivePage = requests.get(seedURL)
soup = BeautifulSoup(archivePage.content, "html.parser")

# Get the year
years = []
yearContents = []
for yearContent in soup.find_all("h2", limit=17):
    year = yearContent.get_text().split("[")[0]
    if year == "Contents":
        continue
    years.append(year)
    yearContents.append(yearContent)

# print(years[10])
df = pd.DataFrame()
df = crawl_year(years[13], yearContents[13], df)

elapsedTime = timeit.default_timer() - startTime
print("Crawl time ", elapsedTime)

Month December
/wiki/Wikipedia:Articles_for_deletion/Log/2006_December_31


TypeError: unsupported operand type(s) for &: 'int' and 'str'

In [None]:
df

In [23]:
export_csv = df.to_csv (r'C:\Users\neiral\WS_semester2\CSS\nominatedForDeletion\2014.csv', index = None, header=True)

In [2]:
df2 = pd.read_csv('2013.csv')
df = pd.read_csv('2014.csv')

In [10]:
# print(df2)
print(len(df2))
print(len(df))

2592
3231


In [5]:
articlesLength = 1.195
print(articlesLength%1*1000)
numberDec = round(articlesLength % 1 * 1000, 3)
print(numberDec)
if numberDec >= 10 & isinstance(numberDec, int):
    temp = numberDec / 10
    if isinstance(temp, int):
        numberDec = temp
else:
    while isinstance(numberDec, float):
        numberDec *= 10
print(numberDec)
articlesLength = int(articlesLength) + numberDec
print("Articles to be crawled in this page: ",articlesLength)


195.00000000000006
195.0
195.0
Articles to be crawled in this page:  196.0


In [22]:
articlesLength = 1.19
articlesLength = round(articlesLength%1 * 1000,3)
print(articlesLength)

190.0


In [27]:
articlesLength = 1.193
nrLength = len(str(articlesLength).split(".")[1])
if nrLength == 2:
    articlesLength = round(articlesLength%1 * 100,2)
elif nrLength == 3:
    articlesLength = round(articlesLength%1 * 1000,3)
print(articlesLength)

193.0
