In [1]:
import re
import timeit
from urllib.parse import urljoin

import pandas as pd
import requests
import tensorflow
import wikipedia
from bs4 import BeautifulSoup

In [2]:
def crawl_year(year, yearContent, df):
    """ Crawl one year of Wikipedia's archived deletion discussions page and store the content
        in a Data Frame.

        The siblings following the year's heading are walked in order: every <h3> sets the
        current month, every <ul> is scanned for links to daily log pages, and the next <h2>
        ends the crawl.

        Args:
            year: the year in which the archived articles were flagged for deletion
            yearContent: the html content containing the year (a h2 tag)
            df: the data frame where the data are stored in the format year | month | title | content | about_women

        Returns:
            a data frame
    """
    month = None
    # At most 24 siblings are inspected -- presumably 12 (month heading, day list) pairs; verify
    # against the current page layout.
    for monthContent in yearContent.find_next_siblings(limit=24):
        try:
            if monthContent.name == "h2":
                # Crawl only this year. If the year doesn't yet have 12 months, don't go for more.
                break
            if monthContent.name == "h3":
                month = monthContent.get_text().split(str(year)+" ")[1].split("[")[0]
                print("Month",month)
            if monthContent.name == "ul":
                if month is None:
                    # A day list that precedes every month heading cannot be labelled; skip it
                    # before any page is fetched.
                    continue
                for dayRelative in monthContent.find_all("a"):
                    dayHref = dayRelative.get("href")
                    if not dayHref:
                        # An anchor without a target must not abort the rest of the month.
                        continue
                    print(dayHref)
                    df = _crawl_day(year, month, dayHref, df)
        except Exception as e:
            # Best effort: report the problem and move on to the next sibling.
            print("Skipping section after error:", repr(e))
            continue
    return df


def _crawl_day(year, month, dayHref, df):
    """ Crawl one daily log page and add the articles listed on it to the data frame.

        Args:
            year: the year passed on to crawl_article
            month: the month passed on to crawl_article
            dayHref: the href of the daily log page, as found on the archive page
            df: the data frame the articles are appended to

        Returns:
            a data frame (the input one if the page could not be fetched or parsed)
    """
    # urljoin avoids the double slash produced by concatenating a host ending in "/"
    # with an href starting with "/".
    dayPageLink = urljoin("https://en.wikipedia.org/", dayHref)
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        dayPage = requests.get(dayPageLink, timeout=30)
    except requests.exceptions.RequestException as e:
        print("Request failed for", dayPageLink, repr(e))
        return df

    # Check the status first so error pages are never parsed.
    if dayPage.status_code != 200:
        return df
    soupPage = BeautifulSoup(dayPage.content, "html.parser")

    try:
        # Find the last article to get the length of the articles: the first token of the
        # last <li> of the third <ul> is read as a number.
        articlesLength = int(soupPage.find_all("ul")[2].find_all("li")[-1].get_text().split(" ")[0])
    except (ValueError, IndexError):
        # The page does not have the expected structure; skip this day only.
        return df

    # Every article is located in an <h3> tag
    for article in soupPage.find_all("h3", limit=articlesLength):
        link = article.find("a")
        title = link.get("title") if link is not None else None
        # Don't read deleted articles: their link title contains "(page does"
        # (presumably Wikipedia's "(page does not exist)" marker). Headings without a
        # titled link are skipped as well instead of aborting the remaining days.
        if title is None or "(page does" in title:
            continue
        articleTitle = article.get_text().split("[")[0]
        try:
            df = crawl_article(year, month, articleTitle, df)
        except Exception as e:
            # One failing article must not discard the rest of the day.
            print("Skipping article", articleTitle, repr(e))
    return df


In [3]:
def crawl_article(year, month, articleTitle, df):
    """ Crawl the content of a wikipedia article and store an entry in the dataframe.

        The article is looked up by title through the wikipedia package. If the lookup fails
        with any of the package's own exceptions (disambiguation page, missing page, ...),
        the data frame is returned unchanged.

        Args:
            year: the year in which the current article was flagged for deletion
            month: the month in which the current article was flagged for deletion
            articleTitle: the title of the article flagged for deletion
            df: the data frame where the data are stored in the format year | month | title | content | about_women

        Returns:
            A data frame: the input one if the article could not be read, otherwise a frame
            with one more row and a continuous 0..n-1 index.
    """
    try:
        # NOTE(review): wikipedia.page() is called with its defaults, so the page that is
        # returned may differ from articleTitle; the stored title is therefore p.title.
        p = wikipedia.page(articleTitle)
        # The attributes are read inside the try because accessing them may itself query
        # Wikipedia (NOTE(review): confirm for the installed wikipedia version).
        dic = {"Year":year, "Month":month, "Title": p.title, "Content": p.content, "About_women":0}
    except wikipedia.exceptions.WikipediaException:
        # Base class of DisambiguationError and PageError (and the package's other errors).
        return df

    row = pd.DataFrame([dic])
    if df.empty:
        return row
    # ignore_index keeps the row labels unique instead of labelling every row 0.
    return pd.concat([df, row], ignore_index=True)

In [None]:
# Fetch the archive index, collect its year headings and crawl the first year found.
startTime = timeit.default_timer()

seedURL = "https://en.wikipedia.org/wiki/Wikipedia:Archived_deletion_discussions#Deletion_discussions/"
# A timeout keeps a stalled connection from blocking the cell forever.
archivePage = requests.get(seedURL, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
archivePage.raise_for_status()
soup = BeautifulSoup(archivePage.content, "html.parser")

# Get the year: the text of each <h2> before the first "[" is taken as the year,
# and the "Contents" heading is skipped.
# NOTE(review): limit=7 caps how many <h2> headings are inspected -- confirm it is still
# the intended number of years.
years = []
yearContents = []
for yearContent in soup.find_all("h2", limit=7):
    year = yearContent.get_text().split("[")[0]
    if year == "Contents":
        continue
    years.append(year)
    yearContents.append(yearContent)

if not years:
    # Without this guard the crawl below would fail with a bare IndexError.
    raise RuntimeError("No year headings (<h2>) found on " + seedURL)

# Only the first collected year is crawled.
df = pd.DataFrame()
df = crawl_year(years[0], yearContents[0], df)

elapsedTime = timeit.default_timer() - startTime
print("Crawl time ", elapsedTime)

Month June
/wiki/Wikipedia:Articles_for_deletion/Log/2019_June_5


In [None]:
# Append the crawled rows to a space-separated text file, without header row or index column.
# NOTE(review): the path is absolute and machine-specific, and mode='a' means that re-running
# this cell appends the same rows again.
df.to_csv(
    path_or_buf=r'C:\Users\neiral\WS_semester2\CSS\nominatedForDeletion\articles.txt',
    sep=' ',
    header=None,
    index=None,
    mode='a',
)

In [None]:
# Show the type of the "Content" column selected for the rows whose title matches.
print(type(df.loc[df['Title'] == "Amir Yusuf Pohan"]["Content"]))