This notebook implements a crawler that walks Turkish Wikipedia (tr.wikipedia.org) articles and collects their paragraph text.

In [3]:
import requests
from bs4 import BeautifulSoup
import random

import time

import re
import string

In [4]:
def open_url(url, timeout=30):
    """
    Download a page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server (passed straight to ``requests.get``).
        Defaults to 30.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the built-in 'html.parser'.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems or when the timeout expires.
    """
    # Without a timeout, requests waits indefinitely on a stalled connection,
    # which would freeze a long crawl on a single bad request.
    response = requests.get(url=url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    return soup

def scrapeWikiArticle(url):
    """
    Fetch one article, print its title and gather what the crawl needs from it.

    Returns a 3-tuple ``(good_list, paragraph_texts, title)``: up to three
    candidate links picked by ``find_turkish_wikis``, the paragraph strings
    from ``get_paragraphs``, and the text of the ``firstHeading`` element.
    """
    page = open_url(url)

    heading = page.find(id="firstHeading")
    title = heading.text
    print(title)

    # Paragraphs are extracted before the (randomised) link selection.
    paragraph_texts = get_paragraphs(page)
    candidate_links = find_turkish_wikis(page, 3)

    return candidate_links, paragraph_texts, title


#turkish_char = ['ç',"Ç",'ğ',"Ğ",'ı','ö',"Ö",'ş',"Ş",'ü',"Ü"] #,"I"

def find_turkish_wikis(soup,k):
    """
    Returns at most k turkish wiki link objects in a list.

    The anchors inside the page's ``bodyContent`` element are shuffled and
    scanned; a link is kept when

    * it has an ``href`` containing "/wiki/",
    * it has a ``title`` containing neither ":" nor ".", and
    * that title contains at least one Turkish-specific character.

    Parameters
    ----------
    soup : parsed page exposing ``find(id=...)`` (e.g. a BeautifulSoup tree)
    k : int
        Maximum number of links to return.

    Returns
    -------
    list
        Up to ``k`` link objects in random order; empty when the page has no
        ``bodyContent`` element or ``k`` is not positive.
    """
    turkish_chars = ('ç', 'Ç', 'ğ', 'Ğ', 'ı', 'ö', 'Ö', 'ş', 'Ş', 'ü', 'Ü')  # "I" deliberately excluded

    body = soup.find(id="bodyContent")
    if body is None or k <= 0:
        return []

    allLinks = body.find_all("a") # find new wikis
    random.shuffle(allLinks)

    good_list = []

    for link in allLinks:

        # Explicit lookups instead of a bare try/except, so that unrelated
        # errors (and KeyboardInterrupt) are no longer silently swallowed.
        href = link.get('href')
        link_title = link.get('title')
        if not href or not link_title:
            continue

        if "/wiki/" not in href: # only links to other wikis
            continue

        if (":" in link_title) or ('.' in link_title):  # don't include such pages
            continue

        if any(char in link_title for char in turkish_chars):
            good_list.append(link)
            if len(good_list) >= k:
                break

    return good_list

def get_paragraphs(soup):
    """
    Return the text of every non-empty <p> element inside the page's
    ``bodyContent`` element, with leading/trailing newlines stripped.
    """
    body = soup.find(id="bodyContent")
    stripped = (node.text.strip("\n") for node in body.find_all('p'))

    # Paragraphs that are empty after stripping newlines are dropped.
    return [text for text in stripped if text]

In [11]:
max_iter = 50000

#url = "https://tr.wikipedia.org/wiki/%C4%B0mroz_Deniz_Muharebesi_(1918)"
#url = "https://tr.wikipedia.org/wiki/A%C4%9Fa%C3%A7"
url = "https://tr.wikipedia.org/wiki/Puma_(%C5%9Firket)"

# Articles used to restart the walk when a page yields no candidate links.
good_starts = ["https://tr.wikipedia.org/wiki/A%C4%9Fa%C3%A7",
               "https://tr.wikipedia.org/wiki/%C4%B0mroz_Deniz_Muharebesi_(1918)",
              "https://tr.wikipedia.org/wiki/Karlovo,_Bulgaristan",
              "https://tr.wikipedia.org/wiki/Akdamar,_Ak%C3%A7aabat",
              "https://tr.wikipedia.org/wiki/Kahverengi_sa%C3%A7",
              "https://tr.wikipedia.org/wiki/Osmanl%C4%B1_Devleti%27nde_%C4%B0slamc%C4%B1l%C4%B1k",
              "https://tr.wikipedia.org/wiki/T%C3%BCm%C3%B6r",
              "https://tr.wikipedia.org/wiki/Hipokrat"]

good_start_idx = 0


def _first_valid_url(links):
    """Return the URL of the first link whose page has a heading without '.' or ':', else None."""
    for link in links:
        try:
            candidate = "https://tr.wikipedia.org" + link['href'] # try to open the link
            heading = open_url(candidate).find(id="firstHeading")
            if heading and ("." not in heading.text) and (":" not in heading.text):
                return candidate
        except Exception:
            # `except Exception` (not a bare except) so that interrupting the
            # kernel still stops the crawl instead of being reported as a failure.
            print("A page failed!")
    return None


start_time = time.time()

title_list = []        # titles in the order they were first seen
seen_titles = set()    # same titles, for O(1) duplicate checks
text_list = []
counter = 0
for i in range(max_iter):

    counter += 1

    good_list, paragraph_texts, title = scrapeWikiArticle(url) # scrape and make a query

    if title not in seen_titles: # update text list if title not included yet
        seen_titles.add(title)
        title_list.append(title)
        text_list.extend(paragraph_texts)

    if good_list: # If there is a page returned, look for new pages

        next_url = _first_valid_url(good_list)

        if next_url is None:
            # All first candidates failed: search the same page again with a
            # larger k and actually try the result (it used to be discarded).
            good_list = find_turkish_wikis(open_url(url), 9)
            next_url = _first_valid_url(good_list)

        if next_url is not None:
            url = next_url # a good page was found: continue the walk from it
        # otherwise keep `url`; the next iteration re-scrapes it and draws a
        # fresh random set of links

    else:
        print("\nNo new pages found! Resorting to known good pages.\n")
        # Wrap around so running out of start pages cannot raise IndexError.
        url = good_starts[good_start_idx % len(good_starts)]
        good_start_idx += 1


    if (counter % 100) ==0 :
        print("\nCompletion: %{:.2f}".format(100*counter/max_iter))
        # The timer is reset below, so this is the time for the last 100 pages.
        print("Time: {:.2f} minutes\n".format( (time.time()-start_time)/60 ))
        start_time = time.time()


print("\n{} paragraphs are returned!".format(len(text_list)))

Puma (şirket)
Kamerun millî futbol takımı
Cibuti millî futbol takımı
Etiyopya millî futbol takımı
Cezayir millî futbol takımı
Fas millî futbol takımı
Irak millî futbol takımı
AFC Asya Kupası
Uzatma süresi
Endirekt serbest vuruş
Taç atışı
Topun oyunda ve oyun dışı olması
Hakem atışı
Köşe vuruşu
Futbol kuralları
Direkt serbest vuruş
Ceza sahası
Kale alanı
Futbol sahası
Türkiye Futbol Federasyonu
Hasan Akıncıoğlu
Türkiye Futbol Federasyonu
Türkiye'deki futbol stadyumları listesi
Karaelmas Kemal Köksal Stadyumu
14 Eylül Stadyumu
Diyarbakır Atatürk Stadyumu
Diyarbakırspor
Coşkun Demirbakan
Yılmaz Vural
Erdoğan Yılmaz
Uğur Tütüneker
Erkan Sözeri
Hamza Hamzaoğlu
2011-12 Süper Lig
Hüseyin Avni Aker Stadyumu
Şamil Ekinci Müzesi
Sadri Şener Sosyal Tesisleri
Trabzonspor teknik direktörleri listesi
Kamuran Soykıray
Mehmet Ali Çınar
Hüsnü Özkara
Eyüp Arın
Türkiye Futbol Federasyonu
Türkiye millî futbol takımı
2008 Yaz Olimpiyatları'nda futbol
1992 Yaz Olimpiyatları'nda futbol
Uzatma süresi
İskoçya 

KeyboardInterrupt: 

In [12]:
# Number of paragraph strings collected in text_list so far.
len(text_list)

3641

In [14]:
def clean_text(text):
    """
    Strip markup remnants and punctuation from a paragraph and lowercase it.

    Steps, in order:

    1. remove ``<...>`` tags, ``[[...]]`` / ``[...]``, ``{{...}}`` / ``{...}``,
       ``((...))`` / ``(...)`` and ``==...==`` spans;
    2. remove whole lines that start with ``*``, ``=``, ``#``, ``|``
       (optionally after one whitespace character), ``from:``, ``Resim:``,
       ``Dosya:``, ``!``, ``::`` or ``;``;
    3. remove every character in ``string.punctuation``;
    4. lowercase using Turkish casing rules (``I`` -> ``ı``, ``İ`` -> ``i``);
    5. replace non-breaking spaces with plain spaces and delete runs of
       three consecutive newlines.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        The cleaned, lowercased text.
    """
    text = re.sub('<.*?>', '', text)

    text = re.sub(r'\[\[[^]]*\]\]', '', text) # Get rid of [[ ]]
    text = re.sub(r'\[[^]]*\]', '', text)   #   Or [ ]

    text = re.sub(r'\{\{[^}]*\}\}', '', text)
    text = re.sub(r'\{[^}]*\}', '', text)

    text = re.sub(r'\(\([^)]*\)\)', '', text)
    text = re.sub(r'\([^)]*\)', '', text)

    text = re.sub(r'\=\=([^\=]*)\=\=', '', text)#   == ==
    text = re.sub(r'\<!--[^>]*\-->', '', text)# <!-- -->
    text = re.sub(r'(?m)^\*.*\n?', '', text)# *
    text = re.sub(r'(?m)^\=.*\n?', '', text)# =
    text = re.sub(r'(?m)^#.*\n?', '', text) # Lines with #
    text = re.sub(r'(?m)^\|.*\n?', '', text) # Lines with or symbol
    text = re.sub(r'(?m)^\s\|.*\n?', '', text)  # Lines with or symbol after whitespace

    text = re.sub(r'(?m)^from:.*\n?', '', text) #Lines starting with from:
    text = re.sub(r'(?m)^Resim:.*\n?', '', text)
    text = re.sub(r'(?m)^Dosya:.*\n?', '', text)
    text = re.sub(r'(?m)^!.*\n?', '', text)
    text = re.sub(r'(?m)^::.*\n?', '', text)
    text = re.sub(r'(?m)^;.*\n?', '', text)

    text = text.translate(str.maketrans("", "", string.punctuation))

    # str.lower() is locale-independent: it maps "I" to "i" and dotted capital
    # "İ" (U+0130) to "i" followed by a combining dot (U+0307). Turkish needs
    # "I" -> dotless "ı" (U+0131) and "İ" -> "i", so map those two first.
    text = text.replace("\u0130", "i").replace("I", "\u0131")
    text = text.lower()

    text = text.replace(u'\xa0', u' ')
    text = re.sub(r'\n\n\n', '', text)

    return text

In [9]:
# Clean every crawled paragraph, skipping the ones that contain a non-breaking space.
wiki_cleaned = list(map(clean_text, (text for text in text_list if '\xa0' not in text)))
print(len(wiki_cleaned))

11257


In [10]:
file_name = "Wiki_sentences2"

# Write the cleaned paragraphs to <file_name>.txt, one per line, UTF-16 encoded.
with open(file_name+".txt", 'w',encoding="utf16") as f:
    f.writelines(sentence+'\n' for sentence in wiki_cleaned)

In [None]:
def _split_paragraph(p):
    """
    Split one paragraph into sentences that each end with a period.

    Before splitting on ". " the paragraph is tidied so that periods which do
    not end a sentence cannot trigger a split:

    * the period after a single character standing alone (e.g. "I.") is removed;
    * numbers written with periods between digits (e.g. "3.14") are deleted;
    * bracketed references such as "[1]" are deleted.
    """
    p = re.sub(r'(\s+.)\.(?=\s)', r'\1', p) # I. II. III. stuff: drop the period only
    p = re.sub(r'[0-9]+\.+[0-9]+', '', p)   # delete all numbers
    p = re.sub(r'\[.*?\]', '', p)           # Delete references

    sentences = []
    for sentence in p.split(". "):
        if sentence: # check empty string
            if sentence[-1] != ".": # enforce . in the end
                sentence += "."
            # Appended for every non-empty sentence, including those that
            # already ended with a period.
            sentences.append(sentence)
    return sentences


all_sentences = []

# NOTE(review): the original iterated over an undefined name (`aaa`); the
# crawled paragraphs in `text_list` are used here instead. Confirm this is the
# intended input.
#for run in [one_run,second_run, third_run]:
for run in [text_list]:
    for p in run:
        all_sentences.extend(_split_paragraph(p))

print("There are {} sentences!".format(len(all_sentences)))

In [None]:
# Total length of all items in one_run (character count if they are strings).
counter = sum(len(p) for p in one_run)

print(counter)

# NOTE(review): this resets text_list, discarding anything collected earlier.
text_list = []

#url = "https://tr.wikipedia.org/wiki/%C4%B0mroz_Deniz_Muharebesi_(1918)"
#url = "https://tr.wikipedia.org/wiki/A%C4%9Fa%C3%A7"
url = "https://tr.wikipedia.org/wiki/Puma_(%C5%9Firket)"

# NOTE(review): the two-value unpacking matches the scrapeWikiArticle defined
# further down in this cell (returns new_url, paragraph_texts), not the
# three-value version defined near the top of the notebook.
for i in range(500):
    url, paragraphs_list = scrapeWikiArticle(url)
    text_list.extend(paragraphs_list)

print("\n{} paragraphs are returned!".format(len(text_list)))


def scrapeWikiArticle(url):
    """
    Scrape one article and choose the next article to visit.

    Prints the page title (preceded by a warning when it contains ':' or '.'),
    collects the paragraph texts and asks ``find_turkish_wiki`` for a link out
    of the shuffled anchors in ``bodyContent``.

    Returns ``(new_url, paragraph_texts)``.
    """
    page = BeautifulSoup(requests.get(url=url).content, 'html.parser')

    title = page.find(id="firstHeading").text
    if any(mark in title for mark in (":", ".")):
        print("\nInvalid Title!!!!\n{}".format(url))
    print(title)

    paragraph_texts = get_paragraphs(page) # get the paragraph from the wiki

    allLinks = page.find(id="bodyContent").find_all("a") # find new wikis
    random.shuffle(allLinks)

    linkToScrape, found = find_turkish_wiki(allLinks) # find a turkish one
    if not found:
        # The fallback link returned by find_turkish_wiki is still followed.
        print("Search Failed!")

    return "https://tr.wikipedia.org" + linkToScrape['href'], paragraph_texts


turkish_char = ['ç',"Ç",'ğ',"Ğ",'ı','ö',"Ö",'ş',"Ş",'ü',"Ü"] #,"I"

def find_turkish_wiki(allLinks):
    """
    Pick the first link whose title looks Turkish.

    A link qualifies when it has an ``href`` containing "/wiki/" and a
    ``title`` that contains neither ":" nor "." but does contain at least one
    character from ``turkish_char``.

    Parameters
    ----------
    allLinks : list
        Link objects supporting ``.get('href')`` / ``.get('title')``
        (e.g. BeautifulSoup tags), scanned in the given order.

    Returns
    -------
    (link, flag)
        ``flag`` is True when a qualifying link was found. Otherwise the last
        element of ``allLinks`` is returned as a fallback with ``flag`` False,
        or ``(None, False)`` when ``allLinks`` is empty.
    """
    if not allLinks:
        return None, False

    for link in allLinks:

        # Explicit lookups replace the former bare try/except, which also
        # swallowed unrelated errors and KeyboardInterrupt.
        href = link.get('href')
        link_title = link.get('title')
        if not href or not link_title:
            continue

        if "/wiki/" not in href: # find links to other wikis
            continue

        if (":" in link_title) or ('.' in link_title):
            continue

        if any(char in link_title for char in turkish_char):
            return link, True

    return allLinks[-1], False # nothing matched: fall back to the last link

def get_paragraphs(soup):
    """
    Gather the text of each <p> element under ``bodyContent``, trimmed of
    surrounding newlines; paragraphs that end up empty are skipped.
    """
    container = soup.find(id="bodyContent")

    texts = []
    for paragraph in container.find_all('p'):
        body = paragraph.text.strip("\n")
        if not body:
            continue
        texts.append(body)

    return texts

In [None]:
# Scratch cell: fetch a single article and prepare its links for manual inspection.
url = "https://tr.wikipedia.org/wiki/%C4%B0mroz_Deniz_Muharebesi_(1918)"

response = requests.get(url=url)

soup = BeautifulSoup(response.content, 'html.parser')

# Note: here `title` holds the heading element itself, not its text.
title = soup.find(id="firstHeading")
print(title.text)

# All anchors inside the article body, put into random order in place.
allLinks = soup.find(id="bodyContent").find_all("a")
random.shuffle(allLinks)
linkToScrape = 0  # placeholder value; no link has been chosen yet

In [None]:
def scrapeWikiArticle(url,text_list=None,counter=0,max_count=10):
    """
    Follow a chain of articles starting at ``url`` and collect their paragraphs.

    Parameters
    ----------
    url : str
        First article to scrape.
    text_list : list, optional
        List that receives one list of paragraph strings per visited page.
        A new list is created when omitted (a shared mutable default would
        leak results between calls).
    counter : int, optional
        Number of pages already visited.
    max_count : int, optional
        Stop once ``counter`` reaches this value.

    Returns
    -------
    list
        ``text_list`` with the paragraphs of every visited page appended.
    """
    if text_list is None:
        text_list = []

    # Iterative form of the former recursion. The recursive call passed
    # `counter` in the `text_list` position, so the crawl stopped after a
    # single page. `<` also terminates when counter starts above max_count.
    while counter < max_count:

        response = requests.get(url=url)

        soup = BeautifulSoup(response.content, 'html.parser')

        title = soup.find(id="firstHeading")
        title_text = title.text.strip("\n")
        print(title_text)

        allLinks = soup.find(id="bodyContent").find_all("a")
        random.shuffle(allLinks)

        # NOTE(review): find_turkish_link is not defined in the visible
        # notebook (find_turkish_wiki returns a (link, flag) tuple instead);
        # confirm which helper is intended.
        linkToScrape = find_turkish_link(allLinks)
        paragraph_texts = get_paragraphs(soup)

        text_list.append(paragraph_texts)

        counter += 1
        url = "https://tr.wikipedia.org" + linkToScrape['href']

    return text_list
