# Wikipedia Article Recommendation system

## Data:
Using the whole dataset of wikipedia is not possible, due to hardware limitations. We are using the wikipedia API to download random wikipedia article titles and their content

### Importing the useful libraries

In [1]:
import os
import pickle
import wikipedia
import numpy as np
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer as tfidf_vectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings

In [2]:
import random

In [3]:
warnings.filterwarnings('ignore') # warning suppression

### Loading the names of random wikipedia articles

In [4]:
def wiki_titles(num_articles=None,mode='r',data_folder="./data",sample_links = True):
    '''
    read/write num_articles randomly called from wikipedia API, with their 10 sampled page links 
    '''
    if mode == 'w':
        if not os.path.exists(data_folder):
            os.makedirs(data_folder)
        titles = wikipedia.random(num_articles)
        with open(data_folder+'/random_articles_titles.txt', 'a+',encoding='utf-8') as file:
            for i in titles:
                file.write(i+'\n')
            if sample_links:
                for i in titles:
                    try:
                        for j in random.sample(wikipedia.page(i).links,10):
                            file.write(j+'\n')
                    except:
                        pass
            file.close()
    elif mode == 'r':
        with open (data_folder+'/random_articles_titles.txt','r+',encoding='utf-8') as file:
            if num_articles:
                from itertools import islice
                data = list(islice(file, num_articles))
            else:
                data = file.readlines()
            file.close()
        return data
    else:
        print("Mode can be either 'r' or 'w'")

In [5]:
wiki_titles(10,'w')

In [6]:
def run_circuits(n,x):
    '''
    run wiki_titles n times, in w mode, downloading x number of articles each time
    '''
    for i in range(n):
        wiki_titles(num_articles=x, mode='w')
        print('done: ',i)

In [None]:
run_circuits(10,1000)

done:  0


In [7]:
len(wiki_titles(None,'r'))

34386

### Get the summaries, categories of  each page

In [8]:
def summary(num_articles=None,mode = 'r',data_folder="./data"):
    if mode =='w':
        titles_content_map = dict()
        all_titles = wiki_titles(num_articles,mode = 'r')
        with open (data_folder+'/wikisummary.txt','a+',encoding='utf-8') as file:
            counter = 0
            for title_num in range(len(all_titles)):  
                title = all_titles[title_num].replace("\n","")
                try:
                    summary_text = wikipedia.summary(title)
                    summary_text_processed = summary_text.replace("\n","").replace("=","").replace("/","").replace("  "," ").replace(',','').replace(":","")
                    cat = ''
                    try:
                        page = wikipedia.page(title)
                        for i in page.categories:
                            cat = cat+' '+str(i)
                        cat = cat.replace("\n","").replace("=","").replace("/","").replace("  "," ").replace(',','').replace(":","")
                    except:
                        print('ABC')
                        #continue
                    file.write(title+','+summary_text_processed+','+cat+ '\n')
                except:
                    continue
                counter+=1
                if counter%50==0: 
                    print("{} articles have been loaded!".format(counter))  
                    print("most recent sample title is:",title)
                    print("="*15)
        file.close()
    elif mode == 'r':
        with open (data_folder+'/wikisummary.txt','r',encoding='utf-8') as file:
            if num_articles:
                from itertools import islice
                data = list(islice(file, num_articles))
            else:
                data = file.readlines()
            file.close()
            return data
    else:
        print('Enter a valid mode')
            

In [10]:
summary(None,'w')

50 articles have been loaded!
most recent sample title is: Montevideo
100 articles have been loaded!
most recent sample title is: Spread of Islam in Indonesia
150 articles have been loaded!
most recent sample title is: Self-portrait
200 articles have been loaded!
most recent sample title is: Percival Griffiths
250 articles have been loaded!
most recent sample title is: Cerconota ptilosema
300 articles have been loaded!
most recent sample title is: Livid (film)
350 articles have been loaded!
most recent sample title is: Stalling (surname)
400 articles have been loaded!
most recent sample title is: Poudenx
450 articles have been loaded!
most recent sample title is: Donvidas
500 articles have been loaded!
most recent sample title is: Fender Prosonic
550 articles have been loaded!
most recent sample title is: Lost in Worship
600 articles have been loaded!
most recent sample title is: Arthur Christian
650 articles have been loaded!
most recent sample title is: Simó de Guardiola y Hortoneda


ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
2850 articles have been loaded!
most recent sample title is: Taxonomy (biology)
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
2900 articles have been loaded!
most recent sample title is: Interim Register of Marine and Nonmarine Genera
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
2950 articles have been loaded!
most recent sample title is: Geographic coordinate system
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
3000 articles hav

ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
4250 articles have been loaded!
most recent sample title is: ISBN (identifier)
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
4300 articles have been loaded!
most recent sample title is: National Center for Biotechnology Information
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
4350 articles have been loaded!
most recent sample title is: Taxonomy (biology)
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
ABC
AB

In [9]:
def summary(num_articles=None,mode = 'r',data_folder="./data"):
    if mode =='w':
        titles_content_map = dict()
        all_titles = wiki_titles(num_articles,mode = 'r')
        with open (data_folder+'/wikisummary.txt','a+',encoding='utf-8') as file:
            counter = 0
            for title_num in range(len(all_titles)):  
                title = all_titles[title_num].replace("\n","")
                try:
                    summary_text = wikipedia.summary(title)
                    summary_text_processed = summary_text.replace("\n","").replace("=","").replace("/","").replace("  "," ").replace(',','').replace(":","").replace('.','')
                    cat = ''
                    try:
                        page = wikipedia.page(title)
                        for i in page.categories:
                            cat = cat+' '+str(i)
                        cat = cat.replace("\n","").replace("=","").replace("/","").replace("  "," ").replace(',','').replace(":","").replace('.','')
                    ## improve this by using " ".join(cat)
                    except:
                        print('ABC')
                        #continue
                    file.write(title+','+summary_text_processed+','+cat+ '\n')
                    
                except:
                    continue
                counter+=1
                if counter%50==0: 
                    print("{} articles have been loaded!".format(counter))  
                    print("most recent sample title is:",title)
                    print("="*15)
        file.close()
    elif mode == 'r':
        import pandas as pd
        data = pd.read_csv(data_folder+'/wikisummary.txt',delimiter=',',header = None,names=['Title','Summary','Categories'],nrows = num_articles,encoding = 'utf-8')
        data.drop_duplicates(inplace = True)
        return data

    else:
        print('Enter a valid mode')

### Read the data, and extracting features

In [11]:
data = summary(None, mode = 'r')

In [17]:
data.head(10)

Unnamed: 0,Title,Summary,Categories
0,2017 in Iceland,Events in the year 2017 in Iceland.,2010s in Iceland 2017 by country 2017 in Euro...
1,Zapaleri,Zapaleri Lake is a lake in the Sur Lípez Provi...,All stub articles Articles using infobox body...
2,Garçon Wines,Garçon Wines is a British manufacturer of wine...,Articles with short description Bottling comp...
3,Mount David (Maine),Mount David (also known as Mount Davis Davis M...,All articles needing coordinates Bates Colleg...
4,Venerupis decussata,Venerupis decussata is a marine bivalve mollus...,Articles with 'species' microformats Articles...
5,Cyclic-guanylate-specific phosphodiesterase,Cyclic-guanylate-specific phosphodiesterase (E...,EC 3.1.4
6,FC Neded,FC Neded are a semi-professional football club...,Articles with Hungarian-language sources (hu)...
7,The Magic Cloak of Oz,The Magic Cloak of Oz is a 1914 film directed ...,1910s fantasy films 1914 films American black...
8,Angoori rasmalai,Angoori rasmalai is an Indian dessert and a ty...,All articles lacking sources All stub article...
9,Accuracy International,Accuracy International is a British firearms m...,All articles needing additional references Al...


In [21]:
data = data.dropna()

In [22]:
data.shape

(2424, 3)

In [23]:
all_titles = data['Title'].tolist()
all_titles_content = (data['Summary']+' '+data['Categories']).tolist()

In [16]:
all_titles_content

['Events in the year 2017 in Iceland.  2010s in Iceland 2017 by country 2017 in Europe 2017 in Iceland Articles with short description CS1 Icelandic-language sources (is) Short description is different from Wikidata Years of the 21st century in Iceland',
 'Zapaleri Lake is a lake in the Sur Lípez Province Potosí Department Bolivia. At an elevation of 4608 m its surface area is 2 km².  All stub articles Articles using infobox body of water without image Articles using infobox body of water without image bathymetry Coordinates on Wikidata Lakes of Potosí Department Potosí Department geography stubs',
 'Garçon Wines is a British manufacturer of wine bottles and related packaging. Founded in 2016 the company produces a wine bottle that with its packaging can fit through a letter box opening.  Articles with short description Bottling companies Commons category link from Wikidata Official website not in Wikidata Packaging companies of the United Kingdom Short description matches Wikidata Win

In [25]:
num_tfidf_features        = 1000        # number of tfidf features
input_article_link        = "https://en.wikipedia.org/wiki/Donald_Trump"  # input article for recommendation
num_recommended_articles  = 10
n_gram_range              = (1,5)

In [26]:
vectorizer = tfidf_vectorizer(input=all_titles_content, lowercase=True, 
                              stop_words="english", ngram_range=n_gram_range ,max_features=num_tfidf_features)

In [27]:
all_titles_contents_matrix = vectorizer.fit_transform(all_titles_content)

In [28]:
print("number of random article titles after processing:", len(all_titles))
print("number of tfidf vectorized elements:", len(vectorizer.get_feature_names()))

number of random article titles after processing: 2424
number of tfidf vectorized elements: 1000


In [29]:
sim_unigram=cosine_similarity(all_titles_contents_matrix)

In [None]:
sim_unigram

In [30]:
model_folder = './wiki_model'

In [None]:
if not os.path.exists(model_folder):
            os.makedirs(model_folder)

In [None]:
with open(model_folder+'/wiki_cos.pkl', 'ab') as file:
            pickle.dump(sim_unigram, file)

In [None]:
with open(model_folder+'/vectorizer.pkl', 'ab') as file:
            pickle.dump(vectorizer, file)

In [None]:
with open(model_folder+'/vectorizer.pkl', 'rb') as file:
            vec = pickle.load(file)

In [None]:
mat

### Taking the input

In [73]:
input_article_link        = "https://en.wikipedia.org/wiki/Donald_Trump"

In [60]:
#input_article_link = 'https://en.wikipedia.org/wiki/Peacock'

In [74]:
# if input is link:
wiki_article_link_format = input_article_link.split("/").copy()
wiki_article_link_format

['https:', '', 'en.wikipedia.org', 'wiki', 'Donald_Trump']

In [75]:
input_article_title = input_article_link.split('/')[-1]
input_article_title

'Donald_Trump'

In [76]:
print("="*100, "\nThe title of the input article is: {}".format(input_article_title))
print("-"*100, "\nThe link for the input article is:{}".format(input_article_link))


The title of the input article is: Donald_Trump
---------------------------------------------------------------------------------------------------- 
The link for the input article is:https://en.wikipedia.org/wiki/Donald_Trump


In [77]:
summary_text = wikipedia.summary(input_article_title)
summary_text_processed = summary_text.replace("\n","").replace("=","").replace("/","").replace("  "," ").replace(',','').replace(":","").replace('.',' ')

In [78]:
categories = " ".join(wikipedia.page(input_article_title).categories)

In [79]:
categories = categories.replace("\n","").replace("=","").replace("/","").replace("  "," ").replace(',','').replace(":","").replace('.',' ')


In [80]:
text = summary_text_processed+categories

In [81]:
text_vec = vectorizer.transform([text])
text_decoded = [vectorizer.get_feature_names()[i] 
                                  for i in range(len(vectorizer.get_feature_names())) 
                                  if text_vec.todense()[0,i]!=0.0]

In [82]:
cos_sim = cosine_similarity(text_vec,all_titles_contents_matrix)
rec_titles_elems = cos_sim.argsort()[0][-num_recommended_articles:][::-1]
cos_sim.sort()
rec_titles_cos_sim = cos_sim[0][-num_recommended_articles:][::-1]
rec_titles = [all_titles[elem] for elem in rec_titles_elems]
print("="*100)
print("Titles of recommended articles: \n\n{}".format(rec_titles))

Titles of recommended articles: 

['Barack Obama', 'Paul Clancy', 'United States', 'Utah', 'United States of America', 'OCLC (identifier)', 'John Lawrence LeConte', 'United States Geological Survey', 'Smithsonian Institution', 'Abbot Low Moffat']


In [83]:
rec_titles

['Barack Obama',
 'Paul Clancy',
 'United States',
 'Utah',
 'United States of America',
 'OCLC (identifier)',
 'John Lawrence LeConte',
 'United States Geological Survey',
 'Smithsonian Institution',
 'Abbot Low Moffat']