In [1]:
import re
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
def format_cat_name(cat_name): 
    cat_name = re.sub('\s','_', cat_name)
    return cat_name

In [4]:
def format_cat_name(cat_name): 
    cat_name = re.sub('\s','_', cat_name)
    return cat_name
def get_content_df(cat_name):
    params = {'action':'query',
          'titles':format_cat_name(cat_name),
          'prop':'extracts',
          'rvprop': 'content',
          'format':'json'}
    
    response = requests.get('http://en.wikipedia.org/w/api.php?', params=params)
    data = response.json()
    return_data = data['query']['pages']
    
    page_id = list(return_data.keys())[0]
    content = return_data[page_id]['extract']
    soup = BeautifulSoup(content,"html5lib")
    
    temp_str=str()
    for string in soup.stripped_strings:
        temp_str += string 
    
    
    #clean = str(temp_list)
    
    title = format_cat_name(cat_name)
    
    content_df = pd.DataFrame([page_id, title, temp_str],index=(['page_id', 'title', 'content'])).T
    
    
    return content_df

In [6]:
bs_content_df = pd.read_pickle("bs_content_df")

In [7]:
bs_content_df.sample(5)

Unnamed: 0,page_id,title,content
0,44674463,RamBase,RamBaseis a Norwegian fully integrated cloud E...
0,34291637,XCOM:_Enemy_Unknown,XCOM: Enemy Unknownis a turn-based tactical vi...
0,26762608,Scientific_workflow_system,Ascientific workflow systemis a specialized fo...
0,327999,SASI_(software),SASI (Schools Administrative Student Informati...
0,20217224,Tropico_3,Tropico 3is a video game developed by Haemimon...


In [8]:
#Prepare TFIDF Term Frequency * inverse Document Frequency

bs_tfidf_vectorizer = TfidfVectorizer(min_df = 1, stop_words = 'english')

bs_tfidf_term_matrix_sps = bs_tfidf_vectorizer.fit_transform(bs_content_df.content)

bs_tfidf_term_matrix_df = pd.DataFrame(bs_tfidf_term_matrix_sps.toarray(),
                                       index=bs_content_df.content,
                                       columns=bs_tfidf_vectorizer.get_feature_names())

In [8]:
alpha_go = get_content_df('AlphaGo versus Lee Sedol')

alpha_go_str = str(alpha_go['content'].values).split('.')[:11]

alpha_go_df = pd.DataFrame(data = [x for x in alpha_go_str])

alpha_go_df[0][0]='AlphaGo versus Lee Sedol, orGoogle DeepMind Challenge Match, was a five-game Go match between 18-time world champion Lee Sedol and AlphaGo, a computer Go program developed by Google DeepMind, played in Seoul, South Korea between 9 and 15 March 2016'

alpha_go_df.columns=['content']

alpha_go_df.content

0     AlphaGo versus Lee Sedol, orGoogle DeepMind Ch...
1      AlphaGo won all but the fourth game; all game...
2      The match has been compared with the historic...
3     The winner of the match was slated to win $1 m...
4      Since AlphaGo won, Google DeepMind stated tha...
5      Lee received $170,000 ($150,000 for participa...
6     After the match, The Korea Baduk Association a...
7      It was given in recognition of AlphaGo\'s "si...
8      This match was chosen byScienceas one of the ...
9     BackgroundDifficult challenge in artificial in...
10     It has long been considered a difficult chall...
Name: content, dtype: object

In [9]:
alpha_go_df.to_pickle('alpha_go.pkl')

In [2]:
alpha_go_df = pd.read_pickle('alpha_go.pkl')

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(min_df = 1, stop_words = 'english')

ag_tfidf_term_matrix_sps = tfidf_vectorizer.fit_transform(alpha_go_df.content)

ag_tfidf_term_matrix_df = pd.DataFrame(ag_tfidf_term_matrix_sps.toarray(),
                                       index=alpha_go_df.content,
                                       columns=tfidf_vectorizer.get_feature_names())

In [4]:
ag_random_search_df = ag_tfidf_term_matrix_df.sample()

In [9]:
bs_with_search_term = bs_tfidf_term_matrix_df.append(ag_random_search_df)

In [None]:
n_components = 50
SVD = TruncatedSVD(n_components)
bs_component_names = ["component_"+str(i+1) for i in range(n_components)]

bs_svd_matrix = SVD.fit_transform(bs_with_search_term)

bs_svd_df = pd.DataFrame(bs_svd_matrix, 
                      index=bs_with_search_term.index, 
                      columns=component_names)

In [None]:
bs_search_term_svd_vector = bs_svd_df.loc[ag_random_search_df.index]
bs_search_term_svd_vector[:5]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

bs_svd_df['cosine_sim'] = cosine_similarity(bs_svd_df, bs_search_term_svd_vector)

ml_svd_df[['cosine_sim']].sort_values('cosine_sim', ascending=False).head(6)