In [27]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

%matplotlib inline

In [16]:
from bs4 import BeautifulSoup

In [3]:
wiki_data = pd.read_pickle('all_pages_df.p')

In [5]:
wiki_data.sample(10)

Unnamed: 0,_id,extract,ns,pageid,title
2637,5a2192f7e15d6e00515f3a51,<p><b>UbiCare</b> provides a platform for rese...,0,31223008,UbiCare
4947,5a2195b9e15d6e00515f4357,<p><i><b>Port Royale 3: Pirates &amp; Merchant...,0,35535478,Port Royale 3: Pirates & Merchants
3967,5a219490e15d6e00515f3f83,<p>The <b>Pattern Oriented Rule Implementation...,0,13339949,Pattern Oriented Rule Implementation
1653,5a2191c7e15d6e00515f3679,<p><b>Canopy Labs</b> is a customer analytics ...,0,40158733,Canopy Labs
1857,5a219206e15d6e00515f3745,<p><b>Sunopsis</b> is a software company based...,0,3765816,Sunopsis
4483,5a21952de15d6e00515f4187,<p><b>Newscycle</b> is an American software de...,0,39004244,Newscycle Solutions
6047,5a219705e15d6e00515f47a3,<p><b>Bizagi</b> is a privately owned software...,0,31549598,Bizagi
32,5a218fcbe15d6e00515f3024,<p>The <b>Center for Biological &amp; Computat...,0,17114678,CBCL (MIT)
3221,5a2193ace15d6e00515f3c99,<p>The <b>Financial Information eXchange</b> (...,0,734281,Financial Information eXchange
1533,5a2191a2e15d6e00515f3601,<p><b>Structured k-Nearest Neighbours</b> i...,0,53985910,Structured kNN


In [17]:
def cleaner(text):

    text = BeautifulSoup(text, "html5lib").get_text()
    text = re.sub('&#39;',' ',text).lower()
    text = re.sub('<br />',' ',text)
    text = re.sub('<.*>.*</.*>',' ', text)
    text = re.sub('[\d]',' ',text)
    text = re.sub('[0-9]',' ',text)
    text = re.sub('[^a-z ]',' ',text)
    text = re.sub('\\ufeff', ' ', text)
    text = re.sub('[^a-z ]',' ',text)
    text = ' '.join(text.split())
    
    return text

In [18]:
wiki_data['clean_text'] = wiki_data['extract'].apply(cleaner)

In [20]:
wiki_data[['clean_text', 'extract', 'title']].head()

Unnamed: 0,clean_text,extract,title
0,data exploration is an approach similar to ini...,<p><b>Data exploration</b> is an approach simi...,Data exploration
1,these datasets are used for machine learning r...,<p>These datasets are used for machine-learnin...,List of datasets for machine learning research
2,machine learning is a field of computer scienc...,<p><b>Machine learning</b> is a field of compu...,Machine learning
3,the following outline is provided as an overvi...,<p>The following outline is provided as an ove...,Outline of machine learning
4,the accuracy paradox for predictive analytics ...,<p>The <b>accuracy paradox</b> for predictive ...,Accuracy paradox


In [22]:
!pip install wordcloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

Collecting wordcloud
  Downloading wordcloud-1.3.1.tar.gz (169kB)
[K    100% |████████████████████████████████| 174kB 2.8MB/s ta 0:00:01
Building wheels for collected packages: wordcloud
  Running setup.py bdist_wheel for wordcloud ... [?25ldone
[?25h  Stored in directory: /home/jovyan/.cache/pip/wheels/d9/4c/ac/e63c45f2ce09860e9459a410953039c30296e89d9f7234675f
Successfully built wordcloud
Installing collected packages: wordcloud
Successfully installed wordcloud-1.3.1


In [41]:
naive_vectorizer = CountVectorizer(min_df = 5, stop_words = 'english')

In [44]:
document_term_matrix_sps = naive_vectorizer.fit_transform(wiki_data['clean_text'])

In [45]:
document_term_matrix_sps.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ..., 
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 2, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [47]:
document_term_matrix_df = pd.DataFrame(document_term_matrix_sps.toarray(),
                                       index=wiki_data.index,
                                       columns=naive_vectorizer.get_feature_names())

In [48]:
document_term_matrix_df

Unnamed: 0,aa,aaa,aaai,aaas,aachen,aai,aall,aalst,aaron,aarp,...,zoubin,zquez,zseries,zu,zuben,zulip,zur,zurich,zusammenarbeit,zx
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
SVD = TruncatedSVD(n_components=400)
latent_semantic_analysis = SVD.fit_transform(document_term_matrix_sps)

In [50]:
search_term = "machine learning"

In [52]:
search_term_vec = naive_vectorizer.transform([search_term])

In [53]:
search_term_lsa = SVD.transform(search_term_vec)

In [54]:
cos_sim = cosine_similarity(search_term_lsa, latent_semantic_analysis)


In [55]:
top_five_scores = cos_sim[0].argsort()[:-6:-1]

In [56]:
top_five_scores

array([   2,  112,    3,  183, 1233])

In [58]:
top_five_df = wiki_data.iloc[top_five_scores]
top_five_df[['pageid', 'title','clean_text']]


Unnamed: 0,pageid,title,clean_text
2,233488,Machine learning,machine learning is a field of computer scienc...
112,5721403,Machine Learning (journal),machine learning is a peer reviewed scientific...
3,53587467,Outline of machine learning,the following outline is provided as an overvi...
183,50828755,Timeline of machine learning,this page is a timeline of machine learning ma...
1233,27141248,Error-driven learning,error driven learning is a sub area of machine...
