In [1]:
import re
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
#Function to convert category name to wiki API friendly
def format_cat_name(cat_name): 
    cat_name = re.sub('\s','_', cat_name)
    return cat_name

In [3]:
#fucntion to build up the query to search for category in wiki api
def go_query(cat_name):
    cate_name = format_cat_name(cat_name)
    params = {"action": "query",
            "format": "json",
            "list": "categorymembers",
            "cmtitle": cate_name,
            "cmlimit": "max"}
    query = requests.get("http://en.wikipedia.org/w/api.php?",params=params)
    
    return query.json()

In [4]:
#creating dataframe from json format obtained from wiki api
def json_df(cat_name):
    temp_dict = go_query(cat_name)
    df = pd.DataFrame(temp_dict['query']['categorymembers'])
    return df

In [5]:
#function to obtain pages in passed category name with three recursive to get to sub categories
def cat_pages(cat_name, max_depth=3):
    
    params = {'action':'query',
          'format':'json',
          'list':'categorymembers',
          'cmtitle': format_cat_name(cat_name),
          'cmlimit':'max'}
    
    
    response = requests.get('http://en.wikipedia.org/w/api.php?', params=params)
    data = response.json()
    
    members = data['query']['categorymembers']

    pages = list(filter(lambda x: x['ns'] == 0, members))
    subpages = list(filter(lambda x: x['ns'] == 14, members))
      
    while max_depth > 0:
 
        if not subpages:    
            return pages
    
        else:
            max_depth -= 1
            for subpage in subpages:
                pages += cat_pages(subpage['title'], max_depth)
    
    return pages

In [6]:
#Creating a list of page names
def page_list(cat_name):
    pages = cat_pages(format_cat_name(cat_name))
    pages_df = pd.DataFrame(pages)
    page_list = list(pages_df['title'])
    return page_list

In [7]:
#obtaining contents of the page by passing a category name, return the content not dataframe
def get_content(cat_name):
    params = {'action':'query',
          'titles':format_cat_name(cat_name),
          'prop':'extracts',
          'rvprop': 'content',
          'format':'json'}
    
    response = requests.get('http://en.wikipedia.org/w/api.php?', params=params)
    data = response.json()
    return_data = data['query']['pages']
    
    page_id = list(return_data.keys())[0]
    content = return_data[page_id]['extract']
    title = format_cat_name(cat_name)
    
    content_df = pd.DataFrame([page_id, title, content],index=(['page_id', 'title', 'content'])).T
    
    return content

In [8]:
#Function to create Dataframe with page_id, title, content by sentences
def get_content_df_sen(cat_name):
    params = {'action':'query',
          'titles':format_cat_name(cat_name),
          'prop':'extracts',
          'rvprop': 'content',
          'format':'json'}
    
    response = requests.get('http://en.wikipedia.org/w/api.php?', params=params)
    data = response.json()
    return_data = data['query']['pages']
    
    page_id = list(return_data.keys())[0]
    content = return_data[page_id]['extract']
    soup = BeautifulSoup(content,"html5lib")
    
    temp_list=[]
    for string in soup.stripped_strings:
        temp_list.append(string)
    
    #s=''
    #clean = s.join(temp_list)
    
    
    
    title = format_cat_name(cat_name)
    
    content_df = pd.DataFrame([page_id, title, clean],index=(['page_id', 'title', 'content'])).T
    
    return content_df

In [9]:
#clean up the syntex of content by BeautifulSoup
def clean_content(cat_name):
    page = get_content(cat_name) 
    soup = BeautifulSoup(page, "html5lib")
    temp_list=[]
    for string in soup.stripped_strings:
        temp_list.append(string)
    
    
    s=''
    clean = s.join(temp_list)

    return clean 

In [10]:
def get_content_df(cat_name):
    params = {'action':'query',
          'titles':format_cat_name(cat_name),
          'prop':'extracts',
          'rvprop': 'content',
          'format':'json'}
    
    response = requests.get('http://en.wikipedia.org/w/api.php?', params=params)
    data = response.json()
    return_data = data['query']['pages']
    
    page_id = list(return_data.keys())[0]
    content = return_data[page_id]['extract']
    soup = BeautifulSoup(content,"html5lib")
    
    temp_str=str()
    for string in soup.stripped_strings:
        temp_str += string 
    
    
    #clean = str(temp_list)
    
    title = format_cat_name(cat_name)
    
    content_df = pd.DataFrame([page_id, title, temp_str],index=(['page_id', 'title', 'content'])).T
    
    
    return content_df

In [11]:
#Mongo client IP
client = MongoClient('54.190.53.213', 27016)

In [12]:
#Wiki Mongo reference for Machine Learning content by sentences
db_ref = client.my_database
db_wiki_ref = db_ref.my_wikipedia

In [13]:
#Wiki Mongo reference for Business Software content by sentences
db_wiki_bs_ref = db_ref.my_wiki_bs

In [14]:
#Wiki Mongo reference for Machie Learning content whole text
db_wiki_whole_ref = db_ref.my_wikipedia_all

In [56]:
#Wiki Mongo reference for Business Software content whole text
db_wiki_bs_whole_ref = db_ref.my_wiki_bs_all

In [15]:
#Check names of the Mongo Database Structure
client.database_names(), db_ref.collection_names()

(['admin', 'local', 'my_database', 'test'],
 ['my_collection',
  'my_wikipedia_all',
  'my_wikipedia',
  'my_wiki_bs',
  'my_wiki_bs_all'])

In [25]:
#Wiki Collection Process for Machine Learning Categories

#Create list with set so duplicates are gone
ml_page_list = set(page_list("Category:Machine learning"))

#Store cleaned contents in the list 
content_list=[]
for title in ml_page_list:
    content_list.append(clean_content(title))

#Replcae '.' with space since Mongo deosn't like keys with periods 
ml_page_list_2 = [x.replace('.',' ') for x in ml_page_list]

#Create a list of dictionaries of Title:content
new_list = []
for i in range(len(ml_page_list_2)):
    new_dict = {ml_page_list_2[i]:content_list[i]}
    new_list.append(new_dict)

#Store list of dictionaries to Mongo
for i in new_list:
    db_wiki_ref.insert_one(i)

In [17]:
#retireving list of dictionaries 
ml_dict = list(db_wiki_ref.find())

In [18]:
#Show first five items of the dictionary
ml_dict[:5]

[{'Melomics': ['Melomics',
   '(derived from "genomics of melodies") is a computational system for the automatic composition of music (with no human intervention), based on bioinspired algorithms.',
   'Technological aspects',
   'Melomics applies an evolutionary approach to music composition, i.e., music pieces are obtained by simulated evolution. These themes compete to better adapt to a proper fitness function, generally grounded on formal and aesthetic criteria. The Melomics system encodes each theme in a genome, and the entire population of music pieces undergoes evo-devo dynamics (i.e., pieces read-out mimicking a complex embryological development process). The system is fully autonomous: once programmed, it composes music without human intervention.',
   "This technology has been transferred to industry as an academic spin-off, Melomics Media, which has provided and reprogrammed a new computer cluster that created a huge collection of popular music. The results of this evolution

In [19]:
#counts for Machine learing pages storage for contents by sentence, content text as a whole 
db_wiki_ref.count(), db_wiki_whole_ref.count()

(1106, 1106)

In [None]:
#Create Data Frame with content and its page title for all Machine Learning Categories & Sub categories
for x in list(ml_page_list):
    temp_df = get_content_df(x)
    ml_content_df = ml_content_df.append(temp_df)

In [102]:
#Pickle storage for data frame 
ml_content_df.to_pickle("ml_content_df.pkl")

In [15]:
ml_content_df = pd.read_pickle("ml_content_df.pkl")

In [16]:
#numberic label for title
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
ml_content_df['title_num'] = le.fit_transform(ml_content_df['title'])

In [17]:
ml_content_df.sample(3)

Unnamed: 0,page_id,title,content,title_num
0,9292749,Forward–backward_algorithm,Theforward–backward algorithmis an inference a...,365
0,47012074,Neural_Designer,Neural Designeris a software tool for data ana...,726
0,22562715,Clustering_high-dimensional_data,Clustering high-dimensional datais the cluster...,151


In [20]:
#Prepare TFIDF Term Frequency * inverse Document Frequency

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(min_df = 1, stop_words = 'english')

ml_tfidf_term_matrix_sps = tfidf_vectorizer.fit_transform(ml_content_df.content)

ml_tfidf_term_matrix_df = pd.DataFrame(ml_tfidf_term_matrix_sps.toarray(),
                                       index=ml_content_df.content,
                                       columns=tfidf_vectorizer.get_feature_names())

In [34]:
#Search Material: First 11 Sentences of the First Paragraph of "AlphaGo versus Lee Sedol" Wiki Page

alpha_go = get_content_df('AlphaGo versus Lee Sedol')

alpha_go_str = str(alpha_go['content'].values).split('.')[:11]

alpha_go_df = pd.DataFrame(data = [x for x in alpha_go_str])

alpha_go_df[0][0]='AlphaGo versus Lee Sedol, orGoogle DeepMind Challenge Match, was a five-game Go match between 18-time world champion Lee Sedol and AlphaGo, a computer Go program developed by Google DeepMind, played in Seoul, South Korea between 9 and 15 March 2016'

alpha_go_df.columns=['content']

alpha_go_df.content

0     AlphaGo versus Lee Sedol, orGoogle DeepMind Ch...
1      AlphaGo won all but the fourth game; all game...
2      The match has been compared with the historic...
3     The winner of the match was slated to win $1 m...
4      Since AlphaGo won, Google DeepMind stated tha...
5      Lee received $170,000 ($150,000 for participa...
6     After the match, The Korea Baduk Association a...
7      It was given in recognition of AlphaGo\'s "si...
8      This match was chosen byScienceas one of the ...
9     BackgroundDifficult challenge in artificial in...
10     It has long been considered a difficult chall...
Name: content, dtype: object

In [35]:
#Create TFIDF for search sentences 

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(min_df = 1, stop_words = 'english')

ag_tfidf_term_matrix_sps = tfidf_vectorizer.fit_transform(alpha_go_df.content)

ag_tfidf_term_matrix_df = pd.DataFrame(ag_tfidf_term_matrix_sps.toarray(),
                                       index=alpha_go_df.content,
                                       columns=tfidf_vectorizer.get_feature_names())

In [37]:
#Get a random sentence from "AlphaGo versus Lee Sedol" page. 

ag_random_search_df = ag_tfidf_term_matrix_df.sample()
ag_random_search_df

Unnamed: 0_level_0,000,15,150,170,18,1997,20,2016,22,additional,...,thinking,time,unicef,versus,win,winner,winning,won,world,year
content,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
The winner of the match was slated to win $1 million,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.478484,0.478484,0.0,0.0,0.0,0.0


In [39]:
#Combine Search sentence to the machine learning TFIDF
ml_with_search_term = ml_tfidf_term_matrix_df.append(ag_random_search_df)

In [42]:
ml_with_search_term.fillna(value = 0.0, inplace=True)
pass 

In [46]:
ml_with_search_term.isnull().values.any()

False

In [49]:
#Compute SVD of Augmented Document Term Matrix 
from sklearn.decomposition import TruncatedSVD

n_components = 50
SVD = TruncatedSVD(n_components)
ml_component_names = ["component_"+str(i+1) for i in range(n_components)]

ml_svd_matrix = SVD.fit_transform(ml_with_search_term)

ml_svd_df = pd.DataFrame(ml_svd_matrix, 
                      index=ml_with_search_term.index, 
                      columns=ml_component_names)

In [50]:
#Find our search sentence and indentify its topic ratio 
ml_search_term_svd_vector = ml_svd_df.loc[ag_random_search_df.index]
ml_search_term_svd_vector[:1]

Unnamed: 0_level_0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,...,component_41,component_42,component_43,component_44,component_45,component_46,component_47,component_48,component_49,component_50
content,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
The winner of the match was slated to win $1 million,0.006572,0.014576,-0.008371,-0.000466,-0.0101,-0.011086,0.005374,-0.010181,-0.00992,0.001727,...,-0.049233,0.005342,-0.000349,-0.002035,-0.009223,-0.049225,5.8e-05,0.011011,-0.007566,0.020353


In [55]:
from sklearn.metrics.pairwise import cosine_similarity

ml_svd_df['cosine_sim'] = cosine_similarity(ml_svd_df, ml_search_term_svd_vector)

ml_svd_df[['cosine_sim']].sort_values('cosine_sim', ascending=False).head(6)

Unnamed: 0_level_0,cosine_sim
content,Unnamed: 1_level_1
The winner of the match was slated to win $1 million,1.0
"MysteryVibeis a British manufacturer of sex toys.HistoryMysteryVibe was founded by a group of researchers, engineers and designers. Inspired by trends in smartphones like Nokia Morph, the founders came up with the idea of creating a sex toy that would adapt to any body shape and vibrate to any pattern. They continued to research for a number of years before formally starting the company in May 2014, when they were incubated by London-based industrial design firm Seymourpowell.MysteryVibe released their iOS app on the Apple App Store in December 2015 and their Android app on Google Play in September 2016. The apps are designed without any adult themes to support MysteryVibe's wider goal of improving sex education for teenagers and are the only apps in their class to be rated 12+. Their apps have been downloaded more than 250,000 times since launch.MysteryVibe's flagship product, Crescendo, is the world's first vibrator that can be bent to adapt to any body shape. Crescendo was the first crowdfunding project to offer its backers 2 versions of their product:PilotandRetail. They ran what they called the #Pilot1000programme for their first 1,000 users to get feedback on their Crescendo product. The #Pilot1000users spanned 48 countries and included both backers and experts. MysteryVibe gave all 1,000 users full access to their founding CEO with direct email, phone and Skype. They then used the feedback they received to make the finalRetailCrescendo.InvestmentDue to the lean model adopted by MysteryVibe withcollectivesandcollaborations, they were able to build the company from a sketch to shipped products with less than £1m ($1.5m) in funding. They were also unique in raising 100% of the money from Angel investors without any recourse to Venture capital. As of Q2 2017, MysteryVibe has raised $3.5m in total funding.AwardsMysteryVibe has won numerous awards for their company, products and founders. Notable mentions are Red Dot, IDA Design, The Drum, and Excellence in Design. Their biggest recognition has been the Young Guns award.Virgin #VOOM2016In June 2016, MysteryVibe became the first pleasure product to be featured by Virgin in their #VOOM competition. They were showcased under the Export Awards category for exporting their products to over 50 countries worldwide. This led to their selection to the Hardware Club and a nomination for the 'Best Hardware Startup' award at The Europas in London.Media attentionMysteryVibe is the first brand in their category to have been featured on BBC. Ranked by European investors as No. 9 of the 100 Slush start-ups inCofounder Magazine, MysteryVibe has been named as one of the ""Top 100 Europe's hottest early-stage Founders"" byPathFounders,Europas. They have been listed at No. 7 in the ""12 days of start-ups: Spectacular businesses set for stardom in 2016"".References",0.810189
"Traxis a technology company headquartered in Singapore, with offices throughout APAC, Europe, Middle East, North America and South America. Its computer vision technology is used by FMCG companies such as Coca-Cola and Retailers to collect, measure and analyse what is happening on physical shelves.HistoryFounded in 2010, Trax has over 150 customers in the Retail and FMCG industries, including beverage giant Coca-Cola and brewer Anheuser-Busch InBev. Its service is available in 45 markets and the company's development centre is located in Tel Aviv. Trax closed its first round of funding for US$1.1 million, in June, 2011. They opened their Tel-Aviv office in July, 2012, and closed their second round of funding for US$6.4 million in December, 2012. Their third round of funding for US$15.7 million closed in February, 2014. In December 2014 Trax announced its fourth round of investment of US$15 million.In 2015, Trax opened their first two regional offices, London in January, and Brazil in April. In March 2016, Trax established their LATAM headquarters in Atlanta, Georgia. Trax announced a 5th round of funding for US$40 million on June 8, 2016. Two new regional offices were opened in Shanghai and Mexico City, in June and September 2016 respectively. On February 8, 2017, Trax closed their sixth round of funding for US$19.5 million. On June 30, 2017 Trax announced its most recent funding round of US$64 million lead by global private equity giant Warburg Pincus.Mergers and AcquisitionsOn July 12, 2017, Trax announced that they had acquired Nielsen Store Observation (NSO) assets in the USA from Nielsen Corporation.Software and ServicesTrax reduces the time an employee needs to spend on audits to check inventory, shelf display and product promotions. It is also gathers more extensive data such as product assortment, shelf space, pricing, promotions, shelf location and arrangement of products on display. This market intelligence is valuable to Retail and FMCG manufacturers because they pay large sums for space in supermarkets and stores. For example, in the US companies pay approximately $18 billion for shelf space.TechnologyThe computer vision technology uses Artificial Intelligence, fine-grained image recognition, and machine learning engines to convert store images into shelf insights. Trax is able to recognise products that are similar or identical such as branded drinks or shampoo bottles whilst also being able to differentiate between them based on variety and size. It piloted its machine learning algorithms with initial customers, allowing its algorithm to learn about different products. As the company processes more images, the better it gets at recognising the same products in different shapes and sizes.. To date, Trax has recognized more than 8 billion images, and recognizes approximately 400,000 million new products per month.ReferencesExternal LinksOfficial website",0.801146
"Qloo(pronounced ""clue"") is a company that uses artificial intelligence (AI). An application programming interface (API) provides cultural correlations. It was founded by Alex Elias and received funding from Leonardo DiCaprio, Barry Sternlicht and Pierre Lagrange.Qloo establishes consumer preference correlations via machine learning across multiple proprietary, customer and open-source data across cultural domains including music, film, television, dining, nightlife, fashion, books and travel. The recommender system uses AI to predict correlations for further applications.HistoryQloo was founded in 2012 by chief executive officer Alex Elias and chief operating officer Jay Alger. Elias was formerly a hedge fund manager with APE Capital. He graduated from the University of Southern California, and then developed his idea at law school at New York University. Alger was formerly the CEO of the digital agency Deepend.Qloo was tested on a private website in April 2012. In 2012, Qloo raised $1.4 million in seed funding from investors including Cedric the Entertainer, Danny Masterson, and venture capital firm Kindler Capital. Qloo had a public beta release in November 2012 after its initial funding.In 2013, the company raised an additional $1.6 million from Cross Creek Pictures founding partner Tommy Thompson, and Samih Toukan and Hussam Khoury, founders of Maktoob, an Internet services company purchased by Yahoo! for $164 million in 2009. On November 14, 2013, a website and an iPhone app were announced. The company later released an Android app, and tablet versions, in mid-2014.In 2016, Qloo secured $4.5 million in venture capital investment. The $4.5 million was split between a number of investors, including Barry Sternlicht, Pierre Lagrange and Leonardo DiCaprio. In July 2017, Qloo raised $6.5 million in funding rounds from AXA Strategic Ventures and Elton John.Following the investment, the founders stated in an interview with Tech Crunch that they would use the investment to expand Qloo's database. They hoped the move would secure larger contracts with corporate clients. At the time, clients already included Fortune 500 companies such as Twitter, PepsiCo and BMW.Services and featuresQloo calls itself a cultural AI platform to provide real-time correlation data across domains of culture and entertainment including: film, music, television, dining, nightlife, fashion, books and travel. Each category contains subcategories.Qloo’s knowledge of a user's taste in one category can be utilized to offer suggestions in other categories. Users then rate the suggestions, providing it with feedback for future suggestions. Qloo has partnerships with companies such as Expedia and iTunes.ReferencesExternal linksOfficial website",0.800099
"Cleverbotis a chatterbot web application that uses an artificial intelligence (AI) algorithm to have conversations with humans. It was created by British AI scientist Rollo Carpenter. It was preceded by Jabberwacky, a chatbot project that began in 1988 and went online in 1997. In its first decade, Cleverbot held several thousand conversations with Carpenter and his associates. Since launching on the web, the number of conversations held has exceeded 200 million. Besides the web application, Cleverbot is also available as an iOS, Android, and Windows Phone app.OperationUnlike some other chatterbots, Cleverbot's responses are not pre-programmed. Instead, it learns from human input: Humans type into the box below the Cleverbot logo and the system finds all keywords or an exact phrase matching the input. After searching through its saved conversations, it responds to the input by finding how a human responded to that input when it was asked, in part or in full, by Cleverbot.Cleverbot participated in a formal Turing test at the 2011 Techniche festival at the Indian Institute of Technology Guwahati on September 3, 2011. Out of the 334 votes cast, Cleverbot was judged to be 59.3% human, compared to the rating of 63.3% human achieved by human participants. A score of 50.05% or higher is often considered to be a passing grade. The software running for the event had to handle just 1 or 2 simultaneous requests, whereas online Cleverbot is usually talking to around 80,000 people at once.DevelopmentsCleverbot is constantly learning, growing in data size at a rate of 4 to 7 million interactions per second. Updates to the software have been mostly behind the scenes. In 2014, Cleverbot was upgraded to use GPU serving techniques. The program chooses how to respond to users fuzzily, the whole of the conversation being compared to the millions that have taken place before. Cleverbot now uses over 279 million interactions, about 3-4% of the data it has already accumulated. The developers of Cleverbot are attempting to build a new version using machine learning techniques.A significant part of the engine behind Cleverbot and an API for accessing it has been made available to developers in the form of Cleverscript. A service for directly accessing Cleverbot has been made available to developers in the form of Cleverbot.io.An app that uses the Cleverscript engine to play a game of 20 Questions, has been launched under the nameClevernator. Unlike other such games, the player asks the questions and it is the role of the AI to understand, and answer factually. An app that allows owners to create and talk to their own small Cleverbot-like AI has been launched, calledCleverme!for Apple products.In early 2017, a Twitch stream of two Google Home devices modified to talk to each other using Cleverbot.io garnered over 700,000 visitors and over 30,000 peak concurrent viewers.See alsoList of chatterbotsOmegleReferencesExternal linksOfficial websiteCleverscript websiteCleverbot.io websiteLivestream of 2 cleverbots chatting with each other on Twitch.tv",0.775899
"Prismais a photo-editing application that utilizes a neural network and artificial intelligence to transform the image into an artistic effect.The app was created by Alexey Moiseenkov (Russian:Алексей Моисеенков), Oleg Poyaganov, Ilya Frolov, Andrey Usoltsev and it was launched in June 2016 as a free mobile app. A week after its launch, the app gained popularity and received over 7.5 million downloads and over 1 million active users as of July 2016. It debuted on iOS on Apple App Store during the first week of June and it became the leading app at the App Store in Russia and other neighboring countries. On 19 July 2016, the developer launched a beta version of the app for Android and it closed few hours later by developers after receiving feedback from its users. It was later released publicly on 24 July 2016 on Google Play.In July 2016, the developer announced that the video and virtual reality version of the app is currently under development.On July 7, 2017, Prisma launched a new app called Sticky which turns selfies into stickers for sharing to your social feeds.HistoryThe app was created by the team led by Alexey Moiseenkov who also founded the Prisma labs, based in Moscow. Moiseenkov previously worked at Mail.Ru and later resigned from his job to dedicate his time for the development of the app. He said that the development of the app took only one and a half months and the team did not do anything to promote the app.The algorithm that powers the app is based on the open source programming and algorithms behind DeepArt.FeaturesUsers can upload pictures and select a variety of filters to transform the picture into an artistic effect. At launch, the app offered twenty different filters. Additional filters are added daily. In July 2016, Moiseenkov stated that the app will offer forty filters by the end of the month.The image rendering takes place in Prisma labs's servers and it uses a neural network and artificial intelligence to add the artistic effect. The result is delivered back to the user's phone. Unlike other photo editing apps, Prisma renders the image by going through different layers and recreating the image rather than inserting a layer over the image.In August 2016, the iOS version of the app was updated to edit image offline by utilizing the phone's processor for image rendering.ReceptionDownloadsOne week after its debut on iOS App Store, the app was downloaded over 7.5 million times and received over 1 million active users. It also became the top listed app in Russia and its neighboring countries. In the end of July 2016, it was installed over 12.5 million devices with over 1.5 million active users worldwide. According to App Annie, it was listed in the top 10 apps on the App Store in 77 different countries.On the first day of the Android version release, it received over 1.7 million downloads with 50 million pictures processed by the app.Research and technologyThe research paper behind the Prisma App technology is called ""A Neural Algorithm of Artistic Style"" by Leon Gatys, Alexander Ecker and Matthias Bethge and was presented at the premier machine learning conference: Neural Information Processing Systems (NIPS) in 2015. This technology was developed independently and before Prisma, and both the university and the company have no affiliation with one another.Further recent work developed by Stanford University: Perceptual Losses for Real-Time Style Transfer and Super-Resolution by Justin Johnson, Alexandre Alahi and Li Fei-Fei has also been able to create real-time style transfer through video.The code for the previous papers is available at no charge at GitHub for research purposes. The Prisma App (on the industrial front), and Style Transfer and Super Resolution (on the research front) has been made possible thanks to research and development in human perception, texture analysis, convolutional neural networks.See alsoList of Prisma (app) filtersReferencesExternal linksOfficial website",0.771186


#### Since the randomly selected sentence didn't contain any info on Alpha Go, the search came out with:
1. Mystery Vibels, a British sex toy manufacturer
2. Traxis, Vision Tech firm hq'd in Singapore
3. Qloo, Leaonardo DiCaprio funded Artifical Intelligence company
4. Cleverbotis, Chatterbot Web Application
5. Prismals, Photo-editing Application

### Let's repeat the same steps for Business software

In [54]:
#Wiki collection process for business software

bs_page_list = set(page_list("Category:Business software"))

bs_content_list=[]
for title in bs_page_list:
    bs_content_list.append(clean_content(title))

bs_page_list_2 = [x.replace('.',' ') for x in bs_page_list]

bs_new_list = []
for i in range(len(bs_page_list_2)):
    bs_new_dict = {bs_page_list_2[i]:bs_content_list[i]}
    bs_new_list.append(bs_new_dict)

for i in bs_new_list:
    db_wiki_bs_ref.insert_one(i)

In [30]:
bs_dict = list(db_wiki_bs_ref.find())

In [57]:
#count for Business software pages 
db_wiki_bs_ref.count(), db_wiki_bs_whole_ref.count()

(4583, 4584)

In [103]:
bs_page_list = set(page_list("Category:Business software"))

In [112]:
for x in list(bs_page_list):
    temp_df = get_content_df(x)
    bs_content_df = bs_content_df.append(temp_df)

In [105]:
bs_content_df=pd.DataFrame()

In [114]:
bs_content_df.to_pickle("bs_content_df")

In [60]:
bs_content_df = pd.read_pickle("bs_content_df")

In [61]:
bs_content_df.sample(5)

Unnamed: 0,page_id,title,content
0,21670195,Java_Persistence_Query_Language,TheJava Persistence Query Language(JPQL) is a ...
0,469578,Decision_support_system,Adecision support system(DSS) is a system base...
0,32039577,LibreOffice_Writer,LibreOffice Writeris the free and open-source ...
0,24902683,Office_Open_XML_file_formats,TheOffice Open XML file formatsare a set of fi...
0,4325491,Bing_(search_engine),Bingis a web search engine owned and operated ...


In [64]:
#create a label for target

le = LabelEncoder()
bs_content_df['title_num'] = le.fit_transform(bs_content_df['title'])

In [69]:
#Prepare TFIDF Term Frequency * inverse Document Frequency

bs_tfidf_vectorizer = TfidfVectorizer(min_df = 1, stop_words = 'english')

bs_tfidf_term_matrix_sps = bs_tfidf_vectorizer.fit_transform(bs_content_df.content)

bs_tfidf_term_matrix_df = pd.DataFrame(bs_tfidf_term_matrix_sps.toarray(),
                                       index=bs_content_df.content,
                                       columns=bs_tfidf_vectorizer.get_feature_names())

MemoryError: 