In [1]:
#loading libraries
import pandas as pd 
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")


# loading the data from json file
import json   
with open("ArticleDataset.json") as f:
  data = json.load(f)

In [2]:
# converting into dataframe from json format
data = pd.DataFrame(data) 

In [3]:
data.head() # printing first 5 roes of the data

Unnamed: 0,date,title,text,url,unknown
0,2020-01-22,TE Connectivity introduces M12 industrial Ethe...,TE Connectivity (TE) now offers M12 data cable...,https://www.automationmag.com/te-connectivity-...,empty
1,2020-01-22,Kirigami designs hold thousands of times their...,"The Japanese art of origami (from ori, folding...",https://techxplore.com/news/2020-01-kirigami-t...,empty
2,2020-01-22,GM's Cruise heads down new road with new robot...,General Motors' self-driving car company will ...,https://techxplore.com/news/2020-01-gm-cruise-...,empty
3,2020-01-22,First fully integrated flexible electronics ma...,Human skin is a fascinating multifunctional or...,https://techxplore.com/news/2020-01-fully-flex...,empty
4,2020-01-22,Study says that we trust our workplace robots,The only constant is change. Presumptions hard...,https://techxplore.com/news/2020-01-workplace-...,empty


In [4]:
text = data.text # assigning the our text data to text varible 

In [5]:
text # printing the text column

0       TE Connectivity (TE) now offers M12 data cable...
1       The Japanese art of origami (from ori, folding...
2       General Motors' self-driving car company will ...
3       Human skin is a fascinating multifunctional or...
4       The only constant is change. Presumptions hard...
                              ...                        
8431    About The Author\nFrank Tobe\nFrank Tobe is th...
8432    Frank Tobe\nFrank Tobe is the founder of The R...
8433    Frank Tobe\nFrank Tobe is the founder of The R...
8434    Frank Tobe\nFrank Tobe is the founder of The R...
8435    Frank Tobe\nFrank Tobe is the founder of The R...
Name: text, Length: 8436, dtype: object

In [6]:
len(text) # no of rows

8436

In [7]:
#function to remove the html tags from text 

def cleanhtml(sentence):             
    cleanr = re.compile('<.*?>')  
    cleantext = re.sub(cleanr,' ',sentence)
    return cleantext
sentence = 'I am <abc> siva'
cleanhtml(sentence)

'I am   siva'

In [8]:
# function to remove punctuation from text
import re
def cleanpunc(sentence):
    cleaned = re.sub('[?|!|\'|"|#]',' ',sentence)
    cleaned = re.sub('[.|,|(|)|\|/]',' ',cleaned)
    return cleaned
sentence = 'I am?(Siva'
cleanpunc(sentence)

'I am  Siva'

In [9]:
snostem = nltk.stem.SnowballStemmer('english')
def text_preprocessing(final): 
    str=''
    final_string=[]
    s=''
    for sent in tqdm(final.values):
        filtered_sentence=[]
        sent = cleanhtml(sent)
        for w in sent.split():
            for cleaned_words in cleanpunc(w).split(): #after cleanpunc again one word may become two or multiple so they'll be split and stored as different words
                if((cleaned_words.isalpha()&(len(cleaned_words)>2))):
                    #if(cleaned_words.lower() not in stop):
                    s = (snostem.stem(cleaned_words.lower()).encode('utf8'))
                    filtered_sentence.append(s)
        str = b" ".join(filtered_sentence)
        final_string.append(str)
    return final_string

final_text = []
final_text = text_preprocessing(text)

100%|█████████████████████████████████████████████████████████████████████████████| 8436/8436 [01:12<00:00, 116.14it/s]


In [10]:
final_text[0] # 

b'connect now offer data cabl assembl that support ethernet protocol part it packag industri ethernet and fieldbus solut user the new industri ethernet cabl assembl can choos from wide rang featur jacket for exampl pvc pur other frnc flame retard materi pvc provid competit and econom solut for occasion movement vibrat but lack area such flex properti while pur more suit more demand applic for exampl drag chain and harsh environ where oper reliabl crucial depend the materi the assembl withstand million flex machin oil abras and radiat compact hous with crimp flang crimp sleev provid shield against emi rfi interfer the hexagon crimp the sleev give and cabl strain relief accord with iec categori class well safe shield termin categori cabl can realiz transmiss rate meet network requir the field and supervisori manag level the new assembl are avail with sever standard cabl length option from through'

In [11]:
final_text[8435]

b'frank tobe frank tobe the founder the robot report and robo global which has develop track index for the robot industri the robo robot autom index the index compani track and captur the entir econom valu this global opportun robot autom and enabl technolog this site use akismet reduc spam learn how your comment data process'

In [12]:
#fitting the data into tfidf vector 
from sklearn.feature_extraction.text import TfidfVectorizer
count_vect = TfidfVectorizer()
final_tfidf = count_vect.fit_transform(final_text)

In [13]:
final_tfidf.shape # 8436 rows and 45005 dimensions

(8436, 45005)

In [14]:
# reading the query data
query_data = "quantum computing software from IBM"
text_preprocessing(pd.Series(query_data))

100%|████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?it/s]


[b'quantum comput softwar from ibm']

In [15]:
# transforming the query data to vector 
query_tfidf = count_vect.transform([query_data])
query_tfidf.shape

(1, 45005)

In [16]:
data.reset_index(inplace=True)

In [17]:
# finding the nearest 100 article to given query using cosine similarity

In [18]:
def nearest_neighbors(values, all_values, nbr_neighbors=100):
    
    nn = NearestNeighbors(nbr_neighbors, metric='cosine', algorithm='brute').fit(all_values) #cosine similarity 
    
    dists, idxs = nn.kneighbors(values)
    
    print(dists)
    print(idxs)
    return idxs

from sklearn.neighbors import NearestNeighbors

ids = nearest_neighbors(query_tfidf, final_tfidf)


data.loc[list(ids[0])]

[[0.17432813 0.65984195 0.66397505 0.71679408 0.74733302 0.74939402
  0.76618151 0.79588156 0.79606815 0.80652538 0.81848351 0.82597695
  0.84210089 0.87356901 0.87891046 0.88317986 0.88675961 0.88844694
  0.89212404 0.89413136 0.89511236 0.89941679 0.89955203 0.90095808
  0.90105045 0.9024188  0.90296239 0.90325462 0.90866678 0.90978494
  0.91543839 0.91572471 0.91593111 0.92162084 0.92214051 0.92448259
  0.92686756 0.9273293  0.92779326 0.93011002 0.93560343 0.938978
  0.9395355  0.94066074 0.94131146 0.94223812 0.94545449 0.9476894
  0.95020612 0.95058628 0.95079163 0.95129797 0.95192677 0.95295388
  0.95365108 0.95374203 0.95418391 0.9550015  0.95615702 0.95683299
  0.95837422 0.95938264 0.95957363 0.9595957  0.95971917 0.95981608
  0.96069942 0.96073339 0.96112341 0.96122988 0.96182312 0.96184146
  0.96251472 0.96286939 0.96322265 0.96376188 0.96463725 0.96514566
  0.96605571 0.96606515 0.96625335 0.96653667 0.96693388 0.96823392
  0.96861912 0.96871514 0.96888243 0.96950773 0.969

Unnamed: 0,index,date,title,text,url,unknown
1557,1557,2019-03-04,IBM unveils new commercial Q System One quantu...,IBM has released the world's first-ever commer...,https://www.themanufacturer.com/articles/ibm-u...,empty
5894,5894,2015-03-31,IBM Investing Billions in 'Internet of Things',NEW YORK - IBM (IW 500/11) announced Tuesday i...,https://www.industryweek.com/technology-and-ii...,empty
228,228,2019-12-10,Intel Introduces cryogenic control chip 'Horse...,Intel has announced the development of a cryog...,https://techxplore.com/news/2019-12-intel-cryo...,empty
353,353,2019-11-18,An artificial intelligence algorithm can learn...,Artificial intelligence can be used to predict...,https://techxplore.com/news/2019-11-artificial...,empty
3578,3578,2017-09-11,IBM and MIT partner on AI research lab,IBM and the Mass. Institute of Technology plan...,https://www.therobotreport.com/ibm-mit-partner...,empty
...,...,...,...,...,...,...
6396,6396,2013-10-14,Hard Road to World Domination for Chinese Firms,"BEIJING - In the global contest for business, ...",https://www.industryweek.com/the-economy/compe...,empty
7012,7012,1999-03-16,What's Really Driving Apple's Recovery,Steve Jobs lured Timothy D. Cook from industry...,https://www.industryweek.com/leadership/compan...,empty
6149,6149,2014-08-12,What is the Samsung Way?,“Samsung went from a no-name company in the gl...,https://www.industryweek.com/leadership/compan...,empty
2385,2385,2018-08-08,July 2018 Robotics and Intelligent Systems Inv...,Investment into robotics and intelligent syste...,https://www.therobotreport.com/july-2018-robot...,empty


In [19]:
final_text[1557]

b'ibm has releas the world commerci quantum comput the system one howev more progress need befor will take over from today super comput updat march ibm has announc that has achiev the best perform from quantum comput date scientif mileston that includ the lowest error rate has ever measur the high unstabl realm quantum comput accord ibm research perform was measur via quantum metric that account for gate and measur error well devic cross talk and connect and circuit softwar compil effici recent unveil ibm system one quantum comput with processor has produc quantum volum rough doubl that the current ibm ibm network devic which have quantum volum the result mean perform has doubl everi year sinc the result will present the american physic societi ap meet today march quantum volum follow law for quantum comput and doubl annual for the next decad then quantum comput will more practic and reach the desir quantum advantag status quantum advantag refer the point where quantum applic deliv sig

In [20]:
final_text[228]

b'intel has announc the develop cryogen control chip call hors ridg the chip can control multipl qubit quantum comput it announc intel claim that develop the chip repres major mileston the path toward truli viabl quantum comput also part it announc intel claim that other player the quantum comput develop world have neglect import part ani such way control mani qubit the same time intel report that they develop the new chip collabor with delft and tno use technolog develop they suggest the new chip will dramat increas the potenti for develop truli use quantum comput quantum comput are base qubit which are notori unstabl make them more stabl quantum comput engin build redund thus prevent error from ruin whi mani qubit are need the qubit are hous freezer that keep them veri near absolut zero they are control microwav puls that origin outsid the freezer this mean that each unit has host cabl stick out connect extern control intel new chip goe insid the freezer with the qubit elimin the cab

In [21]:
final_text[5894]

b'new york ibm announc tuesday was pump billion into new divis focus the grow market for connect devic the internet thing the move add ibm resourc the market for smart devic such refriger car cloth and other object which can connect the internet link smartphon ibm will make it analyt avail maker these devic help improv their function these resourc will made avail open platform provid manufactur with the abil design and produc new generat connect devic that are better optim for the iot ibm said statement our knowledg the world grow with everi connect sensor and devic but too often are not act even when know can ensur better result said bob picciano senior vice presid ibm analyt ibm estim that all data generat devic such smartphon tablet connect vehicl and applianc never analyz act and that much the data begin lose valu not quick analyz one exampl ibm said would introduc servic that help insur compani extract insight from connect vehicl separ but relat announc ibm said would team with th