In [1]:
import os
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Fetch the NLTK resources this notebook relies on: the 'punkt_tab' tokenizer data
# (word_tokenize is used in clean_text) and the 'stopwords' corpus (stopwords.words).
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

### Corpus de documents

In [3]:
# Location of the BBC articles CSV (columns shown below: Article, Document, Label).
DATASET_PATH = "/content/drive/MyDrive/dataset/bbc_dataset.csv"
data = pd.read_csv(DATASET_PATH)
data.head()

Unnamed: 0,Article,Document,Label
0,Broadband challenges TV viewing,The number of Europeans with broadband has exp...,tech
1,Freeze on anti-spam campaign,A campaign by Lycos Europe to target spam-rela...,tech
2,ITunes user sues Apple over iPod,A user of Apple's iTunes music service is suin...,tech
3,A decade of good website design,The web looks very different today than it did...,tech
4,Loyalty cards idea for TV addicts,Viewers could soon be rewarded for watching TV...,tech


In [4]:
# Remove fully duplicated rows, then report the corpus size and the class balance.
data = data.drop_duplicates()
print(f"il y a {len(data)} documents")
print(data['Label'].value_counts())


il y a 2125 documents
Label
sport            503
business         503
politics         403
entertainment    369
tech             347
Name: count, dtype: int64


### Nettoyage des documents

In [5]:
stemmer = PorterStemmer()
# Built once at definition time: the original called stopwords.words('english') for every
# token and scanned the resulting list; a frozenset gives constant-time membership tests.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def clean_text(text):
    """Normalise a raw document (or query) into a space-separated string of stems.

    Steps, in order: strip digit runs, strip every character that is neither a
    word character nor whitespace, lowercase, tokenize with ``word_tokenize``,
    drop English stopwords, and stem each remaining token with the Porter stemmer.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (empty string if nothing remains).
    """
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text.lower())
    return " ".join(
        stemmer.stem(token) for token in tokens if token not in _ENGLISH_STOPWORDS
    )

In [6]:
# Work on a cleaned copy so `data` keeps the original article text for display.
corpus = data.assign(Document=data['Document'].apply(clean_text))
corpus.head()

Unnamed: 0,Article,Document,Label
0,Broadband challenges TV viewing,number european broadband explod past month we...,tech
1,Freeze on anti-spam campaign,campaign lyco europ target spamrel websit appe...,tech
2,ITunes user sues Apple over iPod,user appl itun music servic su firm say unfair...,tech
3,A decade of good website design,web look differ today year ago back yahoo laun...,tech
4,Loyalty cards idea for TV addicts,viewer could soon reward watch tv loyalti card...,tech


### Indexation TF-IDF

In [7]:
# Fit TF-IDF on the cleaned documents; fit_transform returns one row per document.
vectorizer = TfidfVectorizer()
term_doc_matrix = vectorizer.fit_transform(corpus['Document'])
feature_names = vectorizer.get_feature_names_out()
# Dense, labelled view of the same weights (one column per vocabulary term).
tfidf_matrix = pd.DataFrame(term_doc_matrix.toarray(), columns=feature_names)
tfidf_matrix.head()

Unnamed: 0,aa,aaa,aac,aadc,aaliyah,aaltra,aamir,aara,aarhu,aaron,...,zuton,zvonareva,zvyagintsev,âand,âayear,âbn,âbnbn,âm,âmplu,âmâm
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.07331,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Topic Modeling (LSA)

In [8]:
# LSA: project the TF-IDF rows onto a small number of latent topics.
N_TOPICS = 5
SEED = 42
svd = TruncatedSVD(n_components=N_TOPICS, random_state=SEED)
lsi_matrix = svd.fit_transform(term_doc_matrix)
def get_top_words(svd, feature_names, n_top_words=10):
    """Return, for each component of ``svd``, its highest-weighted terms.

    Parameters
    ----------
    svd : object exposing ``components_``
        Each row of ``components_`` holds one weight per vocabulary term.
    feature_names : sequence of str
        Vocabulary terms, indexed like the columns of ``components_``.
    n_top_words : int, default 10
        Number of terms to keep per component.

    Returns
    -------
    list of list
        One list per component, terms ordered from largest to smallest weight.
    """
    return [
        [feature_names[term_idx] for term_idx in weights.argsort()[::-1][:n_top_words]]
        for weights in svd.components_
    ]
# Show the dominant terms of every latent topic, numbered from 1.
topic_words = get_top_words(svd, feature_names)
for topic_number, words in enumerate(topic_words, start=1):
    print(f"Topic {topic_number}: {', '.join(words)}")

Topic 1: said, mr, would, year, game, peopl, labour, us, govern, elect
Topic 2: mr, labour, elect, parti, blair, tax, tori, brown, govern, minist
Topic 3: labour, win, elect, england, blair, play, parti, game, mr, tori
Topic 4: film, award, best, nomin, oscar, actor, star, actress, music, festiv
Topic 5: mobil, phone, use, technolog, peopl, game, user, servic, comput, music


### Requête


1.   Analyse de la requête
2.   Détection des topics
3.   Documents pertinents



In [9]:
def find_similar_docs(query_vector, lsi_matrix, top_n=10):
    """Rank documents by cosine similarity to the query in the LSA space.

    Parameters
    ----------
    query_vector : array-like
        Query projected into the same space as ``lsi_matrix``. The similarity
        matrix is flattened, so a single query row is expected.
    lsi_matrix : array-like
        One row per document.
    top_n : int or None, default 10
        Number of best matches to return; ``None`` returns every document.

    Returns
    -------
    tuple
        ``(indices, similarities)``: row positions of the best matches in
        decreasing similarity order, and the similarity of every document.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n is not None and top_n < 0:
        raise ValueError("top_n must be non-negative or None")
    similarities = cosine_similarity(query_vector, lsi_matrix).flatten()
    ranked_indices = similarities.argsort()[::-1]
    if top_n is not None:
        ranked_indices = ranked_indices[:top_n]
    return ranked_indices, similarities

In [10]:
# Without topic modeling: rank documents directly on their TF-IDF vectors.
def find_similar_docssans(query_vector, tfidf_matrix, top_n=10):
    """Rank documents by cosine similarity to the query in the TF-IDF space.

    Parameters
    ----------
    query_vector : array-like
        TF-IDF vector of the query. The similarity matrix is flattened, so a
        single query row is expected.
    tfidf_matrix : array-like
        One row per document.
    top_n : int or None, default 10
        Number of best matches to return; ``None`` returns every document.

    Returns
    -------
    tuple
        ``(indices, similarities)``: row positions of the best matches in
        decreasing similarity order, and the similarity of every document.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n is not None and top_n < 0:
        raise ValueError("top_n must be non-negative or None")
    similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()
    ranked_indices = similarities.argsort()[::-1]
    if top_n is not None:
        ranked_indices = ranked_indices[:top_n]
    return ranked_indices, similarities

In [29]:
query_text = "Microsoft"
# The query must go through the same cleaning/stemming as the corpus before being
# vectorised; previously the result of clean_text() was discarded and the raw text was used.
query_clean = clean_text(query_text)
query_vector = vectorizer.transform([query_clean])
# Project the TF-IDF query into the LSA topic space used by lsi_matrix.
query_vector = svd.transform(query_vector)
similar_docs, similarities = find_similar_docs(query_vector, lsi_matrix)
print(f"Requête : {query_text}")
for doc_index in similar_docs:
    # doc_index is a row position, hence .iloc on the original (uncleaned) text.
    print(f"  Document {doc_index} : {similarities[doc_index]} : {data['Document'].iloc[doc_index]}")

Requête : Microsoft
  Document 213 : 0.9777662464891781 : A mobile phone that recognises and responds to movements has been launched in Japan. The motion-sensitive phone - officially titled the V603SH - was developed by Sharp and launched by Vodafone's Japanese division. Devised mainly for mobile gaming, users can also access other phone functions using a pre-set pattern of arm movements. The phone will allow golf fans to improve their swing via a golfing game. Those who prefer shoot-'em-ups will be able to use the phone like a gun to shoot the zombies in the mobile version of Sega's House of the Dead. The phone comes with a tiny motion-control sensor, a computer chip that responds to movement. Other features include a display screen that allows users to watch TV and can rotate 180 degrees. It also doubles up as an electronic musical instrument. Users have to select a sound from a menu that includes clapping, tambourine and maracas and shake their phone to create a beat. It is being re

In [19]:
query_text = "Microsoft"
# The query must go through the same cleaning/stemming as the corpus before being
# vectorised; previously the result of clean_text() was discarded and the raw text was used.
query_clean = clean_text(query_text)
query_vector = vectorizer.transform([query_clean])
# Baseline without topic modeling: compare directly against the TF-IDF rows.
similar_docs, similarities = find_similar_docssans(query_vector, tfidf_matrix)
print(f"Requête : {query_text}")
for doc_index in similar_docs:
    # doc_index is a row position, hence .iloc on the original (uncleaned) text.
    print(f"  Document {doc_index} : {similarities[doc_index]} : {data['Document'].iloc[doc_index]}")

Requête : Microsoft
  Document 66 : 0.3486118404370454 : Windows users could soon be paying Microsoft to keep PCs free of spyware. Following the takeover of anti-spyware firm Giant, Microsoft said it would soon release a toolkit that strips machines of the irritating programs. Although initially free, Microsoft has not ruled out charging people who want to keep this toolkit up to date. Surveys show that almost every Windows PC is infested with spyware programs that do everything from bombard users with adverts to steal login data. Microsoft said that a beta version of the toolkit to clean up Windows machines should be available within 30 days. Designed for PCs running Windows 2000 and XP, the utility will clean out spyware programs, constantly monitor what happens on a PC and will be regularly updated to catch the latest variants. Before now many of Microsoft's other security boosting programs, such as the firewall in Windows XP, have been given away free. But Mike Nash, vice president

### Implémentation du moteur de recherche

In [None]:
def find_similar_docs_2(query_vector, lsi_matrix):
    """Rank every document by cosine similarity to the query.

    Returns a pair ``(indices, similarities)``: all row positions ordered from
    most to least similar, and the flattened similarity score of each document.
    """
    score_matrix = cosine_similarity(query_vector, lsi_matrix)
    similarities = score_matrix.flatten()
    ranking = similarities.argsort()
    return ranking[::-1], similarities

In [None]:
!pip install flask pyngrok

In [None]:
!pip install flask-ngrok

In [None]:
# Folders handed to Flask below: HTML templates and static assets respectively.
file = "/content/drive/MyDrive/templates"
files = "/content/drive/MyDrive/static"

In [None]:
def query_process(query):
    """Turn a raw user query into its LSA-space vector.

    The query is cleaned with ``clean_text``, vectorised with the fitted TF-IDF
    ``vectorizer`` and projected with the fitted ``svd``.
    """
    cleaned_query = clean_text(query)
    tfidf_query = vectorizer.transform([cleaned_query])
    return svd.transform(tfidf_query)

In [None]:
from flask import Flask, render_template,request
from pyngrok import ngrok,conf
from flask_ngrok import run_with_ngrok
# Never commit the ngrok auth token: read it from the environment instead.
# NOTE(review): the token previously hard-coded here has been exposed and should be revoked.
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTHTOKEN")
if not NGROK_AUTH_TOKEN:
    raise RuntimeError("Set the NGROK_AUTHTOKEN environment variable before opening the tunnel.")
conf.get_default().auth_token = NGROK_AUTH_TOKEN

In [None]:
# Flask application for the search UI; templates and static assets are loaded from the
# folders named by `file` and `files`.
app = Flask(__name__,template_folder=file,static_folder=files)
# NOTE(review): run_with_ngrok (flask_ngrok) is applied here while the launch cell also
# calls pyngrok's ngrok.connect(5000) — presumably only one tunnelling mechanism is
# needed; confirm which one is intended.
run_with_ngrok(app)
@app.route('/')
def home():
    """Render the search page with no query and no results."""
    landing_page = render_template("index_search.html")
    return landing_page
@app.route('/search',methods=['POST'])
def search():
    """Handle a search form submission and render the matching documents.

    Reads the ``query`` form field, projects it into the LSA space, ranks every
    document by cosine similarity and keeps those scoring above the threshold.
    The template receives ``results`` (row position -> article title and text,
    in decreasing similarity order), ``requete`` (the raw query) and
    ``results_number`` (how many documents were kept).
    """
    # Minimum LSA cosine similarity for a document to be listed.
    similarity_threshold = 0.88

    # The route only accepts POST, so no method check (or fallback render) is needed.
    query = request.form['query']
    query_vector = query_process(query)
    similar_docs, similarities = find_similar_docs_2(query_vector, lsi_matrix)

    # similar_docs is already sorted best-first; dict insertion order keeps that ranking.
    result_dict = {
        doc_index: {
            'article': data['Article'].iloc[doc_index],
            'document': data['Document'].iloc[doc_index],
        }
        for doc_index in similar_docs
        if similarities[doc_index] > similarity_threshold
    }
    return render_template(
        "index_search.html",
        results=result_dict,
        requete=query,
        results_number=len(result_dict),
    )

In [None]:
import time
if __name__ == '__main__':
    # Keep the tunnel handle and print it: the value returned by ngrok.connect() was
    # previously discarded, so the public address of the tunnel was never shown.
    tunnel = ngrok.connect(5000)
    print(f"Tunnel ngrok : {tunnel}")
    # Short pause before starting the Flask server (kept from the original cell).
    time.sleep(5)
    app.run()

