In [1]:
import nltk
from nltk.corpus import stopwords
import string
from nltk.stem import PorterStemmer
stemmer=PorterStemmer()
from collections import Counter
import math
import pandas as pd
import os
os.getcwd()

def indexation(corpus, x):
    if isinstance(corpus, str):
        corpus = [corpus] 

    table = str.maketrans('', '', string.punctuation) 
    docs = [doc.lower().translate(table).split() for doc in corpus]

    clean_list = []
    stop_words = set(stopwords.words('english'))
    for d in docs:
        sub_list = [t for t in d if t not in stop_words]
        clean_list.append(sub_list)

    stems = []
    for d in clean_list:
        sub_list = [stemmer.stem(t) for t in d]
        stems.append(sub_list)

    tf_all = []
    for d in stems:
        total_terms = len(d)
        counts = Counter(d)
        tf = {t: counts[t] / total_terms for t in counts}
        tf_all.append(tf)

    D = len(stems)
    all_terms = set([word for doc in stems for word in doc])
    doc_freq = {term: sum(1 for doc in stems if term in doc) for term in all_terms}
    idf = {term: math.log10(D / doc_freq[term]) for term in doc_freq}

    tf_idf = []
    for d in tf_all:
        row = {t: d.get(t, 0) * idf[t] for t in all_terms}
        tf_idf.append(row)

    df = pd.DataFrame(tf_all)
    df.index = [f"Doc{i+1}" for i in range(len(tf_all))]
    df.to_excel(x, index=True, engine='openpyxl')
    return (os.getcwd(), len(df.columns))


In [2]:
corpus = [
    "Artificial intelligence is transforming the world.",
    "Machine learning is a branch of artificial intelligence.",
    "Deep learning models require large amounts of data.",
    "Natural language processing enables computers to understand text.",
    "AI applications include speech recognition and computer vision."
]
indexation(corpus,'matrice.xlsx')

('C:\\Users\\ACER\\search engine', 26)

In [3]:
requete = "applications of artificial intelligence in language processing"

indexation(requete,'requete.xlsx')

('C:\\Users\\ACER\\search engine', 5)

In [4]:
import pandas as pd

def produit_interne(corpus_file, query_file, output_file):
    df_corpus = pd.read_excel(corpus_file, index_col=0)
    df_query = pd.read_excel(query_file, index_col=0)

    all_terms = sorted(set(df_corpus.columns) | set(df_query.columns))
    df_corpus = df_corpus.reindex(columns=all_terms, fill_value=0)
    df_query = df_query.reindex(columns=all_terms, fill_value=0)
    
    df_corpus = df_corpus.apply(pd.to_numeric, errors='coerce').fillna(0)
    df_query = df_query.apply(pd.to_numeric, errors='coerce').fillna(0)

    q = df_query.iloc[0].values
   
    
    scores = df_corpus.dot(q) 
    scores=scores[scores !=0]
    results = scores.sort_values(ascending=False)
    results_df = pd.DataFrame(results, columns=["Similarity"])

    results_df.to_excel(output_file, engine='openpyxl')

    print("Produit interne calcule :", output_file)
    return results_df



In [5]:

results = produit_interne('matrice.xlsx', 'requete.xlsx', 'resultat.xlsx')

print(results)

Produit interne calcule : resultat.xlsx
      Similarity
Doc1    0.100000
Doc2    0.080000
Doc4    0.057143
Doc5    0.028571


In [6]:
import pandas as pd
import numpy as np

def cosine_similarity(corpus_file, query_file, output_file):
    df_corpus = pd.read_excel(corpus_file, index_col=0)
    df_query = pd.read_excel(query_file, index_col=0)

    all_terms = sorted(set(df_corpus.columns) | set(df_query.columns))
    df_corpus = df_corpus.reindex(columns=all_terms, fill_value=0)
    df_query = df_query.reindex(columns=all_terms, fill_value=0)

    df_corpus = df_corpus.apply(pd.to_numeric, errors='coerce').fillna(0)
    df_query = df_query.apply(pd.to_numeric, errors='coerce').fillna(0)

    q = df_query.iloc[0].values
    q_norm = np.linalg.norm(q)

    sims = []
    for i in range(len(df_corpus)):
        d = df_corpus.iloc[i].values
        d_norm = np.linalg.norm(d)
        if d_norm == 0 or q_norm == 0:
            sim = 0
        else:
            sim = np.dot(d, q) / math.sqrt(d_norm * q_norm)
        sims.append(sim)
    
    similarities = pd.Series(sims, index=df_corpus.index).sort_values(ascending=False)
    similarities=similarities[similarities !=0]
    similarities.to_excel(output_file, header=["CosineSimilarity"], engine='openpyxl')

    print("Cosine similarity ", output_file)
    return similarities


In [7]:
cosine_similarity('matrice.xlsx', 'requete.xlsx','cosine.xlsx')

Cosine similarity  cosine.xlsx


Doc1    0.211474
Doc2    0.178885
Doc4    0.138989
Doc5    0.069494
dtype: float64

In [9]:
import re

def read_cranfield_bodies(file_path):
    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
        text = f.read()
    raw_docs = re.split(r"\.I\s+\d+", text)
    raw_docs = [doc.strip() for doc in raw_docs if doc.strip()]

    corpus = []
    for raw in raw_docs:
        match = re.search(r"\.W\s*(.*)", raw, re.S)
        if match:
            body = re.sub(r"\s+", " ", match.group(1).strip())
            corpus.append(body)

    return corpus

cor = read_cranfield_bodies(r"C:\Users\ACER\search engine\cran.all.1400")


In [10]:
indexation(cor,'file.xlsx')

('C:\\Users\\ACER\\search engine', 6393)

In [11]:
req='what similarity laws must be obeyed when constructing aeroelastic models of heated high speed aircraft '
indexation(req,'req.xlsx')

('C:\\Users\\ACER\\search engine', 11)

In [12]:
cosine_similarity('file.xlsx', 'req.xlsx', 'output.xlsx')

Cosine similarity  output.xlsx


Doc51      0.095589
Doc879     0.063771
Doc878     0.061727
Doc874     0.059944
Doc486     0.058351
             ...   
Doc199     0.002166
Doc1239    0.002112
Doc189     0.002071
Doc417     0.002013
Doc673     0.001764
Length: 797, dtype: float64