In [1]:
import nltk
from nltk.corpus import stopwords
import string
# Toy corpus of three short English sentences. The first and third deliberately
# contain stray punctuation ("." and ",") that tokenize() has to strip.
corpus = [
    "The player. loves to run, and play ",
    "Running in the sun keeps you healthy",
    "Playing outside makes a boy , healthy and happy"
]
def tokenize(corpus):
    """Lower-case each document, strip ASCII punctuation and split on whitespace.

    Args:
        corpus: iterable of raw document strings.

    Returns:
        A list with one list of tokens per input document, in input order.
    """
    # Translation table mapping every character of string.punctuation to None.
    strip_punctuation = str.maketrans(dict.fromkeys(string.punctuation))

    tokenized = []
    for text in corpus:
        tokenized.append(text.lower().translate(strip_punctuation).split())
    return tokenized
    
# Tokenize the toy corpus; the bare name on the last line displays the result.
docs=tokenize(corpus)

docs


[['the', 'player', 'loves', 'to', 'run', 'and', 'play'],
 ['running', 'in', 'the', 'sun', 'keeps', 'you', 'healthy'],
 ['playing', 'outside', 'makes', 'a', 'boy', 'healthy', 'and', 'happy']]

In [2]:
def stop_wordss(docs):
    """Remove English stop words from every tokenized document.

    Args:
        docs: list of token lists (as produced by tokenize()).

    Returns:
        A new list of token lists with the tokens found in NLTK's English
        stop-word list removed; document order and token order are kept.
    """
    # Build the lookup once per call; membership tests on a set are O(1).
    english_stops = frozenset(stopwords.words('english'))
    return [
        [token for token in tokens if token not in english_stops]
        for tokens in docs
    ]
    

# Tokenize the corpus, then filter out stop words.
stop_wordss(tokenize(corpus))


[['player', 'loves', 'run', 'play'],
 ['running', 'sun', 'keeps', 'healthy'],
 ['playing', 'outside', 'makes', 'boy', 'healthy', 'happy']]

In [3]:
from textblob import Word
# Lemmatize every token that survives stop-word removal, keeping the
# per-document grouping. lemmatize() is called without a part-of-speech tag.
lemmas = [
    [Word(token).lemmatize() for token in tokens]
    for tokens in stop_wordss(tokenize(corpus))
]
print(lemmas)


[['player', 'love', 'run', 'play'], ['running', 'sun', 'keep', 'healthy'], ['playing', 'outside', 'make', 'boy', 'healthy', 'happy']]


In [4]:
from nltk.stem import PorterStemmer
# Single module-level Porter stemmer instance, used by stemmingg().
stemmer=PorterStemmer()
def stemmingg(clean_list):
    """Stem every token of every document with the module-level ``stemmer``.

    Args:
        clean_list: list of token lists (typically already stop-word filtered).

    Returns:
        A new list of lists holding ``stemmer.stem(token)`` for each token,
        with the same document and token order as the input.
    """
    return [
        [stemmer.stem(token) for token in tokens]
        for tokens in clean_list
    ]
    
# Full preprocessing pipeline: tokenize -> drop stop words -> stem.
stemmingg(stop_wordss(tokenize(corpus)))


[['player', 'love', 'run', 'play'],
 ['run', 'sun', 'keep', 'healthi'],
 ['play', 'outsid', 'make', 'boy', 'healthi', 'happi']]

In [5]:
from collections import Counter
# Term frequency per document: tf(t, d) = count of t in d / number of stems in d.
# `stems` and `tf_all` are reused by the IDF and TF-IDF cells that follow.
stems = stemmingg(stop_wordss(tokenize(corpus)))

tf_all = [
    {term: occurrences / len(doc) for term, occurrences in Counter(doc).items()}
    for doc in stems
]
print(tf_all)


[{'player': 0.25, 'love': 0.25, 'run': 0.25, 'play': 0.25}, {'run': 0.25, 'sun': 0.25, 'keep': 0.25, 'healthi': 0.25}, {'play': 0.16666666666666666, 'outsid': 0.16666666666666666, 'make': 0.16666666666666666, 'boy': 0.16666666666666666, 'healthi': 0.16666666666666666, 'happi': 0.16666666666666666}]


In [6]:
import math

# Inverse document frequency: idf(t) = log10(D / df(t)), where df(t) is the
# number of documents that contain stem t.
#
# D is taken from `stems` itself, the same list the document frequencies are
# counted over, so the two can never disagree (and the preprocessing pipeline
# is not run a second time just to count documents).
D = len(stems)
print(D)

# One set per document makes each "does this document contain t?" check O(1).
doc_term_sets = [set(doc) for doc in stems]
all_terms = {word for doc in stems for word in doc}

doc_freq = {
    term: sum(1 for terms in doc_term_sets if term in terms)
    for term in all_terms
}

# Every term in doc_freq occurs in at least one document, so the divisor is >= 1.
idf = {term: math.log10(D / doc_freq[term]) for term in doc_freq}
print(idf)



3
{'player': 0.47712125471966244, 'run': 0.17609125905568124, 'play': 0.17609125905568124, 'love': 0.47712125471966244, 'make': 0.47712125471966244, 'boy': 0.47712125471966244, 'keep': 0.47712125471966244, 'sun': 0.47712125471966244, 'happi': 0.47712125471966244, 'healthi': 0.17609125905568124, 'outsid': 0.47712125471966244}


In [7]:
# TF-IDF weight of every vocabulary term in every document. A term that is
# absent from a document has tf = 0 and therefore weight 0, so each row
# carries the full vocabulary.
tf_idf = [
    {term: doc_tf.get(term, 0) * idf[term] for term in all_terms}
    for doc_tf in tf_all
]
print(tf_idf)

[{'player': 0.11928031367991561, 'run': 0.04402281476392031, 'play': 0.04402281476392031, 'love': 0.11928031367991561, 'make': 0.0, 'boy': 0.0, 'keep': 0.0, 'sun': 0.0, 'happi': 0.0, 'healthi': 0.0, 'outsid': 0.0}, {'player': 0.0, 'run': 0.04402281476392031, 'play': 0.0, 'love': 0.0, 'make': 0.0, 'boy': 0.0, 'keep': 0.11928031367991561, 'sun': 0.11928031367991561, 'happi': 0.0, 'healthi': 0.04402281476392031, 'outsid': 0.0}, {'player': 0.0, 'run': 0.0, 'play': 0.029348543175946873, 'love': 0.0, 'make': 0.07952020911994373, 'boy': 0.07952020911994373, 'keep': 0.0, 'sun': 0.0, 'happi': 0.07952020911994373, 'healthi': 0.029348543175946873, 'outsid': 0.07952020911994373}]


In [10]:
import pandas as pd

# Tabulate the TF-IDF rows: one row per document (Doc1, Doc2, ...), one column
# per stem. Values are rounded to two decimals for display only.
doc_labels = [f"Doc{position}" for position, _ in enumerate(tf_idf, start=1)]

df = pd.DataFrame(tf_idf)
df.index = doc_labels

print(df.round(2))


      player   run  play  love  make   boy  keep   sun  happi  healthi  outsid
Doc1    0.12  0.04  0.04  0.12  0.00  0.00  0.00  0.00   0.00     0.00    0.00
Doc2    0.00  0.04  0.00  0.00  0.00  0.00  0.12  0.12   0.00     0.04    0.00
Doc3    0.00  0.00  0.03  0.00  0.08  0.08  0.00  0.00   0.08     0.03    0.08


In [11]:
# Save the TF-IDF matrix, including its Doc row labels, to an Excel workbook.
# The path is relative, so the file lands in the current working directory.
df.to_excel("tf_idf_matrix.xlsx", index=True, engine='openpyxl')

In [12]:
import os
# Show the working directory, i.e. where relative output paths are resolved.
os.getcwd()

'C:\\Users\\ACER\\search engine'

In [13]:
# Same three sentences as the first cell's corpus, without the stray
# punctuation. NOTE: this rebinds the global `corpus`.
corpus = [
    "The player loves to run and play ",
    "Running in the sun keeps you healthy",
    "Playing outside makes a boy healthy and happy"
]


def indexation(corpus,x):
    """Build the TF-IDF matrix of ``corpus`` and write it to an Excel workbook.

    The documents go through tokenize -> stop_wordss -> stemmingg, then
    tf(t, d) = count(t, d) / len(d) and idf(t) = log10(D / df(t)), where D is
    the number of documents and df(t) the number of documents containing t.

    Args:
        corpus: iterable of raw document strings.
        x: path of the workbook to write (handed to ``DataFrame.to_excel``).

    Returns:
        Tuple ``(current working directory, number of distinct stems)``.
    """
    # Run the preprocessing pipeline exactly once; everything below derives
    # from this single list of stemmed documents.
    stems = stemmingg(stop_wordss(tokenize(corpus)))
    D = len(stems)

    tf_all = [
        {term: occurrences / len(doc) for term, occurrences in Counter(doc).items()}
        for doc in stems
    ]

    # Document frequency in one pass over the corpus: each document counts a
    # term at most once. dict.fromkeys de-duplicates while keeping
    # first-appearance order, so the column order is the same on every run.
    doc_freq = Counter(term for doc in stems for term in dict.fromkeys(doc))
    idf = {term: math.log10(D / n_docs) for term, n_docs in doc_freq.items()}

    # Every row carries the full vocabulary; absent terms get weight 0.
    tf_idf = [
        {term: doc_tf.get(term, 0) * term_idf for term, term_idf in idf.items()}
        for doc_tf in tf_all
    ]

    df = pd.DataFrame(tf_idf)
    df.index = [f"Doc{i+1}" for i in range(len(tf_idf))]
    df.to_excel(x, index=True, engine='openpyxl')
    return (os.getcwd(), len(df.columns))
    
# Index the toy corpus and write the matrix to tf_idf.xlsx; the call returns
# (working directory, number of distinct stems).
indexation(corpus,"tf_idf.xlsx")


('C:\\Users\\ACER\\search engine', 11)

In [12]:
import re

def read_cranfield_bodies(file_path):
    """Extract the ``.W`` body text of every record in a Cranfield-style file.

    The file is split into records on ``.I <number>`` markers. Within each
    record, everything after the first ``.W`` marker is taken as the body and
    its whitespace runs (including newlines) are collapsed to single spaces.
    Records without a ``.W`` marker are skipped.

    Args:
        file_path: path of the collection file; it is read as UTF-8 and
            undecodable bytes are ignored.

    Returns:
        List of body strings, in file order.
    """
    with open(file_path, "r", encoding="utf-8", errors="ignore") as handle:
        raw_text = handle.read()

    bodies = []
    for chunk in re.split(r"\.I\s+\d+", raw_text):
        chunk = chunk.strip()
        if not chunk:
            # Text before the first marker, or an empty record.
            continue
        found = re.search(r"\.W\s*(.*)", chunk, re.S)
        if found is None:
            continue
        bodies.append(re.sub(r"\s+", " ", found.group(1).strip()))
    return bodies

# Load the document bodies of the Cranfield collection.
# NOTE(review): absolute, machine-specific path; this cell fails on any other
# machine. Consider a path relative to a configurable data directory.
cor = read_cranfield_bodies(r"C:\Users\ACER\search engine\cran.all.1400")


In [None]:
# Index the Cranfield bodies and write the TF-IDF matrix to tf.xlsx.
# NOTE(review): needs the two-argument indexation(corpus, x); a later cell
# redefines indexation with a single parameter, so this call only works when
# it runs before that cell.
x="tf.xlsx"
indexation(cor,x)

In [None]:
# Parse "trec.txt" (resolved against the working directory) with the same reader.
# NOTE(review): presumably this file uses the same .I / .W markers — confirm.
co = read_cranfield_bodies(r"trec.txt")


In [None]:
# Index the parsed documents into res.xlsx (needs the two-argument indexation).
indexation(co,"res.xlsx")

In [14]:
# Two-document keyword corpus used for the boolean-query scoring cells below.
# NOTE: this rebinds the global `corpus` once more.
corpus = [
    "langage java informatique php java langage php java ",
    "langage java php langage "]

In [15]:

def indexation(req):
    """Return the term-frequency table of every document in ``req``.

    Each document is tokenized, stop-word filtered and stemmed; the frequency
    of a stem is its number of occurrences divided by the number of stems in
    that document.

    NOTE: this definition shadows the earlier two-argument
    ``indexation(corpus, x)``; once this cell has run, calls with two
    arguments raise ``TypeError``.

    Args:
        req: iterable of raw document strings.

    Returns:
        List with one ``{stem: frequency}`` dict per document.
    """
    term_frequencies = []
    for tokens in stemmingg(stop_wordss(tokenize(req))):
        size = len(tokens)
        term_frequencies.append(
            {stem: occurrences / size for stem, occurrences in Counter(tokens).items()}
        )
    return term_frequencies
# Term-frequency table of the keyword corpus; get_term() reads it as a global.
CORPUS=indexation(corpus)    
CORPUS

[{'langag': 0.25, 'java': 0.375, 'informatiqu': 0.125, 'php': 0.25},
 {'langag': 0.5, 'java': 0.25, 'php': 0.25}]

In [44]:
def get_term(term,docnum):
    """Return the weight of ``term`` in document ``docnum`` of the global CORPUS.

    Args:
        term: stem to look up, in the same stemmed form as the CORPUS keys
            (e.g. ``'langag'``).
        docnum: zero-based index of the document in CORPUS.

    Returns:
        The stored weight, or ``0.0`` when the term does not occur in that
        document. An absent term contributes nothing, and a numeric result
        keeps the ``min``/``max`` combinations in the boolean scorer working.

    Raises:
        IndexError: if ``docnum`` is outside CORPUS.
    """
    # Direct dict lookup instead of scanning every key of the document.
    return CORPUS[docnum].get(term, 0.0)
# Weight of the stem 'langag' in the first document of CORPUS.
get_term('langag',0)        

0.25

In [50]:
req="langag AND java OR php"
def similrite(req,d):
    """Score document ``d`` against a boolean query using min/max combination.

    The query is a whitespace-separated sequence of stems joined by the
    upper-case operators ``AND`` and ``OR``. ``AND`` combines its two operands
    with ``min`` and is reduced first (it binds tighter); ``OR`` combines with
    ``max``. Operand weights come from ``get_term``; a stem that is absent
    from the document counts as weight 0.0 instead of raising.

    Args:
        req: query string, e.g. ``"langag AND java OR php"``.
        d: document index handed to ``get_term``.

    Returns:
        The list of tokens left after reduction. For a well-formed query this
        is a one-element list holding the score; an empty query gives ``[]``.
    """
    tokens = req.split()

    # Precedence: collapse every AND before any OR.
    _reduce_operator(tokens, "AND", min, d)
    _reduce_operator(tokens, "OR", max, d)

    # A query with no operator (a single stem) is still scored rather than
    # handed back as the raw token.
    if len(tokens) == 1 and isinstance(tokens[0], str) and tokens[0] not in ("AND", "OR"):
        tokens[0] = _operand_weight(tokens[0], d)
    return tokens


def _operand_weight(operand, d):
    """Return a numeric weight: look a stem up, pass an earlier result through."""
    if isinstance(operand, str):
        weight = get_term(operand, d)
        # get_term may report an absent stem as None; treat that as weight 0.
        return 0.0 if weight is None else weight
    return operand


def _reduce_operator(tokens, operator, combine, d):
    """Collapse, in place and left to right, each ``a <operator> b`` into combine(a, b)."""
    position = 1
    # An operator needs a neighbour on both sides, so the first and last
    # positions are never treated as operators.
    while position < len(tokens) - 1:
        if tokens[position] == operator:
            left = _operand_weight(tokens[position - 1], d)
            right = _operand_weight(tokens[position + 1], d)
            tokens[position - 1:position + 2] = [combine(left, right)]
            # The result now sits at position - 1, so `position` already
            # points at the token that followed the right operand.
        else:
            position += 1
  
# Evaluate the sample query against document 0 of CORPUS.
print(similrite(req,0))

[0.25]
