In [1]:
from imports import *
from PreProcessor import *
import os
import numpy as np

In [2]:

CodeTokens = set()
UCTokens = set()
UC_documents = list()
code_documents = list()
TotalTokens=set()
entropy_uc = []
entropy_code = []
variance_uc = {}
variance_code = {}
SCQ_uc={}
SCQ_code={}
term_co_occurrences_uc = {}
PMI_uc = {}
term_co_occurrences_code= {}
PMI_code = {}


# Q, the set of query terms; q, a term in the query; D, the set of documents in the collection;
# Dt, the set of documents containing term t
# d, a document in the document collection D;
# tf(t, D), the frequency of term t in all docs;
# tf(t, d), the frequency of term t in d;
# tf(t, Q), the frequency of term t in the query;
# sim(di, dj), the cosine similarity between the vector-space representations of di and dj
# idf(t) = log( |D|/|Dt|).
# ictf(t) = log( |D|/tf(t,D) ).




for filename in os.listdir("./UC"):
    filepath = os.path.join("./UC", filename)
    tokens = PreProcessor().UCPreProcessor(filepath)
    UC_documents.append(tokens)
    UCTokens.update(tokens.split())


for filename in os.listdir("./CC"):
    filepath = os.path.join("./CC", filename)
    tokens = PreProcessor().CodePreProcessor(filepath)
    code_documents.append(tokens)
    CodeTokens.update(tokens.split())

TotalTokens = CodeTokens.union(UCTokens)


vectorizer_uc = TfidfVectorizer(vocabulary=TotalTokens)
tf_matrix_uc=vectorizer_uc.fit_transform(UC_documents)
idf_uc = vectorizer_uc.idf_



df_uc = np.sum(tf_matrix_uc > 0, axis=0).A1

total_documents_uc = len(code_documents)
ictf_uc = np.log(total_documents_uc / (df_uc + 1))

for term_index, term in enumerate(vectorizer_uc.get_feature_names_out()):
    tf_term_doc = tf_matrix_uc[:, term_index].toarray().sum()+1  #tf(t,d)
    SCQ_uc[term] = (1 + np.log(tf_term_doc)) * idf_uc[term_index]



for term_index, term in enumerate(vectorizer_uc.get_feature_names_out()):
    term_entropy = 0
    for doc in UC_documents:
        tf_term_doc = doc.count(term)
        tf_term_collection = df_uc[term_index]
        
        smoothing_factor = 1  
        tf_term_doc = tf_term_doc + smoothing_factor
        tf_term_collection = tf_term_collection + smoothing_factor
        term_entropy += (tf_term_doc / tf_term_collection) * np.log((tf_term_doc / tf_term_collection)+1)
    entropy_uc.append(term_entropy)


for term_index, term in enumerate(vectorizer_uc.get_feature_names_out()):
    term_weights = []
    for doc_index, doc in enumerate(UC_documents):
        tf_term_doc = doc.count(term)
        weight_term_doc = (1 / len(doc)) * np.log(1 + tf_term_doc) * idf_uc[term_index]
        term_weights.append(weight_term_doc)

    avg_weight_term = np.mean(term_weights)
    variance_term = np.mean([(weight - avg_weight_term) ** 2 for weight in term_weights])
    variance_uc[term] = variance_term


for doc in UC_documents:
  
    unique_terms = set(doc)
    for term1 in unique_terms:
        for term2 in unique_terms:
            if term1 != term2:
                term_co_occurrences_uc[(term1, term2)] = term_co_occurrences_uc.get((term1, term2), 0) + 1




for term_pair, co_occurrence_count in term_co_occurrences_uc.items():
    term1, term2 = term_pair
    pt1_t2_D = co_occurrence_count / len(UC_documents)
    pt1_D = sum(1 for doc in UC_documents if term1 in doc) / len(UC_documents)
    pt2_D = sum(1 for doc in UC_documents if term2 in doc) / len(UC_documents)
    pt_D = pt1_D * pt2_D

    if pt_D != 0:
        PMI_uc[term_pair] = np.log(pt1_t2_D / pt_D)





vectorizer_code = TfidfVectorizer(vocabulary=TotalTokens)
tf_matrix_code=vectorizer_code.fit_transform(code_documents)
idf_code = vectorizer_code.idf_
df_code = np.sum(tf_matrix_code > 0, axis=0).A1
total_documents_code = len(code_documents)
ictf_code = np.log(total_documents_code / (df_code + 1))


for term_index, term in enumerate(vectorizer_code.get_feature_names_out()):
    tf_term_doc = tf_matrix_code[:, term_index].toarray().sum()+1
    SCQ_code[term] = (1 + np.log(tf_term_doc)) * idf_code[term_index]


for term_index, term in enumerate(vectorizer_code.get_feature_names_out()):
    term_entropy = 0
    for doc in code_documents:
        tf_term_doc = doc.count(term)
        tf_term_collection = df_code[term_index]

        smoothing_factor = 1  
        tf_term_doc = tf_term_doc + smoothing_factor
        tf_term_collection = tf_term_collection + smoothing_factor
        term_entropy += (tf_term_doc / tf_term_collection) * np.log((tf_term_doc / tf_term_collection)+1)
    entropy_code.append(term_entropy)


for term_index, term in enumerate(vectorizer_code.get_feature_names_out()):
    term_weights = []
    for doc_index, doc in enumerate(code_documents):
        tf_term_doc = doc.count(term)
        weight_term_doc = (1 / len(doc)) * np.log(1 + tf_term_doc) * idf_code[term_index]
        term_weights.append(weight_term_doc)

    avg_weight_term = np.mean(term_weights)
    variance_term = np.mean([(weight - avg_weight_term) ** 2 for weight in term_weights])
    variance_code[term] = variance_term




for doc in code_documents:
    unique_terms = set(doc)
    for term1 in unique_terms:
        for term2 in unique_terms:
            if term1 != term2:
                term_co_occurrences_code[(term1, term2)] = term_co_occurrences_code.get((term1, term2), 0) + 1



for term_pair, co_occurrence_count in term_co_occurrences_code.items():
    term1, term2 = term_pair
    pt1_t2_D = co_occurrence_count / len(code_documents)
    pt1_D = sum(1 for doc in code_documents if term1 in doc) / len(code_documents)
    pt2_D = sum(1 for doc in code_documents if term2 in doc) / len(code_documents)
    pt_D = pt1_D * pt2_D

    if pt_D != 0:
        PMI_code[term_pair] = np.log(pt1_t2_D / pt_D)


def AvgIDF(idf_values):
    return np.mean(idf_values)

def MaxIDF(idf_values):
    return np.max(idf_values)

def DevIDF(idf_values):
    avg_idf =AvgIDF(idf_values)
    diffs = [(idf - avg_idf) for idf in idf_values]
    return np.sqrt(sum(diffs) / len(diffs))

def AvgICTF(ictf_values):
     return np.mean(ictf_values)

def MaxICTF(ictf_values):
    return max(ictf_values)


def DevICTF(ictf_values):
    avg_ictf =AvgICTF(ictf_values)
    diffs = [(ictf - avg_ictf) for ictf in ictf_values]
    return math.sqrt(sum(diffs) / len(diffs))




def AvgEntropy(entropy_values):
    return np.mean(entropy_values)


def MedEntropy(entropy_values):
    return np.median(entropy_values)

def MaxEntropy(entropy_values):
    return max(entropy_values)

def DevEntropy(entropy_values):
    avg_entropy =AvgEntropy(entropy_values)
    diffs = [(entropy - avg_entropy)** 2 for entropy in entropy_values] #made an assumption en fe square l2n fe negative values w bgd i no longer know gaya mnen , i can aslo asumme abs bs msh 3aref
    if(np.mean(diffs)<0):
        print("NEGATIVE A3AAAAAAAAAA")
        print(np.mean(diffs))
    return math.sqrt(np.mean(diffs))



def QS(Tokens,Docuemnts):           #so not sure of it wm3mltsh el SCS wel CS brdo yooh
    documents_with_query_terms=0
    for document in Docuemnts:
        contains_query_term = any(term in document for term in Tokens)
        if contains_query_term:
            documents_with_query_terms += 1
    query_scope = documents_with_query_terms / len(Docuemnts)

    return query_scope

 
def AvgVAR(var_values):
    return sum(list(var_values.values()))/len(var_values)

def MaxVAR(var_values):
    return max(var_values.values())

def SumVAR(var_values):
    return sum(var_values.values()) 



def AvgSCQ(scq_values):
    return sum(list(scq_values.values()))/len(scq_values) 

def MaxSCQ(scq_values):
    return max(scq_values.values())

def SumSCQ(scq_values):
    return sum(scq_values.values()) 


def AvgPMI(PMI_values):
    num_terms = len(PMI_values)
    return 2 * sum(PMI_values.values()) * math.exp(math.lgamma(num_terms - 1) - math.lgamma(num_terms))


def MaxPMI(PMI_values):
    return max(PMI_values.values())




print("AvgIDF:", AvgIDF(idf_uc))
print("MaxIDF:", MaxIDF(idf_uc))
print("DevIDF:", DevIDF(idf_uc))
print("AvgICTF:", AvgICTF(ictf_uc))
print("MaxICTF:", MaxICTF(ictf_uc))
print("DevICTF:", DevICTF(ictf_uc))
#print(entropy_uc)
print("AvgEntropy:", AvgEntropy(entropy_uc))
print("MedEntropy:", MedEntropy(entropy_uc))
print("MaxEntropy:", MaxEntropy(entropy_uc))
print("DevEntropy:", DevEntropy(entropy_uc))

print("QS:", QS(UCTokens,UC_documents))


print("AvgVAR:", AvgVAR(variance_uc))
print("MaxVAR:", MaxVAR(variance_uc))
print("SumVAR:", SumVAR(variance_uc))
print("AvgSCQ:", AvgSCQ(SCQ_uc))
print("MaxSCQ:", MaxSCQ(SCQ_uc))
print("SumSCQ:", SumSCQ(SCQ_uc))
#print(PMI_uc)
print("AvgPMI:", AvgPMI(PMI_uc))
print("MaxPMI:", MaxPMI(PMI_uc))


print("AvgIDF for Code:", AvgIDF(idf_code))
print("MaxIDF for Code:", MaxIDF(idf_code))
print("DevIDF for Code:", DevIDF(idf_code))
print("AvgICTF for Code:", AvgICTF(ictf_code))
print("MaxICTF for Code:", MaxICTF(ictf_code))
print("DevICTF for Code:", DevICTF(ictf_code))


#print(entropy_code)
print("AvgEntropy for Code:", AvgEntropy(entropy_code))
print("MedEntropy for Code:", MedEntropy(entropy_code))
print("MaxEntropy for Code:", MaxEntropy(entropy_code))
print("DevEntropy for Code:", DevEntropy(entropy_code))

print("QS for Code:", QS(CodeTokens,code_documents))

print("AvgVAR for Code:", AvgVAR(variance_code))
print("MaxVAR for Code:", MaxVAR(variance_code))
print("SumVAR for Code:", SumVAR(variance_code))

print("AvgSCQ for Code:", AvgSCQ(SCQ_code))
print("MaxSCQ for Code:", MaxSCQ(SCQ_code))
print("SumSCQ for Code:", SumSCQ(SCQ_code))


print("AvgPMI for Code:", AvgPMI(PMI_code))
print("MaxPMI for Code:", MaxPMI(PMI_code))




AvgIDF: 4.82526895044485
MaxIDF: 5.07753744390572
DevIDF: 1.585410059886101e-08
AvgICTF: 4.501321697645494
MaxICTF: 4.7535901911063645
DevICTF: 3.676128774581396e-08
AvgEntropy: 44.00564850716837
MedEntropy: 40.20253647247681
MaxEntropy: 923.3237116658274
DevEntropy: 54.026552915509114
QS: 1.0
AvgVAR: 5.050796994833571e-06
MaxVAR: 0.00015949586034741763
SumVAR: 0.009030825026762425
AvgSCQ: 5.078069476532483
MaxSCQ: 8.113287403020829
SumSCQ: 9079.588224040079
AvgPMI: 0.007684088421279792
MaxPMI: 0.5050949490570055
AvgIDF for Code: 4.371698719030342
MaxIDF for Code: 5.762173934797756
DevIDF for Code: 1.018918609432195e-08
AvgICTF for Code: 3.363114975338951
MaxICTF for Code: 4.7535901911063645


ValueError: math domain error

In [3]:
print(idf_uc.shape)

(1788,)
