In [2]:
import pandas as pd
from sklearn.metrics import jaccard_score
from sklearn.metrics import classification_report
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [19]:
# Load each model's predicted topic -> content correlations together with the
# ground-truth correlations they will be scored against.
result_tf_idf, result_doc2vec, correlation = (
    pd.read_csv(csv_name)
    for csv_name in ("results_tf_idf.csv", "results_doc2vec.csv", "ground_truth.csv")
)

In [20]:
# Peek at the first five TF-IDF predictions (head() defaults to 5 rows).
result_tf_idf.head()

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_0feaaa5dc39d c_1108dd0c7a5d c_dfa229bd21df
1,t_00068291e9a4,c_035baf9425e0 c_0e9282e0fc8e c_530d7272fb7b c...
2,t_00069b63a70a,c_11a1dc0bfb99 c_3695c5dc1df6 c_675d0cbdb501 c...
3,t_0006d41a73a8,c_02261c51c9d6 c_068fc388295f c_0907f40037c6 c...
4,t_000feba42136,c_304ee4f59410 c_ec6416377746


In [21]:
# Peek at the first five Doc2Vec predictions (head() defaults to 5 rows).
result_doc2vec.head()

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_0feaaa5dc39d c_dfa229bd21df
1,t_00068291e9a4,c_035baf9425e0 c_399f168ecbc6 c_639ea2ef9c95 c...
2,t_00069b63a70a,c_041e5b1c3d6d c_085bdb40bc20 c_0962c05cc441 c...
3,t_0006d41a73a8,c_08f2c6c824f5 c_0c6473c3480d c_1778d4146e9c c...
4,t_000feba42136,c_0235b1b1c100 c_03ef55db4beb c_04c84d96a5ac c...


In [22]:
# Peek at the first five ground-truth rows (head() defaults to 5 rows).
correlation.head()

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_0c6473c3480d c_1c57a1316568 c_5e375cf14c47 c...
4,t_000feba42136,c_2bbc650030f4 c_304ee4f59410


In [23]:
# Pair every model's predictions with the ground truth on 'topic_id'.
# Both inputs carry a 'content_ids' column, so the suffixes turn them into
# 'content_ids_predicted' (model output) and 'content_ids_actual' (ground truth).
merged_df_doc2vec = pd.merge(
    result_doc2vec,
    correlation,
    on='topic_id',
    suffixes=('_predicted', '_actual'),
)
merged_df_tf_idf = pd.merge(
    result_tf_idf,
    correlation,
    on='topic_id',
    suffixes=('_predicted', '_actual'),
)

In [24]:
# Row-wise scorer: precision and recall of the predicted content ids for one topic.
def calculate_precision_recall(row):
    """Score one merged row and return ``(precision, recall)`` as percentages.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing the keys
            'content_ids_predicted' and 'content_ids_actual'. Each value is a
            whitespace-separated string of content ids; any non-string value
            is treated as an empty set of ids.

    Returns:
        Tuple ``(precision, recall)``, each on a 0-100 scale. A metric whose
        denominator is zero (nothing predicted / nothing expected) is 0.0.
    """
    def _id_set(cell):
        # Duplicated ids collapse because the comparison is set-based.
        return set(cell.split()) if isinstance(cell, str) else set()

    predicted_ids = _id_set(row['content_ids_predicted'])
    actual_ids = _id_set(row['content_ids_actual'])

    hits = len(predicted_ids & actual_ids)                  # true positives
    n_predicted = hits + len(predicted_ids - actual_ids)    # TP + FP
    n_actual = hits + len(actual_ids - predicted_ids)       # TP + FN

    precision = hits / n_predicted * 100 if n_predicted > 0 else 0.0
    recall = hits / n_actual * 100 if n_actual > 0 else 0.0
    return precision, recall

In [25]:
# Score every row of both merged frames. apply() yields one (precision, recall)
# tuple per row; zip(*...) transposes those into one sequence per metric, which
# is then stored as a new column on the same frame.
for scored_df in (merged_df_doc2vec, merged_df_tf_idf):
    precision_values, recall_values = zip(*scored_df.apply(calculate_precision_recall, axis=1))
    scored_df['precision'] = precision_values
    scored_df['recall'] = recall_values


In [26]:
# Overall scores per model: average the per-topic precision and recall, then
# combine the two averages into an F2 score (F-beta with beta = 2).
def _f2_measure(precision, recall):
    """Return 5PR / (4P + R), the F-beta score for beta = 2 (recall-weighted)."""
    return 5 * precision * recall / (4 * precision + recall)


overall_precision_doc2vec = merged_df_doc2vec['precision'].mean()
overall_recall_doc2vec = merged_df_doc2vec['recall'].mean()
f2measure_doc2vec = _f2_measure(overall_precision_doc2vec, overall_recall_doc2vec)

overall_precision_tf_idf = merged_df_tf_idf['precision'].mean()
overall_recall_tf_idf = merged_df_tf_idf['recall'].mean()
f2measure_tf_idf = _f2_measure(overall_precision_tf_idf, overall_recall_tf_idf)


In [27]:
# Expose the overall scores under explicit "*_percent" names.
# calculate_precision_recall already multiplies every per-row precision and
# recall by 100, so their means are percentages as they stand, and so is the
# F2 score derived from them (the formula scales linearly with its inputs).
# Multiplying by 100 a second time would inflate each value 100-fold
# (e.g. 8.41 % would be reported as 840.96 %), so no rescaling is applied.
overall_precision_doc2vec_percent = overall_precision_doc2vec
overall_recall_doc2vec_percent = overall_recall_doc2vec
f2measure_doc2vec_percent = f2measure_doc2vec

overall_precision_tf_idf_percent = overall_precision_tf_idf
overall_recall_tf_idf_percent = overall_recall_tf_idf
f2measure_tf_idf_percent = f2measure_tf_idf

In [28]:
# Doc2Vec: overall precision, recall and F2 (all already on a 0-100 scale).
print(
    "precision: ", overall_precision_doc2vec,
    "recall: ", overall_recall_doc2vec,
    'F2 measure: ', f2measure_doc2vec,
)

precision:  8.40955031992329 recall:  52.555590526651876 F2 measure:  25.638092597211145


In [29]:
# TF-IDF: overall precision, recall and F2 (all already on a 0-100 scale).
print(
    "precision: ", overall_precision_tf_idf,
    " recall: ", overall_recall_tf_idf,
    'F2 measure: ', f2measure_tf_idf,
)

precision:  15.076720339670432  recall:  56.67478536950782 F2 measure:  36.52152996384902


In [30]:
# One summary record per model, keyed by the column headers of the final table.
data = [
    dict(zip(("Model", "Precision", "Recall", "F2 Measure"), model_scores))
    for model_scores in (
        ("Doc2Vec", overall_precision_doc2vec, overall_recall_doc2vec, f2measure_doc2vec),
        ("TF-IDF", overall_precision_tf_idf, overall_recall_tf_idf, f2measure_tf_idf),
    )
]

In [31]:
# Tabulate the per-model records: one row per model, one column per metric.
# NOTE: "performace" is a misspelling of "performance"; the name is kept
# because the display cell that follows refers to it.
performace_df = pd.DataFrame(data=data)


In [32]:
# Show the side-by-side comparison of both models' precision, recall and F2.
performace_df

Unnamed: 0,Model,Precision,Recall,F2 Measure
0,Doc2Vec,8.40955,52.555591,25.638093
1,TF-IDF,15.07672,56.674785,36.52153
