In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import confusion_matrix, recall_score, precision_score, accuracy_score
import seaborn as sns
from collections import Counter
import re

In [None]:
# Spreadsheet location on the author's machine; edit this path to run elsewhere.
DATASET_PATH = r"C:\Users\Dnin\Desktop\bidnet_dataset.xlsx"

# Read the workbook into the working frame and preview its first rows.
df = pd.read_excel(DATASET_PATH)
df.head()

In [None]:
# Sentence-embedding checkpoint used to encode both keywords and descriptions.
# 'multi-qa-mpnet-base-dot-v1' is the alternative checkpoint that was tried.
MODEL_NAME = 'all-mpnet-base-v2'
model = SentenceTransformer(MODEL_NAME)

In [None]:
# Candidate tags: each phrase is embedded and compared with the bid descriptions.
# Compound entries from an earlier draft ("licensing and permits",
# "lightning experience, lightning migration", "rpa ; robotic process automation",
# "application development and maintenance") are split or shortened here so that
# every phrase gets its own embedding.
keywords = [
    "digital transformation",
    "artificial intelligence",
    "machine learning",
    "analytics",
    "data engineering",
    "salesforce",
    "grants",
    "licensing",
    "permits",
    "lightning experience",
    "lightning migration",
    "case management",
    "data warehouse",
    "automation",
    "rpa",
    "robotic process automation",
    "consultancy",
    "advisory",
    "application development",
]

# Encode the whole list in one batched call: `encode` accepts a list of
# sentences and returns one embedding row per keyword, in input order.
keyword_embeddings = np.asarray(model.encode(keywords))

In [None]:
def calculate_cosine_similarity(description_embedding, labels=None, label_embeddings=None):
    """Score one description embedding against every keyword embedding.

    Args:
        description_embedding: 1-D embedding vector of a single description.
        labels: Optional keyword strings to score against. Defaults to the
            notebook-level ``keywords`` list.
        label_embeddings: Optional embeddings aligned with ``labels`` (one row
            per keyword). Defaults to the notebook-level ``keyword_embeddings``.

    Returns:
        dict mapping each keyword to its cosine similarity with the
        description, in keyword order.
    """
    if labels is None:
        labels = keywords
    if label_embeddings is None:
        label_embeddings = keyword_embeddings

    # Nothing to compare against; sklearn would reject an empty matrix.
    if len(labels) == 0 or len(label_embeddings) == 0:
        return {}

    # A single batched call yields an (n_keywords, 1) column of scores, which
    # avoids re-validating the inputs once per keyword.
    scores = cosine_similarity(label_embeddings, [description_embedding])[:, 0]
    return dict(zip(labels, scores))

def predict_tags(row, model, keywords, keyword_embeddings, threshold, top_n=3):
    """Pick the keywords that best match a bid description.

    Args:
        row: Mapping or Series with a 'Description' entry holding the bid text.
        model: Encoder exposing ``encode(text)`` that returns an embedding vector.
        keywords: Candidate tag strings.
        keyword_embeddings: Embeddings aligned with ``keywords`` (one per keyword).
        threshold: Similarity a keyword must exceed (strictly) to be kept.
            Earlier notes in this function suggested 0.3 for the first model
            and around 0.6 for the second one.
        top_n: Maximum number of tags to return (default 3).

    Returns:
        Tuple ``(tags, word_count)``: the kept keywords ordered from most to
        least similar, and the number of ``\\w+`` tokens in the description.
    """
    description = row['Description']

    # A missing description (e.g. NaN for an empty spreadsheet cell) or a blank
    # one cannot be tokenised or meaningfully embedded, so it gets no tags.
    if not isinstance(description, str) or not description.strip():
        return [], 0

    number_of_words = len(re.findall(r'\b\w+\b', description))

    if len(keywords) == 0:
        return [], number_of_words

    description_embedding = model.encode(description)

    # Score against the keywords/embeddings that were passed in, rather than
    # whatever happens to be bound to the notebook globals of the same name.
    scores = cosine_similarity(keyword_embeddings, [description_embedding])[:, 0]
    similarities_tags = dict(zip(keywords, scores))

    # Keep only keywords above the threshold, best match first.
    filtered_sorted_tags = sorted(
        ((keyword, score) for keyword, score in similarities_tags.items() if score > threshold),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top_tags = [keyword for keyword, _ in filtered_sorted_tags[:top_n]]

    return top_tags, number_of_words

def check_relevance(row):
    """Return "Yes" when the row has at least one predicted tag, otherwise "No"."""
    if len(row['Most Relevant Tags']) > 0:
        return "Yes"
    return "No"

In [None]:
# Requires `model`, `keywords` and `keyword_embeddings` to be defined already.
# Each row yields (top tags, word count); 'expand' spreads that pair over two columns.
SIMILARITY_THRESHOLD = 0.3
tag_predictions = df.apply(
    predict_tags,
    axis=1,
    result_type='expand',
    args=(model, keywords, keyword_embeddings, SIMILARITY_THRESHOLD),
)
df[['Most Relevant Tags', 'Word Count']] = tag_predictions

In [None]:
# A bid counts as predicted-relevant when at least one tag was assigned to it.
predicted_relevance = df.apply(check_relevance, axis=1)
df['Predicted Relevance'] = predicted_relevance
df

In [None]:
# Pull out the bids whose description exceeds 350 words for a closer look.
WORD_COUNT_CUTOFF = 350
is_long_bid = df['Word Count'] > WORD_COUNT_CUTOFF
filtered_df = df.loc[is_long_bid]
filtered_df

In [None]:
# Histogram (10 bins) of description word counts with a KDE curve overlaid.
word_count_ax = sns.histplot(df['Word Count'], bins=10, kde=True)
word_count_ax.set_xlabel('Word Count')
word_count_ax.set_ylabel('Bids frequency')
word_count_ax.set_title('Distribution of Word Count Values')
plt.show()

In [None]:
# Count how often each keyword appears in the dataset's 'Tags' column.
# On a top-to-bottom run 'Tags' is still a raw comma-separated string at this
# point (it is only split into lists in a later cell), so split it here;
# iterating the string directly would count single characters. Lists are
# accepted too, so the cell also works when re-run after that split.
tag_list = []
for tags in df['Tags']:
    if isinstance(tags, str):
        tags = [] if tags == "None" else tags.split(',')
    elif not isinstance(tags, (list, tuple)):
        tags = []  # e.g. NaN from an empty cell
    tag_list.extend(tag.strip() for tag in tags)
keyword_counts = Counter(tag_list)

# Frequencies in the same order as `keywords`; keywords never seen count as 0.
keyword_frequencies = [keyword_counts[keyword] for keyword in keywords]

# Horizontal bars: keywords run along the y-axis, counts along the x-axis.
plt.figure(figsize=(10, 6))
plt.barh(keywords, keyword_frequencies)
plt.xlabel('Frequency')
plt.ylabel('Keywords')
plt.title('Keyword Frequency Distribution in Tags')
plt.show()

In [None]:
# Class balance of the reference 'Relevant?' labels (used as y_true by the metrics).
actual_relevance_counts = df['Relevant?'].value_counts()
actual_relevance_counts

In [None]:
# Class balance of the model's 'Predicted Relevance' labels.
predicted_relevance_counts = df['Predicted Relevance'].value_counts()
predicted_relevance_counts

In [None]:
def _split_tags(raw_tags):
    """Turn a raw 'Tags' cell into a list of whitespace-trimmed tag strings."""
    # Already parsed (the cell was re-run): return the list so a re-run is a no-op.
    if isinstance(raw_tags, list):
        return raw_tags
    # The literal string "None" marks a bid without tags; non-string values
    # (e.g. NaN from an empty cell) carry no tags either.
    if not isinstance(raw_tags, str) or raw_tags == "None":
        return []
    return [tag.strip() for tag in raw_tags.split(',') if tag.strip()]


# Replace the comma-separated tag strings with lists of tags.
df['Tags'] = df['Tags'].apply(_split_tags)

In [None]:
# Count how often each keyword was assigned as a predicted tag across all bids.
tag_list = [tag for tags in df['Most Relevant Tags'] for tag in tags]
keyword_counts = Counter(tag_list)

# Frequencies in the same order as `keywords`; keywords never predicted count as 0.
keyword_frequencies = [keyword_counts[keyword] for keyword in keywords]

# Horizontal bars: keywords run along the y-axis, counts along the x-axis.
# The title names the predicted tags so this figure is distinguishable from the
# one built from the dataset's own 'Tags' column.
plt.figure(figsize=(10, 6))
plt.barh(keywords, keyword_frequencies)
plt.xlabel('Frequency')
plt.ylabel('Keywords')
plt.title('Keyword Frequency Distribution in Predicted Tags')
plt.show()

In [None]:
def _relevance_to_bool(value):
    """Map 'Yes'/'No' to True/False, passing through values that are already boolean."""
    if isinstance(value, (bool, np.bool_)):
        return bool(value)
    # Any other value becomes NaN, as a plain dict-based Series.map would produce.
    return {'Yes': True, 'No': False}.get(value, np.nan)


# Convert both label columns to booleans. Booleans pass through unchanged, so
# re-running this cell no longer wipes the columns to NaN.
for column in ('Relevant?', 'Predicted Relevance'):
    df[column] = df[column].map(_relevance_to_bool)

# 'Relevant?' is the reference label (y_true); 'Predicted Relevance' is the prediction.
precision = precision_score(df['Relevant?'], df['Predicted Relevance'])
recall = recall_score(df['Relevant?'], df['Predicted Relevance'])
accuracy = accuracy_score(df['Relevant?'], df['Predicted Relevance'])

print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)

In [None]:
# Cross-tabulate reference labels against predictions.
conf_mat = confusion_matrix(
    y_true=df['Relevant?'],
    y_pred=df['Predicted Relevance'],
)
conf_mat

In [None]:
# Draw the confusion matrix as a heatmap annotated with integer counts.
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set_title('Confusion Matrix')
heatmap_ax.set_xlabel('Predicted Relevance')
heatmap_ax.set_ylabel('Actual Relevance')
plt.show()

In [None]:
# False negatives: bids labelled relevant for which no tag was predicted.
is_actually_relevant = df['Relevant?'].eq(True)
is_predicted_irrelevant = df['Predicted Relevance'].eq(False)
filtered_df = df.loc[is_actually_relevant & is_predicted_irrelevant]
for column in ('Description', 'Tags'):
    print(filtered_df[column])

In [None]:
# Lift the column-width limit so long text cells are displayed in full.
pd.options.display.max_colwidth = None

In [None]:
# Look up one specific bid by its exact title.
has_target_title = df["Title"].eq("DIVISION OF HOUSING LOAN SERVICING SOFTWARE NEEDED")
df.loc[has_target_title]

In [None]:
# Display the encoder's `max_seq_length` setting.
# NOTE(review): presumably the token limit beyond which inputs are truncated, which
# would matter for the long bids (> 350 words) filtered earlier — confirm against
# the sentence-transformers documentation.
model.max_seq_length