In [None]:
import pandas as pd
import openai
import os
import time
import re
import csv
import tiktoken
import timeout_decorator
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import pickle
import nltk
import ast
from openai import OpenAI
from dotenv import load_dotenv
from apify_client import ApifyClient
from nltk.corpus import stopwords
from collections import deque
from urllib.parse import urlparse
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from keybert import KeyBERT
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Load credentials from a local .env file into the process environment.
load_dotenv()
openai.api_key = os.getenv("OPENAI_KEY")
#openai.api_type = "azure"
#openai.api_version = "2023-07-01-preview"
#openai.api_base = os.getenv("AZURE_BASE")
#openai.api_key = os.getenv("AZURE_KEY")

# OpenAI() only auto-discovers the OPENAI_API_KEY environment variable and
# ignores the module-level `openai.api_key`, so hand it the key explicitly.
# If OPENAI_KEY is unset this passes None and the client falls back to
# OPENAI_API_KEY as before.
client = OpenAI(api_key=openai.api_key)
optimal_k = 3 #NUMBER OF CLUSTERS

In [None]:
def get_embedding(text):
    """Return the embedding vector for ``text``.

    Newlines in ``text`` are replaced by single spaces, then the string is
    sent to the embeddings endpoint of the module-level ``client`` using the
    "text-embedding-3-small" model.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    single_line = " ".join(text.split("\n"))
    result = client.embeddings.create(
        model="text-embedding-3-small",
        input=single_line,
    )
    first_item = result.data[0]
    return first_item.embedding

def search(df, text, n=3, pprint=True):
    """Return the ``n`` rows of ``df`` whose embeddings are most similar to ``text``.

    Args:
        df: DataFrame with an ``embedding`` column holding one vector per row.
        text: Query string; it is embedded with ``get_embedding``.
        n: Number of top rows to return.
        pprint: Accepted for backward compatibility; currently unused.

    Returns:
        The top ``n`` rows of ``df`` sorted by descending cosine similarity.

    Side effects:
        Writes a ``similarity`` column onto ``df`` (as the original did). The
        values are plain floats rather than 1x1 arrays so they sort and
        display as scalars.
    """
    query = np.asarray(get_embedding(text), dtype=float).reshape(1, -1)

    if df.empty:
        # np.vstack cannot stack zero rows; keep the empty-frame behaviour.
        df['similarity'] = pd.Series(dtype=float)
        return df.head(n)

    # One vectorised call instead of one cosine_similarity call per row.
    doc_matrix = np.vstack(df['embedding'].tolist())
    df['similarity'] = cosine_similarity(doc_matrix, query).ravel()

    return df.sort_values('similarity', ascending=False).head(n)

def sim(text, target):
    """Cosine similarity between the embedding of ``text`` and ``target``.

    Args:
        text: String to embed with ``get_embedding``.
        target: A single embedding vector (1-D) or a stack of vectors (2-D,
            one per row).

    Returns:
        The array returned by ``cosine_similarity``, of shape
        ``(1, number_of_target_vectors)``.
    """
    # cosine_similarity requires 2-D inputs; the raw embedding is a flat list,
    # which made the original call raise.
    embedding = np.asarray(get_embedding(text), dtype=float).reshape(1, -1)
    target = np.atleast_2d(np.asarray(target, dtype=float))
    return cosine_similarity(embedding, target)

In [None]:
# Load the dataset from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer='federalai_embed.csv')

In [None]:
# Turn the stringified embeddings read from the CSV into NumPy arrays.
# ast.literal_eval only accepts Python literals, whereas eval would execute
# arbitrary code found in the file. Values that are already parsed pass
# through unchanged, so re-running this cell does not fail.
df['embedding'] = df['embedding'].apply(
    lambda value: np.array(ast.literal_eval(value) if isinstance(value, str) else value)
)

In [None]:
# Embed every row's text; 'Summary' is the text column (swap in another column name to embed different text).
df['embedding'] = df['Summary'].apply(get_embedding)

In [None]:
# Persist the frame for later reloads.
# - Embeddings are written as plain Python lists: a NumPy array with more than
#   1000 elements is stringified with "..." and could not be parsed back.
# - index=False stops an extra "Unnamed: 0" column from being added on every
#   save/load round trip.
df.assign(
    embedding=df['embedding'].apply(lambda vec: np.asarray(vec, dtype=float).tolist())
).to_csv("federalai_embed.csv", index=False)

In [None]:
# Display the 'Office' column.
df.loc[:, 'Office']

In [None]:
# Fold the 'Artificial Intelligence Unknown' label into plain
# 'Artificial Intelligence' (literal substring match, not a regex).
df['Techniques'] = df['Techniques'].str.replace(
    pat='Artificial Intelligence Unknown',
    repl='Artificial Intelligence',
    regex=False,
)

In [None]:
# Break each comma-separated 'Techniques' entry into individual labels,
# one label per row.
techniques_series = df['Techniques'].str.split(', ')
exploded_techniques = techniques_series.explode()

# Tally how often each technique occurs, most frequent first.
# Note: 'Other' is currently NOT filtered out of the counts.
technique_counts = (
    exploded_techniques
    .value_counts()
    .rename_axis('Technique')
    .reset_index(name='Count')
)

# Keep only the five most frequent techniques.
top_technique_counts = technique_counts.iloc[:5]

# Bar chart of those five techniques (built here, not displayed).
bar_fig = px.bar(
    top_technique_counts,
    x='Technique',
    y='Count',
    title='Frequency of Top 5 Techniques',
)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Concatenate every technique label into one space-separated string;
# missing values contribute an empty string.
all_techniques = ' '.join(exploded_techniques.fillna(''))

# Build the word cloud image from that text.
wordcloud = WordCloud(background_color='white', width=800, height=400).generate(all_techniques)

# Render the cloud on its own axes with the axis decorations hidden.
cloud_fig, cloud_ax = plt.subplots(figsize=(10, 5))
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis('off')
plt.show()

In [None]:
# Display the DataFrame as this cell's output.
df

In [19]:
from sklearn.preprocessing import StandardScaler

# Parse embeddings that are still strings BEFORE stacking them. The original
# ran this check after np.vstack / StandardScaler had already consumed the raw
# values, so it could never help.
embeddings = df['embedding'].tolist()
if embeddings and isinstance(embeddings[0], str):
    embeddings = [ast.literal_eval(e) for e in embeddings]

# One row per document, one column per embedding dimension.
embeddings_matrix = np.vstack(embeddings)

# Standardise each dimension to zero mean and unit variance for clustering.
scaler = StandardScaler()
scaled_embeddings = scaler.fit_transform(embeddings_matrix)

In [20]:
# `scaled_embeddings` (plus `embeddings`, `embeddings_matrix` and `scaler`) are
# produced by the previous cell, which must run first anyway because it
# imports StandardScaler; the copy-pasted recomputation was dropped.
#
# random_state is fixed so the cluster labels are reproducible across runs.
# NOTE(review): the config cell defines optimal_k = 3, but 5 clusters are used
# here; confirm which value is intended.
kmeans = KMeans(n_clusters=5, random_state=42)
df['cluster'] = kmeans.fit_predict(scaled_embeddings)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [None]:
# Determine the optimal number of clusters using the elbow method
inertia = []
K = range(1, 11)  # Test 1 through 10 clusters
for k in K:
    # Use a throwaway name: rebinding the global `kmeans` here would replace
    # the model that produced df['cluster'] with the k=10 model from the last
    # iteration.
    elbow_model = KMeans(n_clusters=k, random_state=42)
    elbow_model.fit(scaled_embeddings)
    inertia.append(elbow_model.inertia_)

# Plotting the elbow plot: inertia against the number of clusters
plt.figure(figsize=(10, 6))
plt.plot(K, inertia, 'bo-')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method For Optimal k')
plt.show()

In [None]:
# Project the (unscaled) embeddings down to three dimensions for plotting.
tsne_embeddings = TSNE(n_components=3, random_state=42).fit_transform(np.array(embeddings))
x = tsne_embeddings[:, 0]
y = tsne_embeddings[:, 1]
z = tsne_embeddings[:, 2]

# Wrap each summary into fixed-width lines for the hover box. The slice width
# and the step must be equal: the original sliced 30 characters but advanced
# by 20, repeating 10 characters on every line.
hover_wrap_width = 30
hover_text = [
    '<br>'.join(
        summary[start:start + hover_wrap_width]
        for start in range(0, len(summary), hover_wrap_width)
    )
    for summary in df['Summary']
]

scatter = go.Scatter3d(
    x = x,
    y = y,
    z = z,
    mode = 'markers',
    marker = dict(
        size = 5,  # Marker size
        color = df['cluster'],  # Color by cluster
        colorscale = 'Viridis',
        opacity = 0.8,
    ),
    hovertemplate =
        '%{text}<br><br>' +  # Wrapped summary text
        '<b>Cluster: %{customdata}<br>' +  # Cluster id (bold tag is opened here and not closed)
        'Coordinates: (%{x}, %{y}, %{z})<extra></extra>',  # Coordinates; empty <extra> hides the trace-name box
    text = hover_text,
    customdata = df['cluster']
)

layout = go.Layout(
    title = '3D t-SNE Clustering',
    scene = dict(
        xaxis = dict(title='Dimension 1', zeroline=False),
        yaxis = dict(title='Dimension 2', zeroline=False),
        zaxis = dict(title='Dimension 3', zeroline=False),
    ),
    hoverlabel = dict(
        bgcolor = "white",  # Background color for hover label
        font_size = 12,  # Text font size
        font_family = "Arial"  # Text font family
    )
)

fig = go.Figure(data=[scatter], layout=layout)
fig.show()

In [None]:
# The stopword corpus must be present before stopwords.words() is called;
# the download is a no-op when it is already installed.
nltk.download('stopwords', quiet=True)

kw_model = KeyBERT()
stop_words = stopwords.words('english')

# Unigram keywords (keyword, score) for every summary.
df['keywords'] = df['Summary'].apply(lambda x: kw_model.extract_keywords(x, keyphrase_ngram_range=(1, 1), stop_words=stop_words))

num_examples = 15 # Adjust this value based on your needs

# Use the labels stored on the frame as the single source of truth. The
# original referenced an undefined `clusters` and looped over range(8)
# regardless of how many clusters exist.
clusters = df['cluster'].to_numpy()
scaled_matrix = np.asarray(scaled_embeddings)

for cluster_idx in sorted(df['cluster'].unique()):
    # Row positions (within df / scaled_matrix) of this cluster's members.
    member_positions = np.flatnonzero(clusters == cluster_idx)
    cluster_data = scaled_matrix[member_positions]

    # Centroid computed from the members themselves, so this cell does not
    # depend on whichever model the global `kmeans` currently points at.
    centroid = cluster_data.mean(axis=0)
    distances = np.linalg.norm(cluster_data - centroid, axis=1)

    # argsort yields positions inside the cluster subset; map them back to
    # positions in df before looking up summaries (the original used them
    # directly as df labels and printed unrelated rows).
    closest_positions = member_positions[np.argsort(distances)[:num_examples]]

    print(f"\nCluster {cluster_idx}:")
    for summary in df['Summary'].iloc[closest_positions]:
        print(f"- {summary}")

In [None]:
# Every distinct keyword extracted across the whole dataset. Each entry of
# df['keywords'] is a list of (keyword, score) pairs; the keyword is a string,
# so the original isinstance(kw, tuple) filter left this set empty.
all_keywords = set()
for keywords in df['keywords']:
    all_keywords.update(kw for kw, _ in keywords)

# For each cluster: keyword -> median score * number of occurrences, keeping
# the 15 highest-weighted keywords.
cluster_keyphrases_unigrams = {}
for cluster in df['cluster'].unique():
    cluster_data = df[df['cluster'] == cluster]

    # Single pass over the cluster's keywords, grouping scores by keyword.
    scores_by_keyword = {}
    for kws in cluster_data['keywords']:
        for kw, score in kws:
            scores_by_keyword.setdefault(kw, []).append(score)

    keyphrase_scores_unigrams = {
        kw: np.round(np.median(scores) * len(scores), 3)
        for kw, scores in scores_by_keyword.items()
    }
    cluster_keyphrases_unigrams[cluster] = sorted(keyphrase_scores_unigrams.items(), key=lambda x: x[1], reverse=True)[:15]
print(cluster_keyphrases_unigrams)

In [21]:
# Extract KeyBERT keyphrases per summary: single words into 'Keywords1',
# two-word phrases into 'Keywords2'.
for _column, _ngram in (('Keywords1', 1), ('Keywords2', 2)):
    df[_column] = df['Summary'].apply(
        lambda x, _size=_ngram: kw_model.extract_keywords(
            x,
            keyphrase_ngram_range=(_size, _size),
            stop_words=stop_words,
        )
    )

In [None]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download the required NLTK data if not already present
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Initialize the stemmer and lemmatizer
stemmer = PorterStemmer()  # not used in this cell
lemmatizer = WordNetLemmatizer()


def _top_keyphrases(keyword_lists, top_n=15):
    """Rank lemmatized keyphrases by median score times occurrence count.

    Args:
        keyword_lists: Iterable of lists of (keyphrase, score) pairs.
        top_n: Number of highest-weighted keyphrases to keep.

    Returns:
        List of (lemmatized keyphrase, weight) pairs, highest weight first.
    """
    # Group scores by lemma in one pass instead of rescanning (and
    # re-lemmatizing) every keyphrase once per vocabulary entry.
    scores_by_lemma = {}
    for kws in keyword_lists:
        for kw, score in kws:
            scores_by_lemma.setdefault(lemmatizer.lemmatize(kw), []).append(score)

    weighted = {
        lemma: np.round(np.median(scores) * len(scores), 3)
        for lemma, scores in scores_by_lemma.items()
    }
    return sorted(weighted.items(), key=lambda x: x[1], reverse=True)[:top_n]


# All distinct lemmatized keyphrases (unigrams and bigrams) in the dataset.
all_keywords = set()
for column in ('Keywords1', 'Keywords2'):
    for keywords in df[column]:
        all_keywords.update(lemmatizer.lemmatize(keyword) for keyword, _ in keywords)

cluster_keyphrases_bigrams = {}
cluster_keyphrases_unigrams = {}

for cluster in df['cluster'].unique():
    cluster_data = df[df['cluster'] == cluster]
    cluster_keyphrases_bigrams[cluster] = _top_keyphrases(cluster_data['Keywords2'])
    cluster_keyphrases_unigrams[cluster] = _top_keyphrases(cluster_data['Keywords1'])

# One column per cluster, one (keyphrase, weight) tuple per cell. Wrapping
# each list in a Series pads shorter columns with NaN; building the frame
# straight from the lists raises when clusters have different counts.
cluster_df_bigrams = pd.DataFrame(
    {cluster: pd.Series(pairs, dtype=object) for cluster, pairs in cluster_keyphrases_bigrams.items()}
)
cluster_df_unigrams = pd.DataFrame(
    {cluster: pd.Series(pairs, dtype=object) for cluster, pairs in cluster_keyphrases_unigrams.items()}
)

# Change names of columns based on top unigrams (NaN padding is skipped)
new_columns = [
    '_'.join(keyword for keyword, _ in cluster_df_unigrams[col].dropna().head(3))
    for col in cluster_df_bigrams.columns
]
cluster_df_bigrams.columns = new_columns
cluster_df_unigrams.columns = new_columns

print(cluster_df_bigrams)

[nltk_data] Downloading package punkt to /Users/skacholia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/skacholia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/skacholia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
