In [1]:
# Scratch cell: prints 1 — presumably just a check that the kernel is alive; safe to drop.
print(1)

1


In [2]:
import nltk
import gensim
import networkx as nx
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
import string

from collections import defaultdict

In [18]:
# Fetch the NLTK data packages this notebook relies on. NLTK reports packages
# that are already present as up to date rather than downloading them again.
nltk.download('punkt_tab')  # tokenizer data — presumably what word_tokenize needs; confirm for your NLTK version
nltk.download('stopwords')  # word lists read via stopwords.words("english")
nltk.download('wordnet')  # presumably the lexical data behind WordNetLemmatizer — confirm

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [19]:
# Preprocessing Tasks
def preprocess_text(text):
    """Turn raw text into a list of normalised word tokens.

    Steps, in order: lower-case the text, delete every character listed in
    ``string.punctuation``, tokenise with NLTK's ``word_tokenize``, discard
    English stopwords, and lemmatise what remains with ``WordNetLemmatizer``.

    Punctuation is removed *before* tokenisation, so an apostrophe inside a
    word simply disappears (``"don't"`` becomes ``"dont"``).

    Args:
        text: The raw input string.

    Returns:
        list: The surviving lemmatised tokens, in their original order.
    """
    punctuation_stripper = str.maketrans("", "", string.punctuation)
    cleaned = text.lower().translate(punctuation_stripper)

    words = word_tokenize(cleaned)
    english_stopwords = set(stopwords.words("english"))
    lemmatize = WordNetLemmatizer().lemmatize

    # Filter and lemmatise in a single pass; the result matches doing the two
    # steps one after the other because lemmatisation only sees kept tokens.
    return [lemmatize(word) for word in words if word not in english_stopwords]


In [6]:
# Read a text file and preprocess its contents
def read_and_preprocess_text(file_path):
    """Load a UTF-8 text file and return its preprocessed tokens.

    Args:
        file_path: Path of the text file to read; it is decoded as UTF-8.

    Returns:
        Whatever ``preprocess_text`` returns for the file's full contents.
    """
    with open(file_path, mode='r', encoding='utf-8') as source:
        raw_text = source.read()

    # The file is closed before the (potentially slow) preprocessing starts.
    tokens = preprocess_text(raw_text)
    return tokens


In [7]:
# Train a Word2Vec model
def train_word2vec_model(preprocessed_text, vector_size=100, window=5, min_count=1, workers=4):
    """Train a gensim Word2Vec model on a flat sequence of tokens.

    The tokens are fed to gensim as consecutive chunks of at most 10,000
    tokens each. gensim's optimised training routines truncate any single
    text to 10,000 words, so passing a long document as one "sentence" would
    leave every later token in the vocabulary but untrained. Documents of
    10,000 tokens or fewer are still passed as a single text.

    Args:
        preprocessed_text: Sequence of string tokens in document order.
        vector_size: Dimensionality of the learned word vectors.
        window: Context window size handed to Word2Vec.
        min_count: Minimum token frequency handed to Word2Vec.
        workers: Number of worker threads handed to Word2Vec.

    Returns:
        The trained ``gensim.models.Word2Vec`` instance.
    """
    # Per-text word limit of gensim's training code (see the `batch_words` note in its docs).
    max_tokens_per_text = 10_000

    tokens = list(preprocessed_text)
    sentences = [
        tokens[start:start + max_tokens_per_text]
        for start in range(0, len(tokens), max_tokens_per_text)
    ]
    if not sentences:
        # Empty input: still hand gensim one empty text, so whatever error it
        # raises for an empty corpus surfaces exactly as it would have before.
        sentences = [tokens]

    model = gensim.models.Word2Vec(
        sentences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers
    )
    return model

In [10]:
# Co-occurrence based graph constructor
def create_coocurrence_graph(preprocessed_text, window_size=2):
    """Build an undirected, weighted co-occurrence graph from a token list.

    Two tokens are connected when they appear at most ``window_size``
    positions apart. The edge's ``weight`` attribute is the number of such
    position pairs, each pair counted exactly once. A token that repeats
    inside the window produces a self-loop.

    Args:
        preprocessed_text: Indexable sequence of tokens in document order.
        window_size: Maximum distance, in token positions, between two
            tokens for them to count as co-occurring.

    Returns:
        networkx.Graph: Nodes are tokens; edges carry an integer ``weight``.
        Tokens that never co-occur with anything are not added as nodes.
    """
    graph = nx.Graph()
    token_total = len(preprocessed_text)

    for i, word in enumerate(preprocessed_text):
        # Scan forward only. The graph is undirected, so visiting the pair
        # (i, j) again from position j would count the same co-occurrence a
        # second time and double every weight.
        for j in range(i + 1, min(token_total, i + window_size + 1)):
            neighbour = preprocessed_text[j]
            if graph.has_edge(word, neighbour):
                graph[word][neighbour]['weight'] += 1
            else:
                graph.add_edge(word, neighbour, weight=1)
    return graph


In [11]:
# Word2Vec Based Graph Constructor
def create_word2vec_graph(model, top_n=10):
    """Build an undirected similarity graph from a trained Word2Vec model.

    Every vocabulary word is linked to the ``top_n`` entries returned by
    ``model.wv.most_similar`` for it. Each edge's ``weight`` attribute is the
    similarity score reported for that pair; if a pair is produced more than
    once, the most recently written score is the one kept.

    Args:
        model: Object exposing ``wv.key_to_index`` and ``wv.most_similar``
            (e.g. a trained gensim Word2Vec model).
        top_n: Number of most-similar neighbours requested per word.

    Returns:
        networkx.Graph: The weighted similarity graph.
    """
    similarity_graph = nx.Graph()
    keyed_vectors = model.wv

    for term in keyed_vectors.key_to_index:
        neighbours = keyed_vectors.most_similar(term, topn=top_n)
        # Stores each score under the default 'weight' edge attribute.
        similarity_graph.add_weighted_edges_from(
            (term, neighbour, score) for neighbour, score in neighbours
        )
    return similarity_graph

In [12]:
# Graph visualization function
def visualize_graph(graph, title="Knowledge Graph"):
    """Draw a graph with labelled nodes and weight-labelled edges.

    A new 12x12 matplotlib figure is opened, nodes are positioned with
    ``nx.spring_layout`` (``k=0.5``), and each edge is annotated with its
    ``weight`` attribute. No seed is passed to the layout, so node positions
    may differ from one call to the next.

    Args:
        graph: The networkx graph to draw.
        title: Text shown as the plot title.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    plt.figure(figsize=(12, 12))
    layout = nx.spring_layout(graph, k=0.5)

    # Appearance of nodes, node labels and edges, gathered in one place.
    drawing_style = dict(
        with_labels=True,
        node_size=1500,
        node_color='skyblue',
        font_size=10,
        font_color='black',
        font_weight='bold',
        width=1.2,
    )
    nx.draw(graph, layout, **drawing_style)

    weights_by_edge = nx.get_edge_attributes(graph, 'weight')
    nx.draw_networkx_edge_labels(graph, layout, edge_labels=weights_by_edge, font_size=8)

    plt.title(title)
    plt.show()

In [13]:
# Save graph to the file
def save_graph_to_file(graph, filename):
    """Write a graph's edges to a UTF-8 text file, one edge per line.

    Each line has the form ``u, v, weight``. An edge without a ``weight``
    attribute is written with weight ``1``. An existing file is overwritten.
    A confirmation message is printed once the file has been written.

    Args:
        graph: Object whose ``edges(data=True)`` yields ``(u, v, attrs)``
            triples, with ``attrs`` a dict (e.g. a networkx graph).
        filename: Path of the file to create.

    Returns:
        None.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        out.writelines(
            f"{src}, {dst}, {attrs.get('weight', 1)}\n"
            for src, dst, attrs in graph.edges(data=True)
        )
    print("Sucess write the graph")

In [14]:
# Input document to analyse.
# NOTE(review): hardcoded absolute path (looks like a Colab /content upload) — adjust when running elsewhere.
file_path = "/content/BigRock.txt"

In [20]:
# Read the input file and reduce it to a flat list of cleaned tokens.
preprocessed_text = read_and_preprocess_text(file_path)

In [22]:
# Train a Word2Vec model on the token list, using the function's default hyperparameters.
word2vec_model = train_word2vec_model(preprocessed_text)

In [23]:
# Display the model's repr as a quick check that training produced an object.
word2vec_model

<gensim.models.word2vec.Word2Vec at 0x780e2fcc6210>

In [24]:
# Build the co-occurrence graph from the tokens with the default window_size of 2.
coocurrence_graph = create_coocurrence_graph(preprocessed_text)

In [25]:
# Display the graph's repr as a quick check that it was created.
coocurrence_graph

<networkx.classes.graph.Graph at 0x780e2fd6d8d0>

In [26]:
# Build the similarity graph: every vocabulary word linked to its most similar words (default top_n=10).
similarity_graph = create_word2vec_graph(word2vec_model)

In [29]:
# Persist both graphs as text files (relative paths), one "u, v, weight" line per edge.
save_graph_to_file(coocurrence_graph, "coocurrence_graph.txt")
save_graph_to_file(similarity_graph, "similarity_graph.txt")

Sucess write the graph
Sucess write the graph


In [None]:
# Draw the co-occurrence graph, with each edge labelled by its weight.
visualize_graph(coocurrence_graph, "Coocurrence Graph")

In [None]:
# Draw the Word2Vec similarity graph, with each edge labelled by its similarity score.
visualize_graph(similarity_graph, "Word2Vec Graph")