In [1]:
import json
import random

# List of computer science-related keywords
cs_keywords = [
    "algorithm", "data structure", "programming", "computer", "software", 
    "network", "database", "machine learning", "AI", "artificial intelligence",
    "computing", "code", "coding", "neural network", "deep learning", "blockchain",
    "runtime", "compiler", "operating system", "API", "application", "hardware",
    "CPU", "GPU", "RAM", "disk", "cache", "cloud computing", "cybersecurity", "encryption"
    # ... add more keywords as needed
]

def is_cs_related(text):
    # Convert text to lowercase and check if any of the keywords are present
    return any(keyword in text.lower() for keyword in cs_keywords)


selected_data = []

# Open the JSON file for reading
with open("s2ag.valid.0", "r") as json_file:
    for line in json_file:
        try:
            data = json.loads(line)
            # Check if the data has 'text' and 'id' keys and if the text is CS-related
            if 'text' in data and 'id' in data and is_cs_related(data['text']):
                selected_data.append(data)
        except json.JSONDecodeError:
            print("Invalid JSON data:", line)

# Shuffle the selected data to ensure randomness
random.shuffle(selected_data)

# Split the data into train and test sets
train_data = selected_data[:3000]
test_data = selected_data[3000:4000]

# Now you have 'train_data' and 'test_data' containing computer science-related entries
# Each entry is a dictionary with 'id' and 'text' keys

In [3]:
# Assume train_data is a list of dictionaries with keys 'id' and 'text'

# Loop through and create a formatted string for each entry
output_string = ""
for entry in test_data:
    entry_string = "ID: " + str(entry["id"]) + "\n" + "Text: " + entry["text"] + "\n\n"
    output_string += entry_string

# Save to a txt file with utf-8 encoding
with open("shraddha.txt", "w", encoding="utf-8") as file:
    file.write(output_string)

In [5]:
# Read the txt file
with open("shraddha.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

# Modify the lines as needed
for i in range(len(lines)):
    line = lines[i].strip()  # Remove leading/trailing white spaces
    if line.startswith("Text:"):
        # Check if the line doesn't end with punctuation (you can extend this list if needed)
        if not (line.endswith('.') or line.endswith('!') or line.endswith('?')):
            lines[i] = line + '.\n'

# Save the modified content back to the txt file
with open("shraddha2.txt", "w", encoding="utf-8") as file:
    file.writelines(lines)

In [6]:
# Read the txt file
with open("shraddha2.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

# Initialize a list to store the formatted content
formatted_lines = []

# Initialize buffer for collecting text
text_buffer = []

# Loop through the lines and format them
for line in lines:
    line = line.strip()
    if line.startswith("ID:"):
        # If there's content in the buffer, append it as a single paragraph
        if text_buffer:
            formatted_lines.append(" ".join(text_buffer))
            text_buffer = []
        formatted_lines.append(line)
    else:
        # Collect lines in buffer to eventually combine them into a paragraph
        text_buffer.append(line.replace("Text:", "").strip())

# Add any remaining text in the buffer as a paragraph
if text_buffer:
    formatted_lines.append(" ".join(text_buffer))

# Save the formatted content back to a new txt file
with open("shraddha3.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(formatted_lines))


In [7]:
# Read the txt file
with open("shraddha3.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

# Filter out lines that start with "ID:"
filtered_lines = [line for line in lines if not line.startswith("ID:")]

# Save the filtered content back to the txt file (or a new one if you prefer)
with open("shraddha4.txt", "w", encoding="utf-8") as file:
    file.writelines(filtered_lines)

In [8]:
! pip install spacy



In [9]:
import spacy
from spacy.lang.en import English

# Read the text from the file
with open("shraddha4.txt", "r", encoding="utf-8") as file:
    whole_text = file.read()

nlp = English()  # Create an English language object
sentencizer = nlp.add_pipe("sentencizer")  # Add the sentencizer component to the pipeline

# Break text into chunks of 500,000 characters each
chunk_size = 500000
text_chunks = [whole_text[i:i + chunk_size] for i in range(0, len(whole_text), chunk_size)]

all_sentences = []
for chunk in text_chunks:
    doc = nlp(chunk)  # Process the chunk
    # Extract sentences from the doc and extend the all_sentences list
    all_sentences.extend([sent.text for sent in doc.sents])

# Save the sentences to a new txt file
with open("shraddha5.txt", "w", encoding="utf-8") as file:
    for sentence in all_sentences:
        file.write(sentence + "\n\n")  # Each sentence on a new line with a blank line in between


c:\Users\20200\AppData\Local\Programs\Python\Python311\Lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
c:\Users\20200\AppData\Local\Programs\Python\Python311\Lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-gcc_10_3_0.dll


In [11]:
import re
from word2number import w2n

def remove_urls(text):
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    return url_pattern.sub('', text)

def remove_special_characters(text):
    # Remove bullets, apostrophes, hyphens, and enumerators without merging words
    text = re.sub(r'[\•\’\-\–]', ' ', text) # Replacing with space
    text = re.sub(r'\d+\.', '', text)  # Removing enumerators
    return text

def process_numericals(text):
    # If the line is just a number or a decimal, return an empty string
    if re.fullmatch(r'\d+(\.\d+)?', text.strip()):
        return ''
    # Convert other numbers in the sentence to text
    text = re.sub(r'\b\d+\b', lambda m: str(w2n.word_to_num(m.group())), text)
    return text



with open("shraddha5.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

processed_lines = []
for line in lines:
    line = remove_urls(line)
    line = remove_special_characters(line)
    line = process_numericals(line)
    processed_lines.append(line)



with open("shraddha6.txt", "w", encoding="utf-8") as file:
    file.writelines(processed_lines)


In [12]:
import nltk
from nltk.corpus import words as nltk_words

nltk.download('words')

def is_valid_word(word, word_list):
    # Convert to lowercase and check if the word exists in the word list
    return word.lower() in word_list

def process_file(filename):
    # Load English words
    english_words = set(nltk_words.words())

    # Open the file with utf-8 encoding
    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    processed_sentences = []
    all_words = set()

    for line in lines:
        # Convert to lowercase
        line = line.strip().lower()

        # Split into words and remove the trailing period
        words = [w.rstrip('.') for w in line.split()]

        # Filter out invalid words
        valid_words = [word for word in words if is_valid_word(word, english_words)]

        if valid_words:  # Only add sentences that have at least one valid word
            processed_sentence = ' '.join(valid_words)
            processed_sentences.append(processed_sentence)
            all_words.update(valid_words)

    # Save to new file
    with open('shraddha7.txt', 'w', encoding='utf-8') as f:
        for sentence in processed_sentences:
            f.write(sentence + '\n')

    # Get ordered unique sentences and words
    unique_sentences = sorted(set(processed_sentences))
    unique_words = sorted(all_words)

    # Print the results
    print("Ordered Unique Sentences:")
    for sentence in unique_sentences:
        print(sentence)
    print("\nSet of Words:")
    for word in unique_words:
        print(word)
    print("\nLength of Unique Sentences:", len(unique_sentences))
    print("Length of Set of Words:", len(unique_words))

process_file('shraddha6.txt')


[nltk_data] Error loading words: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


Ordered Unique Sentences:
a an industrial and a wireless module are included in each device
a analysis of research on intangible cultural heritage tourism the perspective of china
a antibody the in a broad range of species and is one of the most widely used in motor research
a approach for lower truncated normal cumulative application to reliability of used
a bidirectional recurrent neural network that user as a time series classification problem is trained and tested to achieve near optimal we consider and power allocation with causal knowledge of the system which is an infinite state decision problem
a body force model for the fan is and for axisymmetric and
a brief review of its architecture is and a reference architecture is the key involved
a brief review of the different of fabrication of and different fabrication by is articulated in this paper
a calibration method for the identification of the between the absolute and is
a case from us judicial practice is l v
a case study of a

In [14]:
from nltk.corpus import stopwords

# Assuming you have NLTK's stopwords dataset downloaded
# If not, you can download using:
# import nltk
# nltk.download('stopwords')

# Load English stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords_from_file(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Removing stopwords from each line
    cleaned_lines = []
    for line in lines:
        words = line.split()
        cleaned_line = ' '.join([word for word in words if word.lower() not in stop_words])
        cleaned_lines.append(cleaned_line)

    # Writing cleaned lines to a new file
    with open(output_file, 'w', encoding='utf-8') as f:
        for line in cleaned_lines:
            f.write(line + '\n')

# Usage
input_filename = 'shraddha7.txt'
output_filename = 'shraddha8.txt'
remove_stopwords_from_file(input_filename, output_filename)


In [15]:
from scipy.sparse import lil_matrix

def process_file(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Removing duplicates while maintaining order
    unique_sentences = sorted(set(lines), key=lines.index)
    
    # Extract all unique words, ensuring lowercase representation
    all_words = set()
    for sentence in unique_sentences:
        for word in sentence.split():
            all_words.add(word.lower())

    return unique_sentences, sorted(all_words)

unique_sentences, unique_words = process_file('shraddha8.txt')
word_to_index = {word: idx for idx, word in enumerate(unique_words)}

num_sentences = len(unique_sentences)
num_words = len(unique_words)

input_matrix = lil_matrix((num_sentences, num_words), dtype=int)
output_matrix = lil_matrix((num_sentences, num_words), dtype=int)

context_window = 3

for idx, sentence in enumerate(unique_sentences):
    # Input matrix
    for word in sentence.split():
        input_matrix[idx, word_to_index[word.lower()]] = 1

    # Output matrix
    context_start = max(0, idx - (context_window // 2))
    context_end = min(num_sentences, idx + (context_window // 2) + 1)
    
    for i in range(context_start, context_end):
        if i != idx:
            for word in unique_sentences[i].split():
                output_matrix[idx, word_to_index[word.lower()]] = 1

# Converting to CSR format for efficient row-wise operations
input_matrix = input_matrix.tocsr()
output_matrix = output_matrix.tocsr()




In [16]:
from collections import defaultdict
import math
from scipy.sparse import lil_matrix

# Step 1: Calculate IDF for each word in the vocabulary

word_sentence_indices = defaultdict(set)

for idx, sentence in enumerate(unique_sentences):
    for word in set(sentence.split()):  # Using set to avoid counting a word multiple times in the same sentence
        word_sentence_indices[word.lower()].add(idx)

idf = {}
for word, sentence_indices in word_sentence_indices.items():
    idf[word] = math.log(len(unique_sentences) / len(sentence_indices))

# Step 2: Calculate TF for each word in each sentence based on the context window and compute TF-IDF

tf_idf_matrix = lil_matrix((len(unique_sentences), len(unique_words)), dtype=float)

context_window_size = 3  # -1, 0, 1

for idx, sentence in enumerate(unique_sentences):
    word_counts = defaultdict(int)
    
    # Calculate TF (Term Frequency) with the context window
    context_start = max(0, idx - (context_window_size // 2))
    context_end = min(len(unique_sentences), idx + (context_window_size // 2) + 1)
    
    total_words_in_context = 0
    for i in range(context_start, context_end):
        total_words_in_context += len(unique_sentences[i].split())
        for word in unique_sentences[i].split():
            word_counts[word.lower()] += 1
    


    for word, count in word_counts.items():
        tf = count / total_words_in_context
        tf_idf_matrix[idx, unique_words.index(word)] = tf * idf[word]




In [17]:
vocab_size = input_matrix.shape[1]
print(vocab_size)

7879


In [18]:
import tensorflow as tf

vocab_size = input_matrix.shape[1]
num_hiddens = 25

W1 = tf.Variable(tf.random.normal(shape=(vocab_size, num_hiddens), mean=0, stddev=0.01))
W2 = tf.Variable(tf.random.normal(shape=(num_hiddens, vocab_size), mean=0, stddev=0.01))

params = [W1, W2]

In [19]:
import tensorflow as tf
from scipy.sparse import lil_matrix
import random

# ...[Your previous function and data preparation definitions]

def net(X):
    X = tf.reshape(tf.cast(X, dtype=tf.float32), (-1, vocab_size))
    a1 = tf.matmul(X, W1)  # Replaced input_matrix with X
    a2 = tf.matmul(a1, W2)
    return softmax(a2)

def negative_log_likelihood(y_hat, y):
    # Compute the negative log likelihood for true class predictions
    return -tf.reduce_mean(tf.math.log(tf.boolean_mask(y_hat, y) + 1e-10))

def sgd(params, grads, lr):
    """Gradient descent."""
    for param, grad in zip(params, grads):
        param.assign_sub(lr * grad)

def softmax(X):
    X_exp = tf.exp(X)
    partition = tf.reduce_sum(X_exp, 1, keepdims=True)
    return X_exp / partition 

# ...[Your previous data loading and processing definitions]

# Converting sparse matrices to dense tensors for TensorFlow operations
input_tensor = tf.convert_to_tensor(tf_idf_matrix.toarray(), dtype=tf.float32)
output_tensor = tf.convert_to_tensor(output_matrix.toarray(), dtype=tf.float32)



In [24]:
print(input_matrix.shape)
print(tf_idf_matrix.shape)
print(output_matrix.shape)

(9906, 7879)
(9906, 7879)
(9906, 7879)
