In [2]:
import pandas as pd
from collections import defaultdict
from collections import Counter
import nltk
from nltk.corpus import stopwords
import numpy as np



In [3]:
df = pd.read_csv('data.csv')

selected_rows_df = df.head(500)

# Step 4: Save selected rows DataFrame to CSV
selected_rows_df.to_csv('first_50_rows.csv', index=False)

# Optionally, you can display the selected DataFrame
print(selected_rows_df)

     ARTICLE_ID                 TITLE                 SECTION_TITLE  \
0             0             Anarchism                  Introduction   
1             0             Anarchism     Etymology and terminology   
2             0             Anarchism                       History   
3             0             Anarchism  Anarchist schools of thought   
4             0             Anarchism   Internal issues and debates   
..          ...                   ...                           ...   
495          42                 Algae                External links   
496          43  Analysis of variance                  Introduction   
497          43  Analysis of variance                       History   
498          43  Analysis of variance            Motivating example   
499          43  Analysis of variance    Background and terminology   

                                          SECTION_TEXT  
0    \n\n\n\n\n\n'''Anarchism''' is a political phi...  
1    \n\nThe term ''anarchism'' i

In [4]:
df1=pd.read_csv('first_50_rows.csv')
df1

Unnamed: 0,ARTICLE_ID,TITLE,SECTION_TITLE,SECTION_TEXT
0,0,Anarchism,Introduction,\n\n\n\n\n\n'''Anarchism''' is a political phi...
1,0,Anarchism,Etymology and terminology,\n\nThe term ''anarchism'' is a compound word ...
2,0,Anarchism,History,\n\n===Origins===\nWoodcut from a Diggers docu...
3,0,Anarchism,Anarchist schools of thought,\nPortrait of philosopher Pierre-Joseph Proudh...
4,0,Anarchism,Internal issues and debates,\nconsistent with anarchist values is a contro...
...,...,...,...,...
495,42,Algae,External links,\n\n* – a database of all algal names includi...
496,43,Analysis of variance,Introduction,\n\n'''Analysis of variance''' ('''ANOVA''') i...
497,43,Analysis of variance,History,While the analysis of variance reached fruitio...
498,43,Analysis of variance,Motivating example,No fit.Fair fitVery good fitThe analysis of va...


In [5]:
null_values = df1.isnull().sum()

# Step 4: Display null values count
print("Null Values in Each Column:")
print(null_values)


Null Values in Each Column:
ARTICLE_ID       0
TITLE            0
SECTION_TITLE    0
SECTION_TEXT     0
dtype: int64


In [6]:
df1['SECTION_TEXT'] = df1['SECTION_TEXT'].astype(str)  # Convert to string
df1['SECTION_TEXT'] = df1['SECTION_TEXT'].str.lower()  # Convert to lowercase
df1['SECTION_TEXT'] = df1['SECTION_TEXT'].str.replace('[^\w\s]', '')  # Remove special characters and punctuation


In [7]:
df1.head()

Unnamed: 0,ARTICLE_ID,TITLE,SECTION_TITLE,SECTION_TEXT
0,0,Anarchism,Introduction,\n\n\n\n\n\n'''anarchism''' is a political phi...
1,0,Anarchism,Etymology and terminology,\n\nthe term ''anarchism'' is a compound word ...
2,0,Anarchism,History,\n\n===origins===\nwoodcut from a diggers docu...
3,0,Anarchism,Anarchist schools of thought,\nportrait of philosopher pierre-joseph proudh...
4,0,Anarchism,Internal issues and debates,\nconsistent with anarchist values is a contro...


In [14]:
nltk.download('stopwords')

# Read preprocessed data

# Get NLTK English stopwords
stop_words = set(stopwords.words('english'))

# Tokenize text and create bag of words while filtering out stop words
bag_of_words = Counter()
for text in df1['SECTION_TEXT']:
    words = text.split()  # Split text into words
    # Update bag_of_words with word counts while filtering out stop words
    bag_of_words.update(word for word in words if word.lower() not in stop_words)

# Save vocabulary with unique IDs and frequencies to a file
with open('vocabulary.txt', 'w') as f:
    for idx, (word, count) in enumerate(bag_of_words.items()):
        f.write(f"{idx}: {word}: {count}\n")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Using Bag Of Words which include the frequency and the words present in the SECTION_TEXT

In [15]:
print("Vocabulary (with Unique IDs and Frequencies):")
for idx, (word, count) in enumerate(bag_of_words.items()):
    print(f"ID: {idx}, Word: {word}, Frequency: {count}")


Vocabulary (with Unique IDs and Frequencies):
ID: 0, Word: '''anarchism''', Frequency: 1
ID: 1, Word: political, Frequency: 125
ID: 2, Word: philosophy, Frequency: 65
ID: 3, Word: advocates, Frequency: 9
ID: 4, Word: self-governed, Frequency: 1
ID: 5, Word: societies, Frequency: 16
ID: 6, Word: based, Frequency: 139
ID: 7, Word: voluntary, Frequency: 9
ID: 8, Word: institutions., Frequency: 2
ID: 9, Word: often, Frequency: 155
ID: 10, Word: described, Frequency: 48
ID: 11, Word: stateless, Frequency: 4
ID: 12, Word: although, Frequency: 137
ID: 13, Word: several, Frequency: 146
ID: 14, Word: authors, Frequency: 15
ID: 15, Word: defined, Frequency: 43
ID: 16, Word: specifically, Frequency: 12
ID: 17, Word: institutions, Frequency: 21
ID: 18, Word: non-hierarchical, Frequency: 1
ID: 19, Word: free, Frequency: 56
ID: 20, Word: associations., Frequency: 1
ID: 21, Word: anarchism, Frequency: 86
ID: 22, Word: holds, Frequency: 13
ID: 23, Word: state, Frequency: 352
ID: 24, Word: undesirable,

In [19]:
vocabulary = {}

# Read vocabulary from text file and reconstruct the vocabulary dictionary
with open('vocabulary.txt', 'r') as f:
    for line in f:
        parts = line.strip().split(':')  # Split line into parts
        if len(parts) == 3:  # Ensure the line has all required parts
            word_id, word, _ = parts
            vocabulary[word.strip()] = int(word_id)  # Store word and its ID in the vocabulary dictionary

# Calculate Term Frequency (TF) for the single document
text = df1['SECTION_TEXT'][0]  # Assuming there's only one document in the dataframe
words = text.split()
word_counts = Counter(word for word in words if word.lower() not in stop_words)
total_words = sum(word_counts.values())

# Initialize document TF vector as a dictionary
doc_tf = {}

# Calculate TF for each term
for word, count in word_counts.items():
    word_id = vocabulary.get(word.lower(), -1)  # Get word ID from vocabulary
    if word_id != -1:  # If word is in the vocabulary
        term_frequency = count / total_words  # Calculate TF
        doc_tf[word_id] = term_frequency  # Store TF in the document TF vector

# Display TF matrix for the single document
print("Term Frequency (TF) Matrix for Single Document:")
print(doc_tf)


Term Frequency (TF) Matrix for Single Document:
{0: 0.009259259259259259, 1: 0.009259259259259259, 2: 0.018518518518518517, 3: 0.009259259259259259, 4: 0.009259259259259259, 5: 0.018518518518518517, 6: 0.018518518518518517, 7: 0.009259259259259259, 8: 0.009259259259259259, 9: 0.018518518518518517, 10: 0.009259259259259259, 11: 0.009259259259259259, 12: 0.009259259259259259, 13: 0.009259259259259259, 14: 0.009259259259259259, 15: 0.009259259259259259, 16: 0.018518518518518517, 17: 0.009259259259259259, 18: 0.009259259259259259, 19: 0.009259259259259259, 20: 0.009259259259259259, 21: 0.06481481481481481, 22: 0.009259259259259259, 23: 0.018518518518518517, 24: 0.009259259259259259, 25: 0.009259259259259259, 26: 0.009259259259259259, 27: 0.009259259259259259, 28: 0.009259259259259259, 29: 0.009259259259259259, 30: 0.009259259259259259, 31: 0.009259259259259259, 32: 0.009259259259259259, 33: 0.009259259259259259, 34: 0.009259259259259259, 35: 0.009259259259259259, 36: 0.009259259259259259, 

In [97]:
# Display only the first two documents in the TF matrix
print("\nTerm Frequency (TF) Matrix for First Two Documents:\n")
for doc_id, doc_tf in enumerate(tf_matrix[:2], start=1):
    print(f"Document {doc_id}:")
    for word_id, freq in enumerate(doc_tf):
          # Skip printing zero frequencies
            print(f"({word_id}, {freq})")
    print()



Term Frequency (TF) Matrix for First Two Documents:

Document 1:
(0, 0)
(1, 1)
(2, 2)
(3, 3)
(4, 4)
(5, 5)
(6, 6)
(7, 7)
(8, 8)
(9, 9)
(10, 10)
(11, 11)
(12, 12)
(13, 13)
(14, 14)
(15, 15)
(16, 16)
(17, 17)
(18, 18)
(19, 19)
(20, 20)
(21, 21)
(22, 22)
(23, 23)
(24, 24)
(25, 25)
(26, 26)
(27, 27)
(28, 28)
(29, 29)
(30, 30)
(31, 31)
(32, 32)
(33, 33)
(34, 34)
(35, 35)
(36, 36)
(37, 37)
(38, 38)
(39, 39)
(40, 40)
(41, 41)
(42, 42)
(43, 43)
(44, 44)
(45, 45)
(46, 46)
(47, 47)
(48, 48)
(49, 49)
(50, 50)
(51, 51)
(52, 52)
(53, 53)
(54, 54)
(55, 55)
(56, 56)
(57, 57)
(58, 58)
(59, 59)
(60, 60)
(61, 61)
(62, 62)
(63, 63)
(64, 64)
(65, 65)
(66, 66)
(67, 67)
(68, 68)
(69, 69)
(70, 70)
(71, 71)
(72, 72)
(73, 73)
(74, 74)
(75, 75)
(76, 76)
(77, 77)
(78, 78)
(79, 79)
(80, 80)
(81, 81)
(82, 82)
(83, 83)
(84, 84)
(85, 85)
(86, 86)
(87, 87)
(88, 88)
(89, 89)
(90, 90)
(91, 91)
(92, 92)

Document 2:
(0, 93)
(1, 94)
(2, 95)
(3, 96)
(4, 97)
(5, 98)
(6, 99)
(7, 100)
(8, 101)
(9, 102)
(10, 103)
(11, 104)
(

In [98]:
# import nltk
# from nltk.corpus import stopwords
# from collections import Counter

# # Assuming df1 is your DataFrame containing multiple documents

# # Display only the first two documents in the TF matrix
# print("\nTerm Frequency (TF) Matrix for First Two Documents:\n")

# # Initialize TF matrix to store TF vectors for all documents
# tf_matrix = []

# for text in df1['SECTION_TEXT'][:2]:  # Consider only the first two documents
#     words = text.split()  # Split text into words
#     word_counts = Counter(word for word in words if word.lower() not in stop_words)
#     total_words = sum(word_counts.values())
    
#     # Initialize document TF vector as a dictionary
#     doc_tf = {}
    
#     # Calculate TF for each term
#     for word, count in word_counts.items():
#         word_id = vocabulary.get(word.lower(), -1)  # Get word ID from vocabulary
#         if word_id != -1:  # If word is in the vocabulary
#             term_frequency = count / total_words  # Calculate TF
#             doc_tf[word_id] = term_frequency  # Store TF in the document TF vector
    
#     # Append document TF vector to TF matrix
#     tf_matrix.append(doc_tf)

# # Iterate over TF matrix to display TF vectors for the first two documents
# for doc_id, doc_tf in enumerate(tf_matrix, start=1):
#     print(f"Document {doc_id}:")
#     for word_id, freq in doc_tf.items():
#         print(f"({word_id}, {freq})")
#     print()
