Preprocessing and Subset Creation 

In [4]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import numpy as np
import math



In [5]:
#Taking 45000 rows approx 100 mb data from the original dataset 
csv_file_path = 'enwiki-20170820.csv'
num_rows_to_read = 100
data_subset = pd.read_csv(csv_file_path, nrows=num_rows_to_read)

In [6]:
#Printing the columns names
for col in data_subset:
    print(col)

ARTICLE_ID
TITLE
SECTION_TITLE
SECTION_TEXT


In [7]:
columns_to_drop = ['TITLE','SECTION_TITLE']
data_subset = data_subset.drop(columns=columns_to_drop, axis=1)

In [8]:
data_subset['SECTION_TEXT'] = data_subset['SECTION_TEXT'].astype(str)
data_subset = data_subset.groupby('ARTICLE_ID')['SECTION_TEXT'].apply(lambda x: ' '.join(x)).reset_index()


In [9]:
data_subset['SECTION_TEXT'][0]

'\n\n\n\n\n\n\'\'\'Anarchism\'\'\' is a political philosophy that advocates self-governed societies based on voluntary institutions. These are often described as stateless societies although several authors have defined them more specifically as institutions based on non-hierarchical free associations. Anarchism holds the state to be undesirable unnecessary and harmful.\n\nWhile anti-statism is central anarchism specifically entails opposing authority or hierarchical organisation in the conduct of all human relations including but not limited to the state system.  Anarchism is usually considered an extreme left-wing ideology and much of anarchist economics and anarchist legal philosophy reflects anti-authoritarian interpretations of communism collectivism syndicalism mutualism or participatory economics.\n\nAnarchism does not offer a fixed body of doctrine from a single particular world view instead fluxing and flowing as a philosophy. Many types and traditions of anarchism exist not a

In [10]:
# Function for text preprocessing
def preprocess_text(text):
    # Remove newline characters
    text = text.replace('\n', '')

    # Lowercasing
    text = text.lower()

    # Removing special characters and punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenization
    tokens = word_tokenize(text)

    # Removing stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    # Joining the lemmatized tokens back into a string
    preprocessed_text = ' '.join(lemmatized_tokens)

    return preprocessed_text

# Apply text preprocessing to the 'Section text' column
data_subset['SECTION_TEXT'] = data_subset['SECTION_TEXT'].apply(preprocess_text)


In [11]:
data_subset

Unnamed: 0,ARTICLE_ID,SECTION_TEXT
0,0,anarchism political philosophy advocate selfgo...
1,1,autism neurodevelopmental disorder characteriz...
2,2,percentage diffusely reflected sunlight relati...
3,3,writing cursive form aa named plural aes first...
4,4,alabama state southeastern region united state...
5,5,achilles nereid cymothoe attic redfigure kanth...
6,6,abraham lincoln february 12 1809 – april 15 18...
7,7,aristotle aristotélēs 384–322 bc ancient greek...


In [12]:
# Create an empty dictionary to store word-to-id mappings
word_to_id = {}

# Create an empty list to store unique words
vocabulary = []

# Iterate over each row of the column
for text in data_subset['SECTION_TEXT']:
    # Split the text into individual words
    words = text.split()
    # Add each word to the vocabulary and assign a unique ID
    for word in words:
        if word not in word_to_id:
            word_to_id[word] = len(word_to_id)
            vocabulary.append(word)

# Print the vocabulary with assigned IDs
print("Vocabulary:")
for word, word_id in word_to_id.items():
    print(f"{word_id}: {word}")

# Print the word-to-id mappings
print("\nWord-to-ID mappings:")
print(word_to_id)

Vocabulary:
0: anarchism
1: political
2: philosophy
3: advocate
4: selfgoverned
5: society
6: based
7: voluntary
8: institution
9: often
10: described
11: stateless
12: although
13: several
14: author
15: defined
16: specifically
17: nonhierarchical
18: free
19: association
20: hold
21: state
22: undesirable
23: unnecessary
24: harmfulwhile
25: antistatism
26: central
27: entail
28: opposing
29: authority
30: hierarchical
31: organisation
32: conduct
33: human
34: relation
35: including
36: limited
37: system
38: usually
39: considered
40: extreme
41: leftwing
42: ideology
43: much
44: anarchist
45: economics
46: legal
47: reflects
48: antiauthoritarian
49: interpretation
50: communism
51: collectivism
52: syndicalism
53: mutualism
54: participatory
55: economicsanarchism
56: offer
57: fixed
58: body
59: doctrine
60: single
61: particular
62: world
63: view
64: instead
65: fluxing
66: flowing
67: many
68: type
69: tradition
70: exist
71: mutually
72: exclusive
73: school
74: thought
75

In [13]:
# Create a list of dictionaries to store the TF values
tf_values_list = []

# Iterate over each row of the DataFrame
for index, row in data_subset.iterrows():
    # Split the text into individual words
    words = row['SECTION_TEXT'].split()
    
    # Count the frequency of each word using the vocabulary
    word_freq = {}
    for word in words:
        if word in vocabulary:
            word_id = word_to_id[word]
            word_freq[word_id] = word_freq.get(word_id, 0) + 1
    
    # Add the TF values to the list
    tf_values = {'ARTICLE_ID': row['ARTICLE_ID']}
    tf_values.update(word_freq)
    tf_values_list.append(tf_values)

# Create DataFrame from the list of dictionaries
tf_df = pd.DataFrame(tf_values_list)

# Fill missing values with 0
tf_df.fillna(0, inplace=True)

# Print the TF DataFrame
tf_df

Unnamed: 0,ARTICLE_ID,0,1,2,3,4,5,6,7,8,...,11023,11024,11025,11026,11027,11028,11029,11030,11031,11032
0,0,102.0,21.0,13.0,8.0,1.0,23.0,11,5.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,1.0,7,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,7.0,0.0,0.0,0.0,1.0,10,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5,0.0,0.0,1.0,0.0,0.0,0.0,2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,6,0.0,30.0,1.0,1.0,0.0,1.0,6,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,7,0.0,5.0,22.0,0.0,0.0,1.0,5,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
