Preprocessing and Subset Creation 

In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import numpy as np
import math

In [15]:
# Read only the first `num_rows_to_read` rows of the original CSV to build a working subset.
# NOTE(review): this comment previously said "45000 rows approx 100 mb", but the
# value below is 990000 -- confirm which row count / file size is intended.
csv_file_path = 'enwiki-20170820.csv'
num_rows_to_read = 990000
data_subset = pd.read_csv(csv_file_path, nrows=num_rows_to_read)

In [16]:
# List the column labels of the loaded subset, one per line
for col in data_subset.columns:
    print(col)

ARTICLE_ID
TITLE
SECTION_TITLE
SECTION_TEXT


In [17]:
# Discard the title columns; only ARTICLE_ID and SECTION_TEXT are used from here on.
columns_to_drop = ['TITLE', 'SECTION_TITLE']
data_subset = data_subset.drop(columns=columns_to_drop)

In [18]:
# Empty section text is loaded as NaN; blank it out before casting so that
# astype(str) does not inject the literal token "nan" into the article text.
data_subset['SECTION_TEXT'] = data_subset['SECTION_TEXT'].fillna('').astype(str)
# Concatenate all sections of an article into one space-separated document,
# giving one row per ARTICLE_ID.
data_subset = (
    data_subset.groupby('ARTICLE_ID')['SECTION_TEXT']
    .apply(' '.join)
    .reset_index()
)


In [19]:
# Inspect the raw combined text of the row labelled 0
data_subset.loc[0, 'SECTION_TEXT']

'\n\n\n\n\n\n\'\'\'Anarchism\'\'\' is a political philosophy that advocates self-governed societies based on voluntary institutions. These are often described as stateless societies although several authors have defined them more specifically as institutions based on non-hierarchical free associations. Anarchism holds the state to be undesirable unnecessary and harmful.\n\nWhile anti-statism is central anarchism specifically entails opposing authority or hierarchical organisation in the conduct of all human relations including but not limited to the state system.  Anarchism is usually considered an extreme left-wing ideology and much of anarchist economics and anarchist legal philosophy reflects anti-authoritarian interpretations of communism collectivism syndicalism mutualism or participatory economics.\n\nAnarchism does not offer a fixed body of doctrine from a single particular world view instead fluxing and flowing as a philosophy. Many types and traditions of anarchism exist not a

In [20]:
# Function for text preprocessing
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_TEXT_RESOURCES = {}  # filled on the first call, then reused for every row


def _get_text_resources():
    """Return the (stop_words, lemmatizer) pair, building it only once."""
    if not _TEXT_RESOURCES:
        stop_words = set(stopwords.words('english'))
        # Punctuation is stripped from the text *before* stopword filtering, so
        # a contraction such as "don't" reaches the filter as "dont". Add the
        # punctuation-free variants so those tokens are filtered as well.
        stop_words |= {word.translate(_PUNCTUATION_TABLE) for word in stop_words}
        _TEXT_RESOURCES['stop_words'] = stop_words
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalize one document into a space-separated string of lemmas.

    Steps, in order: newlines -> spaces, lowercasing, removal of ASCII
    punctuation (``string.punctuation``), NLTK word tokenization, English
    stopword removal, WordNet lemmatization.

    Note that punctuation is deleted rather than replaced, so hyphenated
    words are merged (e.g. "self-governed" becomes "selfgoverned").

    Args:
        text: The raw document text (a ``str``).

    Returns:
        The preprocessed text as a single string of tokens joined by spaces.
    """
    stop_words, lemmatizer = _get_text_resources()

    # Replace newlines with a space (not an empty string) so the last word of
    # one paragraph is not glued to the first word of the next one.
    text = text.replace('\n', ' ')

    # Lowercasing
    text = text.lower()

    # Removing special characters and punctuation
    text = text.translate(_PUNCTUATION_TABLE)

    # Tokenization
    tokens = word_tokenize(text)

    # Removing stopwords
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    # Joining the lemmatized tokens back into a string
    return ' '.join(lemmatized_tokens)

# Run preprocess_text over every document in the 'SECTION_TEXT' column
data_subset['SECTION_TEXT'] = data_subset['SECTION_TEXT'].map(preprocess_text)


In [21]:
# Display the preprocessed frame (bare last expression -> notebook renders it)
data_subset

Unnamed: 0,ARTICLE_ID,SECTION_TEXT
0,0,anarchism political philosophy advocate selfgo...
1,1,autism neurodevelopmental disorder characteriz...
2,2,percentage diffusely reflected sunlight relati...
3,3,writing cursive form aa named plural aes first...
4,4,alabama state southeastern region united state...
...,...,...
127209,127209,west bend town washington county wisconsin uni...
127210,127210,west bend county seat washington county wiscon...
127211,127211,brookfield town waukesha county wisconsin unit...
127212,127212,brookfield city located waukesha county wiscon...


In [None]:
def split_df_to_csv(df, output_prefix, num_files=8):
    """Write ``df`` as ``num_files`` consecutive CSV chunks.

    Rows are split in order into chunks of ``ceil(len(df) / num_files)`` rows.
    Chunk ``k`` (1-based) is written to ``f"{output_prefix}_{k}.csv"`` without
    the index column. Exactly ``num_files`` files are always written: when
    there are too few rows to fill every chunk, the trailing files contain
    only the header row, so callers that expect every numbered file to exist
    keep working.

    Args:
        df: The DataFrame to split.
        output_prefix: Path prefix for the output files.
        num_files: Number of CSV files to produce (default 8).

    Raises:
        ValueError: If ``num_files`` is less than 1.
    """
    if num_files < 1:
        raise ValueError("num_files must be at least 1")

    total_rows = len(df)
    chunk_size = math.ceil(total_rows / num_files)
    for i in range(num_files):
        # Clamp both bounds so trailing chunks past the end are simply empty.
        start_idx = min(i * chunk_size, total_rows)
        end_idx = min((i + 1) * chunk_size, total_rows)
        chunk_df = df.iloc[start_idx:end_idx]
        chunk_df.to_csv(f"{output_prefix}_{i+1}.csv", index=False)
        
# Prefix for output CSV files; chunks are written as output_data_1.csv, output_data_2.csv, ...
output_prefix = "output_data"  # Prefix for output CSV files

# Write the preprocessed frame out in chunks
split_df_to_csv(data_subset, output_prefix)

In [3]:
# Split one CSV file into two halves on disk
def split_csv_to_half(input_csv, output_prefix):
    """Read ``input_csv`` and write its rows as two CSV files.

    The first ``ceil(n / 2)`` rows go to ``f"{output_prefix}_1.csv"`` and the
    remaining rows to ``f"{output_prefix}_2.csv"``; neither file includes the
    index column.
    """
    frame = pd.read_csv(input_csv)

    # Row position where the second half begins (first half gets the extra
    # row when the row count is odd).
    midpoint = math.ceil(len(frame) / 2)

    halves = (frame.iloc[:midpoint], frame.iloc[midpoint:])
    for i, half in enumerate(halves):
        half.to_csv(f"{output_prefix}_{i+1}.csv", index=False)

# Driver: halve each of the 8 chunk files.
# Expects output_data_1.csv ... output_data_8.csv to already exist on disk.
output_prefix = "output_data"  # Prefix for output CSV files

# Split each of the 8 CSV files into half:
# output_data_{i}.csv -> output_data_{i}_1.csv and output_data_{i}_2.csv
for i in range(1, 9):
    input_csv = f"{output_prefix}_{i}.csv"  # Input CSV file
    split_csv_to_half(input_csv, f"{output_prefix}_{i}")
