# Create Wiki Data!
## Download the wikidumps from: https://dumps.wikimedia.org/
## We downloaded wiki version: enwiki-20230820-pages-articles-multistream.xml.bz2
## Access the exact wiki files we used via this link: https://drive.google.com/drive/folders/1hwUhcqI0I8k9x5t15XbIaWMKhm7d6NV3

In [None]:
import os
import pandas as pd

# Path to the folder containing wiki files (we converted them to text files)
folder_path = 'path to your wiki downloads.'


# One {'title', 'categories', 'content'} record per parsed article
data = []

# Iterate through the files in sorted order so the output is reproducible
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.txt'):
        continue
    file_path = os.path.join(folder_path, filename)

    # Fields of the article currently being assembled
    title = ""
    categories = ""
    content = ""

    with open(file_path, 'r', encoding='utf-8') as f:
        # Stream line by line rather than loading the whole file into memory
        for line in f:
            line = line.strip()
            if line.startswith("[[") and line.endswith("]]"):
                # A "[[Title]]" line opens a new article: flush the previous one
                if title:
                    data.append({'title': title, 'categories': categories, 'content': content})
                    categories = ""
                    content = ""
                title = line[2:-2]
            elif line.startswith("CATEGORIES:"):
                # Split on the first colon only, so a category name that
                # itself contains a colon is not truncated
                categories = line.split(":", 1)[1].strip()
            elif line.startswith("==") and line.endswith("=="):
                # Section heading lines ("== ... ==") are left out of the text
                continue
            else:
                content += line + " "

    # Append the last article of the file; skip files that yielded nothing
    if title or categories or content.strip():
        data.append({'title': title, 'categories': categories, 'content': content})



In [4]:
# Create a DataFrame. Naming the columns keeps the schema stable even when
# no articles were parsed (an empty list would otherwise yield no columns).
df = pd.DataFrame(data, columns=['title', 'categories', 'content'])
df

In [6]:
# Preprocess the DataFrame
def _has_categories(value):
    """Return True when *value* is a non-empty categories entry."""
    try:
        return len(value) > 0
    except TypeError:
        # Missing values (None / NaN) have no len(); treat them as empty
        return False


def _word_count(text):
    """Return the number of whitespace-separated words in *text* (0 if not a str)."""
    return len(text.split()) if isinstance(text, str) else 0


def preprocess_dataframe(df, min_words=20):
    """Clean the raw article DataFrame.

    Steps, in order:
      1. drop rows that duplicate another row's title, categories and content;
      2. drop rows whose categories entry is empty or missing;
      3. drop rows whose content has fewer than ``min_words`` words;
      4. renumber the index from 0.

    Args:
        df: DataFrame with 'title', 'categories' and 'content' columns.
        min_words: Minimum number of whitespace-separated words a row's
            content must have to be kept (default 20).

    Returns:
        A new, re-indexed DataFrame; the input is not modified.
    """
    # Remove duplicates
    df = df.drop_duplicates(subset=['title', 'categories', 'content'])

    # Remove rows with empty (or missing) categories. The explicit bool cast
    # keeps the mask usable for boolean indexing even on an empty frame.
    df = df[df['categories'].map(_has_categories).astype(bool)]

    # Remove rows whose content is shorter than min_words words
    long_enough = df['content'].map(lambda text: _word_count(text) >= min_words)
    df = df[long_enough.astype(bool)]

    return df.reset_index(drop=True)

# Run the preprocessing step on the raw article DataFrame
preprocessed_df = preprocess_dataframe(df)


In [7]:
preprocessed_df

Unnamed: 0,title,categories,content
0,Anarchism,"Anarchism, Anti-capitalism, Anti-fascism, Econ...",Anarchism is a political philosophy and move...
1,Albedo,"Land surface effects on climate, Climate chang...",Albedo (æ; la) is the fraction of sunlight t...
2,A,"ISO basic Latin letters, Vowel letters","A, or a, is the first letter and the first v..."
3,Alabama,"Alabama, 1819 establishments in the United Sta...","Alabama (,) is a state in the Southeastern r..."
4,Achilles,"Greek mythological heroes, Kings of the Myrmid...","In Greek mythology, Achilles (ə ə) or Achill..."
...,...,...,...
5848885,Atrå,"Tinn, Villages in Vestfold og Telemark",Atrå is a village in Tinn Municipality in Ve...
5848886,Texas's 32nd House of Representatives district,Texas House of Representatives districts,District 32 is a district in the Texas House...
5848887,Artifact (app),"Android (operating system) software, IOS softw...",Artifact is a personalized news app that mak...
5848888,George Fortescue (MP),"1791 births, 1877 deaths, Younger sons of earl...",George Matthew Fortescue (21 May 1791 – 24 J...


In [14]:
# Show the categories entry of the first row titled 'Christianity'
is_christianity = preprocessed_df['title'] == 'Christianity'
preprocessed_df.loc[is_christianity, 'categories'].iloc[0]

'Christianity, 1st-century establishments, 1st-century introductions, Abrahamic religions, Monotheistic religions, Western culture'

In [16]:
# Create a doc id for each document in the DataFrame: 'doc01', 'doc02', ...
# (1-based, zero-padded to at least two digits). The ids are kept in
# `doc_ids` rather than `list` so the builtin `list` is not shadowed.
doc_ids = [f'doc{i:02d}' for i in range(1, len(preprocessed_df) + 1)]

preprocessed_df['docno'] = doc_ids

preprocessed_df.head()

Unnamed: 0,title,categories,content,docno
0,Anarchism,"Anarchism, Anti-capitalism, Anti-fascism, Econ...",Anarchism is a political philosophy and move...,doc01
1,Albedo,"Land surface effects on climate, Climate chang...",Albedo (æ; la) is the fraction of sunlight t...,doc02
2,A,"ISO basic Latin letters, Vowel letters","A, or a, is the first letter and the first v...",doc03
3,Alabama,"Alabama, 1819 establishments in the United Sta...","Alabama (,) is a state in the Southeastern r...",doc04
4,Achilles,"Greek mythological heroes, Kings of the Myrmid...","In Greek mythology, Achilles (ə ə) or Achill...",doc05


In [17]:
# Rename 'content' to 'text' by label rather than by position, so a
# different column order cannot silently mislabel the data
preprocessed_df.rename(columns={'content': 'text'}, inplace=True)

In [18]:
preprocessed_df

Unnamed: 0,title,categories,text,docno
0,Anarchism,"Anarchism, Anti-capitalism, Anti-fascism, Econ...",Anarchism is a political philosophy and move...,doc01
1,Albedo,"Land surface effects on climate, Climate chang...",Albedo (æ; la) is the fraction of sunlight t...,doc02
2,A,"ISO basic Latin letters, Vowel letters","A, or a, is the first letter and the first v...",doc03
3,Alabama,"Alabama, 1819 establishments in the United Sta...","Alabama (,) is a state in the Southeastern r...",doc04
4,Achilles,"Greek mythological heroes, Kings of the Myrmid...","In Greek mythology, Achilles (ə ə) or Achill...",doc05
...,...,...,...,...
5848885,Atrå,"Tinn, Villages in Vestfold og Telemark",Atrå is a village in Tinn Municipality in Ve...,doc5848886
5848886,Texas's 32nd House of Representatives district,Texas House of Representatives districts,District 32 is a district in the Texas House...,doc5848887
5848887,Artifact (app),"Android (operating system) software, IOS softw...",Artifact is a personalized news app that mak...,doc5848888
5848888,George Fortescue (MP),"1791 births, 1877 deaths, Younger sons of earl...",George Matthew Fortescue (21 May 1791 – 24 J...,doc5848889


In [20]:
# Wrap each row's categories string in a one-element list
# (the string itself is kept whole; it is not split on commas)
category_strings = preprocessed_df['categories']
preprocessed_df['categories'] = category_strings.map(lambda value: [value])

In [21]:
# Write the processed DataFrame to CSV without the index column.
# NOTE(review): the output path is an empty-string placeholder — presumably
# it must be replaced with a real file path before running this cell.
preprocessed_df.to_csv("", index=False)

# Next step is to build the index and reformulate queries