In [39]:
import pandas as pd

# Load the raw problem table; the first CSV column (index_col=0) becomes the
# DataFrame index rather than a regular column.
df = pd.read_csv('./codeforce_raw_data.csv', index_col=0, encoding='utf8')
# Show the first rows as a quick sanity check of the load.
df.head()

Unnamed: 0,time_limit,memory_limit,input_file,output_file,description,tags,points,rating
1846/F,1 second,256 megabytes,standard,standard,This is an interactive task.Rudolph is a scien...,"['constructive algorithms', 'implementation', ...",,
1847/D,2 seconds,256 megabytes,standard,standard,Josuke is tired of his peaceful life in Morioh...,"['data structures', 'dsu', 'greedy', 'implemen...",2000.0,
1846/E2,2 seconds,256 megabytes,standard,standard,This is the hard version of the problem. The o...,"['binary search', 'brute force', 'data structu...",,
1846/E1,2 seconds,256 megabytes,standard,standard,This is a simple version of the problem. The o...,"['brute force', 'implementation', 'math']",,
1846/C,1 second,256 megabytes,standard,standard,Rudolf has registered for a programming compet...,"['constructive algorithms', 'greedy', 'impleme...",,


In [40]:
#!pip install nltk

In [41]:
import re

import nltk
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources this notebook needs (a no-op when already present):
#   punkt     -> sent_tokenize / word_tokenize
#   stopwords -> nltk.corpus.stopwords used by remove_stopwords
#   wordnet   -> WordNetLemmatizer used by lemmatization; without it the
#                lemmatizer raises LookupError on a fresh environment
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\juntae\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\juntae\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [42]:
# This is an interactive task.Rudolph is a scientist who studies alien life forms.
# -> This is an interactive task. Rudolph is a scientist who studies alien life forms.
def processing_dot_capitalize(sentences):
    """Insert a missing space between a period and a following uppercase letter.

    Example: ``"task.Rudolph"`` becomes ``"task. Rudolph"``. Periods followed
    by anything other than an uppercase character (digits, lowercase letters,
    whitespace) are left alone, so ``"1.5"`` and ``"a.b"`` are unchanged.

    Args:
        sentences: The text to repair; it is scanned character by character.

    Returns:
        The text with a single space added after each ``'.'`` that is
        immediately followed by an uppercase character. An empty input yields
        an empty string.
    """
    pieces = []
    prev = ''  # no previous character yet; also makes empty input safe
    for ch in sentences:
        if prev == '.' and ch.isupper():
            pieces.append(' ')
        pieces.append(ch)
        prev = ch
    # Join once at the end instead of growing a string inside the loop.
    return ''.join(pieces)

# ( $$$ 1 \\le t_{i, j} \\le 10^6 $$$ )
# -> ( $$$ 1 \\le t_{i, j} \\le 1000000 $$$ )
def replace_exponent_notation(text):
    """Expand powers of ten written as ``10^k`` or ``10^{k}`` into plain integers.

    Examples: ``"10^6"`` -> ``"1000000"``, ``"10^{9}"`` -> ``"1000000000"``.

    The base must be exactly ``10``: a ``10`` that is the tail of a longer
    number (``110^2``) is not touched, because rewriting it would produce a
    wrong value. Negative or non-numeric exponents are left unchanged.

    Args:
        text: The text to process.

    Returns:
        ``text`` with every matched power of ten replaced by its decimal value.
    """
    def expand(match):
        # Exactly one of the two groups is set: braced form or bare digits.
        exponent = int(match.group(1) or match.group(2))
        return str(10 ** exponent)

    # (?<!\d) stops the match from starting in the middle of a longer number.
    return re.sub(r'(?<!\d)10\^(?:\{(\d+)\}|(\d+))', expand, text)

# there is a room in front of rudolph with $$$n$$$ different objects scattered around.
# -> there is a room in front of rudolph with $$$ n $$$ different objects scattered around.
def add_spacing_between_dollar_signs(text):
    """Pad ``$$$`` delimiters with a space wherever they touch other text.

    A space is inserted at each position where ``$$$`` is directly followed
    by a non-whitespace character, or directly preceded by one, e.g.
    ``"$$$n$$$"`` becomes ``"$$$ n $$$"``.

    Args:
        text: The text to process.

    Returns:
        The text with the extra spaces inserted.
    """
    # Both alternatives are zero-width, so nothing is removed from the text.
    delimiter_boundary = re.compile(r'(?<=\$\$\$)(?=\S)|(?<=\S)(?=\$\$\$)')
    return delimiter_boundary.sub(' ', text)

# This is an interactive task. 
# -> this is an interactive task.
def convert_to_lowercase(text):
    """Return ``text`` with its cased characters converted to lowercase."""
    lowered = text.lower()
    return lowered

def is_number(string):
    """Return True when ``string`` is a non-empty run of ASCII digits 0-9.

    ``str.isdigit()`` on its own also accepts characters such as superscript
    digits, which cannot be converted with ``int()``; the ASCII check rules
    those out.
    """
    return string.isascii() and string.isdigit()

# 2 \\cdot 100000
# -> 200000
def calculate_multiplication(text):
    r"""Replace ``<int> \cdot <int>`` word triples with their product.

    The text is split on whitespace and scanned left to right. Whenever three
    consecutive words are a number, the literal word ``\cdot`` and another
    number, they are replaced by the product; every other word is copied
    through. The consumed words are not revisited, so ``2 \cdot 3 \cdot 4``
    becomes ``6 \cdot 4``.

    Args:
        text: The text to process.

    Returns:
        The processed words joined by single spaces (original whitespace
        runs are therefore collapsed).
    """
    words = text.split()
    result = []
    i = 0
    while i < len(words):
        is_product = (
            i + 2 < len(words)
            and words[i + 1] == '\\cdot'
            and is_number(words[i])
            and is_number(words[i + 2])
        )
        if is_product:
            # int() instead of eval(): no code execution, and operands with
            # leading zeros ("007") no longer raise SyntaxError.
            result.append(str(int(words[i]) * int(words[i + 2])))
            i += 3  # skip the operator and the right-hand operand
        else:
            result.append(words[i])
            i += 1
    return ' '.join(result)

In [43]:
def preprocessing(text):
    """Run the character-level clean-up steps on a raw description.

    The steps are applied in this order: fix missing spaces after periods,
    lowercase, pad ``$$$`` delimiters, expand ``10^k`` and fold simple
    ``a \\cdot b`` products.

    Args:
        text: The raw description text.

    Returns:
        The cleaned text.
    """
    # Order matters: processing_dot_capitalize looks for uppercase letters,
    # so it has to run before convert_to_lowercase.
    pipeline = (
        processing_dot_capitalize,
        convert_to_lowercase,
        add_spacing_between_dollar_signs,
        replace_exponent_notation,
        calculate_multiplication,  # optional
    )
    for step in pipeline:
        text = step(text)
    return text

In [44]:
def split_sentences(sentences):
    """Split a block of text into sentences with NLTK's ``sent_tokenize``."""
    sentence_list = sent_tokenize(sentences)
    return sentence_list

def split_words(sentence):
    """Split one sentence into tokens with NLTK's ``word_tokenize``."""
    token_list = word_tokenize(sentence)
    return token_list

In [45]:
def lemmatization(tokens):
    """Lemmatize every token as a verb using NLTK's WordNet lemmatizer.

    Args:
        tokens: An iterable of word strings.

    Returns:
        A new list with each token replaced by its verb lemma
        (``pos='v'``), in the original order.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in tokens:
        # 'v' asks the lemmatizer to treat the token as a verb.
        lemmas.append(lemmatizer.lemmatize(token, 'v'))
    return lemmas

def remove_stopwords(tokens):
    """Drop English stop words from a token list.

    Single-character stop words (those with ``len(word) <= 1``) are removed
    from the stop-word list first, so one-letter tokens are always kept.

    Args:
        tokens: An iterable of word strings.

    Returns:
        A new list with the tokens that are not in the stop-word list, in
        their original order.
    """
    # A set gives constant-time membership tests; the original list was
    # scanned linearly for every token.
    stopword_set = {
        word
        for word in nltk.corpus.stopwords.words('english')
        if len(word) > 1
    }
    return [token for token in tokens if token not in stopword_set]


In [46]:
def get_preprocessing_sentence(tokens):
    """Turn one sentence's tokens into a cleaned, space-joined string.

    Stop words are removed, the remaining tokens are lemmatized, and the
    result is joined with single spaces. Any ``'$ $ $'`` sequence in the
    joined string is collapsed back into ``'$$$'``.

    Args:
        tokens: The tokens of one sentence.

    Returns:
        The cleaned sentence as a single string.
    """
    kept = lemmatization(remove_stopwords(tokens))
    joined = ' '.join(kept)
    return joined.replace('$ $ $', '$$$')

In [47]:
def get_preprocessed_sentence(sentences):
    """Preprocess a description and return its cleaned sentences.

    The text goes through ``preprocessing``, is split into sentences, and each
    sentence is tokenized and cleaned with ``get_preprocessing_sentence``.
    A trailing stand-alone period token is dropped and ``' , '`` is replaced
    by a single space.

    Args:
        sentences: The raw description text.

    Returns:
        A list with one cleaned string per detected sentence. A sentence that
        is empty after cleaning contributes an empty string.
    """
    new_sentences = []
    for sentence in split_sentences(preprocessing(sentences)):
        cleaned = get_preprocessing_sentence(split_words(sentence))
        if cleaned == '.':
            # The sentence was reduced to its final period only.
            cleaned = ''
        elif cleaned.endswith(' .'):
            # Drop the separate period token and the space before it. Checking
            # for ' .' (not just '.') avoids cutting a character off a last
            # token that itself ends with a period, and endswith() is safe
            # when the cleaned sentence is empty.
            cleaned = cleaned[:-2]
        new_sentences.append(cleaned.replace(' , ', ' '))
    return new_sentences

In [48]:
# Discard rows with a missing 'description' and rebind df to the filtered frame.
df = df.dropna(subset=['description'])

In [49]:
from tqdm import tqdm

# Preprocess every description; each entry of new_description is the list of
# cleaned sentences for one row, in row order. tqdm shows a progress bar.
new_description = [
    get_preprocessed_sentence(raw_text)
    for raw_text in tqdm(df['description'].values)
]

100%|██████████████████████████████████████████████████████████████████████████████| 7968/7968 [01:37<00:00, 81.79it/s]


In [None]:
# Overwrite the raw text column with the per-row lists of cleaned sentences.
df['description'] = new_description

In [None]:
# Collapse each row's list of cleaned sentences into one space-separated string.
# ' '.join is applied directly because the list_to_string helper is only
# defined in a later cell, so calling it here raises NameError on a fresh
# top-to-bottom run.
df['description'] = df['description'].apply(' '.join)
df.head()

In [50]:
def dollar_processing(arr):
    """Merge every run of three consecutive ``'$'`` tokens into one ``'$$$'``.

    The sequence is scanned left to right; a matched triple is consumed as a
    whole, so four ``'$'`` in a row become ``['$$$', '$']``.

    Args:
        arr: A sequence of tokens.

    Returns:
        A new list with the merged tokens; all other tokens are copied as-is.
    """
    triple = ['$', '$', '$']
    merged = []
    total = len(arr)
    pos = 0
    while pos < total:
        # The slice is shorter than three items near the end, so it cannot
        # equal `triple` there and no explicit bounds check is needed.
        if list(arr[pos:pos + 3]) == triple:
            merged.append('$$$')
            pos += 3
        else:
            merged.append(arr[pos])
            pos += 1
    return merged

In [51]:
from collections import Counter, defaultdict
from torchtext.data.utils import get_tokenizer

def tokenizing_sentences(sentences):
    """Tokenize each sentence with torchtext's spaCy tokenizer.

    After tokenizing, runs of three ``'$'`` tokens are merged back into
    ``'$$$'`` via ``dollar_processing``.

    Args:
        sentences: An iterable of strings.

    Returns:
        A list holding one token list per input string, in input order.
    """
    spacy_tokenizer = get_tokenizer("spacy")
    return [dollar_processing(spacy_tokenizer(text)) for text in sentences]

In [52]:
# Function to convert a list to a string
def list_to_string(lst):
    """Join the items of ``lst`` into one string separated by single spaces."""
    separator = ' '
    return separator.join(lst)

In [55]:
# Tokenize every description string; the column now holds one token list per row.
df['description'] = tokenizing_sentences(df['description'])



In [56]:
# Join each row's token list back into a single space-separated string.
df['description'] = df['description'].apply(list_to_string)
# Preview the processed descriptions.
df.head()

Unnamed: 0,time_limit,memory_limit,input_file,output_file,description,tags,points,rating
1846/F,1 second,256 megabytes,standard,standard,interactive task rudolph a scientist study ali...,"['constructive algorithms', 'implementation', ...",,
1847/D,2 seconds,256 megabytes,standard,standard,josuke tire peaceful life morioh follow nephew...,"['data structures', 'dsu', 'greedy', 'implemen...",2000.0,
1846/E2,2 seconds,256 megabytes,standard,standard,hard version problem difference version $$$ n ...,"['binary search', 'brute force', 'data structu...",,
1846/E1,2 seconds,256 megabytes,standard,standard,a simple version problem difference version $$...,"['brute force', 'implementation', 'math']",,
1846/C,1 second,256 megabytes,standard,standard,rudolf register a program competition follow r...,"['constructive algorithms', 'greedy', 'impleme...",,


In [57]:
# Write the processed frame to a CSV file in the current working directory,
# using to_csv's default options.
df.to_csv('codeforce_processed_data.csv')