In [None]:
import pandas as pd

from model_preparation import set_seed
from data_preparation import _load_dataset, _prepare_data


# Notebook configuration. Only data_path and SEED are read in this cell;
# model_name, batch_size and create_validation_set are defined but not used here.
data_path = "../data/train.csv"
model_name = 'microsoft/deberta-base'
batch_size = 16
create_validation_set = False
SEED = 42

# Project helper from model_preparation; presumably seeds the RNGs used
# downstream -- TODO confirm which libraries it covers.
set_seed(SEED)


# Load the dataset with the project loader. The result must expose a "tweet"
# column, which is read a few lines below.
df = _load_dataset(data_path)

# Keep only the first 100 rows (by position) so the preview stays small.
df = df.iloc[:100]

# Copy of the raw "tweet" column, kept so it can be displayed next to the
# processed text after preprocessing.
final_df = df["tweet"].copy()

# Step names handed to _prepare_data; presumably applied in this order --
# TODO confirm against data_preparation.
pipeline = ['hyperlinks', 'mentions', 'hashtags', 'retweet', 'repetitions', 'emojis', 'smileys', 'spaces']
df = _prepare_data(df, pipeline)

# Column-wise concat aligns rows on the index, so this assumes _prepare_data
# keeps the original index -- TODO confirm. Only "text" and "label" are kept
# from the processed frame.
df = pd.concat([final_df, df[["text","label"]]], axis = 1)

# These display options are global for the whole kernel session: show full
# cell contents and never truncate the number of displayed rows.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
df.head(100)

# Spell-check a DataFrame column

In [None]:
from tqdm.auto import tqdm
import pkg_resources
from symspellpy import SymSpell

def correct_spell(df: pd.DataFrame, attribute='text'):
    """Add a spell-corrected version of a text column using SymSpell.

    Loads the English unigram and bigram frequency dictionaries bundled with
    ``symspellpy``, runs ``lookup_compound`` on every value of
    ``df[attribute]`` and stores the top suggestion in a new column named
    ``attribute + "_spell"``.

    Args:
        df: Frame holding the text to correct. It is modified in place: the
            new column is added to this very object.
        attribute: Name of the column whose values are spell-checked.

    Returns:
        The same ``df`` object, now carrying the ``<attribute>_spell`` column.

    Raises:
        FileNotFoundError: If one of the bundled dictionary files could not be
            loaded (the loaders report this by returning ``False``).
    """
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")

    # term_index is the column of the term and count_index is the column of
    # the term frequency. Both loaders return False when the file is missing;
    # fail loudly instead of silently producing an uncorrected column.
    if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
        raise FileNotFoundError(f"SymSpell dictionary not found: {dictionary_path}")
    if not sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2):
        raise FileNotFoundError(f"SymSpell bigram dictionary not found: {bigram_path}")

    def check_spell(input_term):
        # lookup_compound handles multi-word input (compound splitting and
        # merging); max_edit_distance applies per word, not per whole string.
        suggestions = sym_spell.lookup_compound(input_term, max_edit_distance=2)
        progress_bar.update(1)
        # Use the public `term` property rather than the private `_term` field.
        return suggestions[0].term

    # The context manager guarantees the bar is closed, even if a lookup raises.
    with tqdm(total=len(df[attribute])) as progress_bar:
        df[attribute + "_spell"] = df[attribute].map(check_spell)

    return df

# Adds a "text_spell" column holding the spell-corrected "text" values, then
# previews the first 20 rows. correct_spell writes the column onto the frame
# it receives and returns that same frame.
df = correct_spell(df = df, attribute = 'text')
df.head(20)

# Stopwords

### Stopword removal with NLTK (spaCy would also work) + punctuation removal (probably not needed)

In [None]:
import nltk
# Fetch the NLTK "stopwords" resource into the current working directory
# ('./') instead of NLTK's default data location.
nltk.download('stopwords', download_dir='./')

In [None]:
from nltk.corpus import stopwords
import nltk
import string

# Add the current directory to NLTK's data search path so resources stored
# under "./" can be found.
nltk.data.path.append("./")
# NOTE: this rebinds `stopwords` from the object imported from nltk.corpus to
# the value returned by .words('english'); after this line the corpus object
# is no longer reachable under that name.
stopwords = stopwords.words('english')

def clean_puntuation_stopwords(text):
    """Strip punctuation, lowercase, and drop stopwords from ``text``.

    Every character listed in ``string.punctuation`` is removed, the rest is
    lowercased and split on whitespace, and tokens found in the module-level
    ``stopwords`` collection are discarded. The surviving tokens are joined
    back together with single spaces.
    """
    # Deleting characters via a translation table removes exactly the
    # characters in string.punctuation and nothing else.
    punctuation_free = text.translate(str.maketrans('', '', string.punctuation))
    tokens = punctuation_free.lower().split()
    kept_tokens = (token for token in tokens if token not in stopwords)
    return ' '.join(kept_tokens)

# Overwrites the "text" column with its cleaned version (punctuation and
# stopwords removed, lowercased); the previous contents of that column are
# not kept. Then previews the first 20 rows.
df['text'] = df['text'].apply(clean_puntuation_stopwords)
df.head(20)