In [3]:
import pandas as pd

from data_preparation import _load_dataset, _prepare_data

data_path = "../data/train.csv"

df = _load_dataset(data_path)
df = _prepare_data(df)

In [9]:
import pandas as pd
from model_preparation import set_seed
from data_preparation import _load_dataset, _prepare_data
from preprocessor import Preprocessor


data_path = "../data/train.csv"
model_name = 'microsoft/deberta-base'
batch_size = 16
create_validation_set = False
SEED = 42

set_seed(SEED)



def custom_clean_text(text: str) -> str:

    pipeline = ['hyperlinks', 'mentions', 'hashtags', 'retweet', 'repetitions', 'emojis', 'smileys', 'spaces']

    preprocessor = Preprocessor(pipeline)

    return preprocessor(text)


df = _load_dataset(data_path)

df = df.iloc[:100]

final_df = df["tweet"].copy()

df = _prepare_data(df, custom_clean_text)

df = pd.concat([final_df, df[["text","label"]]], axis = 1)

pd.set_option('display.max_colwidth', None)
# df.head(10)

In [10]:
from tqdm.auto import tqdm
import pkg_resources
from symspellpy import SymSpell

def correct_spell(df: pd.DataFrame, attribute='text'):
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    # term_index is the column of the term and count_index is the
    # column of the term frequency
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
    sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

    def check_spell(input_term):
        # lookup suggestions for multi-word input strings (supports compound
        # splitting & merging)
        # max edit distance per lookup (per single word, not per whole input string)
        suggestions = sym_spell.lookup_compound(input_term, max_edit_distance=2)
        # display suggestion term, edit distance, and term frequency
        progress_bar.update(1)
        return suggestions[0]._term

    progress_bar = tqdm(range(len(df[attribute])))
    df[attribute+"_spell"] = df[attribute].map(check_spell)

    return df

df = correct_spell(df = df, attribute = 'text')
df.head(20)

100%|██████████| 100/100 [00:00<00:00, 252.48it/s]


Unnamed: 0,tweet,text,label,text_spell
0,Puff puff pass ya hoe,Puff puff pass ya hoe,1,puff puff pass a hoe
1,@jthizzle when was this,mention when was this,0,mention when was this
2,TGIM (Thank God Its Monday) baby! 😀😀😀,TGIM (Thank God Its Monday) baby! emoji,1,tim thank god its monday baby emo i
3,Peace Love Shea http://t.co/SVG2yzx8L0,Peace Love Shea url,0,peace love shea urls
4,Same. https://t.co/GmP7Kz35yK,Same. url,0,same urls
5,"""OTAKU: Loving someone who doesn't exist.\nKPOP fans: Loving someone who doesn't know you exist.""","""OTAKU: Loving someone who doesn't exist. KPOP fans: Loving someone who doesn't know you exist.""",1,take loving someone who doesn't exist pop fans loving someone who doesn't know you exist
6,@ThatsMy_SON lol shutup but hmj,mention lol shutup but hmj,1,mention low shut up but hmm
7,RT @brightongigs: TODAY: KYLMA SOTA (Fin) / HATE FUCK / FEROXITY in Cowley Club http://t.co/ZfuF57jVWW,retweet mention : TODAY: KYLMA SOTA (Fin) / HATE FUCK / FEROXITY in Cowley Club url,1,re tweet mention today karma soma fin hate fuck ferocity in cowley club urls
8,RT @Ashton5SOS: I like this photo of us :) http://t.co/vF9nWS9NAx,retweet mention : I like this photo of us smiley url,1,re tweet mention i like this photo of us smiley urls
9,Can't find shit on TV.,Can't find shit on TV.,1,can't find shit on to


 52%|█████▏    | 15375/29543 [01:12<00:52, 271.02it/s]

### Stopwords with nltk (but we can use spacy) + punctiation (this is not needed I think)

In [5]:
import nltk
nltk.download('stopwords', download_dir='./')

[nltk_data] Downloading package stopwords to ./...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
from nltk.corpus import stopwords
import nltk
import string

nltk.data.path.append("./")
stopwords = stopwords.words('english')

def clean_puntuation_stopwords(text):
  text = ''.join([word for word in text if word not in string.punctuation])
  text = text.lower()
  text = ' '.join([word for word in text.split() if word not in stopwords])
  return text

df['text'] = df['text'].apply(clean_puntuation_stopwords)
df.head(20)

Unnamed: 0,text,label,text_spell
0,puff puff pass ya hoe,1,puff puff pass a hoe
1,,0,when was this
2,tgim thank god monday baby 😀😀😀,1,tim thank god its monday baby
3,peace love shea,0,peace love shea
4,,0,same
5,otaku loving someone doesnt exist kpop fans lo...,1,take loving someone who doesn't exist pop fans...
6,lol shutup hmj,1,low shut up but hmm
7,rt today kylma sota fin hate fuck feroxity cow...,1,it today karma soma fin hate fuck ferocity in ...
8,rt like photo us,1,it i like this photo of us
9,cant find shit tv,1,can't find shit on to
