## We don't need no __$#!++Y__ data

## User generated text data can be __S#!~#@!!__

### A quick glance into some __common patterns__ where text is written in a S#!~#@ way

- Pattern 1: characters repeated in the string like "brooooo", "cooooool", "damn uuuuuuuuuuuu", etc
- Pattern 2: words with punctuation in between - "f******r"
- Pattern 2b: words with punctuation at the end - "a**"
- Pattern 3: Non-word characters repeated several times in a row
- Pattern 4: words are mixed together with punctuation like "word1/word2"
- Pattern 5: words split into characters with spaces like "F C K"
- Pattern 6: words split into characters with punctuation like "F-C-K"


In [None]:

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer

# Notebook display settings: wider text columns so long comments stay readable,
# and a larger row count for truncated table views.
pd.set_option("display.max_colwidth", 150)
pd.set_option("display.min_rows", 300)

In [None]:
# Read the data.
# The sample is seeded so the 20,000-row subset is identical on every run;
# the cells below look up hard-coded row ids and would otherwise show
# different comments (or none) after each kernel restart.
df_test = pd.read_csv(
    "../input/jigsaw-toxic-comment-classification-challenge/test.csv"
).sample(20000, random_state=42)
print(df_test.shape)

df = pd.read_csv('/kaggle/input/jigsaw-toxic-severity-rating/validation_data.csv')
print(df.shape)

# Combine text comments into one column.
# The lists are concatenated before building the frame, so rows are labelled
# 0..N-1 by position; drop_duplicates() then keeps the first occurrence of each
# comment and leaves gaps in those labels.
data = pd.DataFrame({
    "text": df_test.comment_text.tolist()
            + df.more_toxic.tolist()
            + df.less_toxic.tolist()
}).drop_duplicates()
print(data.shape)


In [None]:
# Fit a TF-IDF vectorizer (vocabulary capped at 5000 features) on the old
# competition's training comments and keep its analyzer as a tokenizer.

toxic_df = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv')
tfidf_ = TfidfVectorizer(max_features=5000)
tfidf_.fit(toxic_df["comment_text"])
print(len(tfidf_.vocabulary_))
tfidf_tokenizer = tfidf_.build_analyzer()


# Load the BERT tokenizer from the local DistilBERT input directory.

bert_tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/distilbertbaseuncased/')

# Pattern: characters repeated in the string like "brooooo", "cooooool", "damn uuuuuuuuuuuu", etc


### Example of how different tokenizers will tokenize badly written strings - Information loss!

In [None]:
# Show two comments (looked up by row label) next to what each tokenizer
# makes of them.
for id_ in (66, 622):
    sample = data["text"].loc[id_]
    print("\n\n\n===== Actual comment =====")
    print(sample)

    # Keep only the TF-IDF tokens that made it into the fitted vocabulary.
    print("\n===== Tfidf tokenized comment =====")
    known_tokens = [token for token in tfidf_tokenizer(sample) if token in tfidf_.vocabulary_]
    print(known_tokens)

    # Encode to ids, then map the ids back to token strings.
    print("\n===== Bert tokenized comment =====")
    bert_ids = bert_tokenizer.encode(sample)
    print(bert_tokenizer.convert_ids_to_tokens(bert_ids))

In [None]:
# Pull out every word in which one letter occurs three or more times in a row.
# Group 0 is the whole word, group 1 the repeated letter. The pattern also
# requires at least one letter on each side of the repeated run.

repeated_letter_regex = r'([A-Za-z]+([A-Za-z])\2{2,}[A-Za-z]+\b)'
tmp = data.text.str.extractall(repeated_letter_regex)

tmp.head(20)

In [None]:
# Frequency of each full match (column 0), most common first
tmp.loc[:, 0].value_counts()

In [None]:

#data.text[data.text.str.contains(tmp[0].value_counts().index[2])].tolist()

### How will these look after cleaning 
#### Convert cases like "saaaaaad" to "sad"

In [None]:
# How will these look after cleaning

# Row labels of the comments that contain the pattern: level 0 of the
# extractall() index is the label of the source row in `data`.
idx = tmp.reset_index()['level_0'].drop_duplicates()
# Save in df
data_with_patterns = pd.DataFrame({"text": data.loc[idx].text.tolist()})
# Clean the pattern: collapse any run of 3+ identical letters to one letter.
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave every comment unchanged.
data_with_patterns['cleaned'] = data_with_patterns.text.str.replace(
    r'([A-Za-z])\1{2,}', r'\1', regex=True
)
data_with_patterns.head(10)

# Pattern - words with punctuation in between - "f******r"

In [None]:
# Letters, then the same punctuation mark (* ! ? ') three or more times,
# then letters again. Group 0 is the whole match, group 1 the punctuation mark.
punct_inside_regex = r'([A-Za-z]{1,}([*!?\'])\2{2,}[A-Za-z]{1,})'
tmp = data.text.str.extractall(punct_inside_regex)
tmp.head(20)


In [None]:
# Most frequent full matches (column 0)
top_matches = tmp[0].value_counts()
top_matches

### How will these look after cleaning 
#### Convert cases like "f***cking" or "f******cking" to "f*cking"

In [None]:

# Row labels of the comments that contain the pattern: level 0 of the
# extractall() index is the label of the source row in `data`.
idx = tmp.reset_index()['level_0'].drop_duplicates()
# Save in df
data_with_patterns = pd.DataFrame({"text": data.loc[idx].text.tolist()})
# Clean the pattern: keep the letters on both sides and a single copy of the
# repeated punctuation mark.
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave every comment unchanged.
data_with_patterns['cleaned'] = data_with_patterns.text.str.replace(
    r'([A-Za-z]{1,})([*!?\'])\2{2,}([A-Za-z]{1,})', r'\1\2\3', regex=True
)

data_with_patterns.head(10)

# Pattern - words with punctuation at the end - "f***"

In [None]:
# One or two letters followed by the same punctuation mark three or more
# times, where the punctuation run ends the word.
# The original pattern closed with \b, which after a punctuation character only
# matches when a word character follows -- so it found "f***" inside "f***ing"
# but never a word that actually ends in punctuation. The lookahead instead
# requires that neither another copy of the mark (blocks backtracking into a
# longer run) nor a word character comes next.
# NOTE(review): the character class also contains ']' (from the escaped "\]"),
# unlike the in-between pattern -- confirm that is intended.
tmp = data.text.str.extractall(r'(\b[A-Za-z]{1,2}([\]*!?\'])\2{2,}(?!\2|\w))')
tmp.head(20)


In [None]:
# Frequency of each full match (column 0), most common first
tmp.loc[:, 0].value_counts()

# Pattern: Non words multiple times sequentially

In [None]:

# Runs of three or more consecutive characters that are neither word
# characters nor spaces.
nonword_run_regex = r'([^\w ]{3,})'
tmp = data.text.str.extractall(nonword_run_regex)
tmp.head(20)


In [None]:
# Most frequent non-word runs
run_counts = tmp[0].value_counts()
run_counts

# Pattern - words are mixed together with punctuation like "word1/word2"

In [None]:

# Two letter-only words glued together by a single / ! ? or . character.
# Group 0 is the whole match; groups 1 and 2 are the left and right words.
joined_words_regex = r'(([a-zA-Z]+)[/!?.]([a-zA-Z]+))'
matches = data.text.str.extractall(joined_words_regex)
tmp = matches.reset_index()
tmp.head()


In [None]:
# Top combinations: pool the left-hand and right-hand words, lowercase them,
# and show the 20 most frequent.
pooled_words = pd.concat([tmp[1], tmp[2]]).str.lower()
pooled_words.value_counts().reset_index().head(20)

# Pattern: words split into characters with spaces like "F C K"


In [None]:

# Three or more single letters in a row, each followed by a space.
# Group 0 is the whole run, group 1 the last "letter + space" repetition.
space_split_regex = r'(\b([a-zA-Z] ){3,})'
matches = data.text.str.extractall(space_split_regex)
tmp = matches.reset_index()
tmp.head(20)


In [None]:
# Most frequent spaced-out runs (column 0)
tmp.loc[:, 0].value_counts()

In [None]:
# Cleaned distribution: lowercase each match and strip spaces, dots and
# hyphens so variants of the same word are counted together.
# regex=True must be explicit: pandas >= 2.0 treats '[ .-]' as a literal
# string by default, so nothing would be stripped.
tmp[0].str.lower().str.replace(r'[ .-]', '', regex=True).value_counts()

# Pattern: words split into characters with punctuation like "N-O-N-S-E-N-S-E"


In [None]:

# Two or more "letter + hyphen/dot" pairs followed by a final single letter.
# Group 0 is the whole match, group 1 the last "letter + separator" repetition.
punct_split_regex = r'(\b([a-zA-Z][-.]){2,}[a-zA-Z]\b)'
matches = data.text.str.extractall(punct_split_regex)
tmp = matches.reset_index()
tmp.head(20)


In [None]:
# Most frequent full matches (column 0)
top_matches = tmp[0].value_counts()
top_matches

In [None]:
# Cleaned distribution: lowercase each match and strip spaces, dots and
# hyphens so variants of the same word are counted together.
# regex=True must be explicit: pandas >= 2.0 treats '[ .-]' as a literal
# string by default, so nothing would be stripped.
tmp[0].str.lower().str.replace(r'[ .-]', '', regex=True).value_counts()

# Pattern - letters followed by a run of non-word characters (\w mixed with \W)

In [None]:

# One or more letters immediately followed by three or more characters that
# are neither word characters nor spaces.
word_then_symbols_regex = r'([a-zA-Z]+[^\w ]{3,})'
tmp = data.text.str.extractall(word_then_symbols_regex)
tmp.head(20)


In [None]:
# Frequency of each full match (column 0), most common first
tmp.loc[:, 0].value_counts()