### Jigsaw pre-processing
The process of converting data to something a computer can understand is referred to as pre-processing. One of the major forms of pre-processing is to filter out useless data.

- visualized the data: https://www.kaggle.com/andrej0marinchenko/jigsaw-data-vizualization-for-beginnersv
- analyzed the result: https://www.kaggle.com/c/jigsaw-toxic-severity-rating/discussion/294164
- concluded that the text has to be pre-processed before toxicity analysis

In [None]:
import pandas as pd  # data analysis library
import numpy as np  # comprehensive mathematical functions, random number generators, linear algebra routines, Fourier transforms, and more

#nlp
import string  # working with string constants
import re  # regular expressions
import nltk  # Natural Language Toolkit
# NLTK is a leading platform for building Python programs to work with human language data. It provides easy-to-use interfaces to over 50 corpora and lexical resources such as WordNet, 
# along with a suite of text processing libraries for classification, tokenization, stemming, tagging, parsing, and semantic reasoning, wrappers for industrial-strength NLP libraries, 
# and an active discussion forum.
from nltk.corpus import stopwords  # In natural language processing, useless words (data), are referred to as stop words. 


# import matplotlib.pyplot as plt  # provides an implicit way of plotting
# import seaborn as sns  # for visualization
# from tqdm import tqdm  # progressbar decorator for iterators
# import os  # for operating system

# from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator  # word cloud building library

# import warnings  # error processing
# warnings.filterwarnings("ignore")

# from collections import defaultdict  # if the key is not found in the method, then a new entry is created instead of KeyError. The type of this new entry is specified by the defaultdict argument.

# from itertools import cycle  # contains some inbuilt functions for generating sequences using iterators
# plt.style.use('ggplot')
# color_pal = plt.rcParams['axes.prop_cycle'].by_key()['color']
# color_cycle = cycle(plt.rcParams['axes.prop_cycle'].by_key()['color'])


# Show every column, every row and the full cell text when a DataFrame is displayed.
# Fully-qualified option names are used on purpose: pandas resolves abbreviated names
# by regex search, and a short name such as 'max_rows' raises OptionError as soon as
# more than one registered option contains it (e.g. 'styler.render.max_rows').
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

In [None]:
# Look at the data names and size.
# IPython shell escape ("!"): runs `ls` on the competition input directory.
# Flags: -F append a type indicator, -l long listing, -a include dot-files,
# -s show allocated size, -h human-readable sizes.
!ls -Flash --color ../input/jigsaw-toxic-severity-rating/

In [None]:
# Load the validation set into `val` and report its (rows, columns) shape.
val = pd.read_csv(
    filepath_or_buffer='../input/jigsaw-toxic-severity-rating/validation_data.csv',
)
print(f'Validation Data csv is of shape: {val.shape}')

In [None]:
# Preview the first ten rows of the raw validation data.
val.head(n=10)

### Clean Text

-  `\d` - matches any single digit; equivalent to `[0-9]`;
-  `\D` - matches any character that is not a digit; equivalent to `[^0-9]`;
-  `\w` - matches any letter, digit, or underscore;
-  `\W` - matches any character that is not a letter, digit, or underscore;
-  `\s` - matches any whitespace character;
-  `\S` - matches any non-whitespace character.

-  "."    Any single character except the newline \n.
-  "?"    0 or 1 occurrence of the pattern to the left
-  "+"    1 or more occurrences of the pattern on the left
-  "*"    0 or more occurrences of the pattern on the left
-  "\w"   Any letter, digit or underscore (\W - everything except a letter, digit or underscore)
-  "\d"   Any digit [0-9] (\D - everything except a digit)
-  "\s"   Any whitespace character (\S - any non-whitespace character)
-  "\b"   Word boundary
-  "[..]" One of the characters in brackets ([^ ..] - any character except those in brackets)
-  "\"    Escapes special characters (\. stands for a literal period, \+ for a literal plus sign)
-  "^ and $"      Beginning and end of line respectively
-  "{n,m}"        n to m occurrences ({,m} - 0 to m)
-  "a | b"        Matches a or b
-  "()"           Groups the expression and returns the found text
-  "\t, \n, \r"    Tab, newline, and carriage return characters respectively

In [None]:
import nltk
from nltk.corpus import stopwords
# stop = stopwords.words('english')


# ---------------------------------------------------------------------------
# Rule tables used by clean_text. They are built once at definition time so the
# per-comment work is only the substitutions themselves.
# ---------------------------------------------------------------------------

# "http://" or "https://" followed by non-whitespace, OR "www." followed by non-whitespace.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
# An HTML-like tag: "<", one or more characters that are not "<", then ">".
_TAG_RE = re.compile(r'<[^<]+?>')
_WHITESPACE_RE = re.compile(r'\s+')
_NON_WORD_RE = re.compile(r'\W+')
# "!" used as a stand-in for "i" inside a word (e.g. "b!tch").
_INWORD_BANG_RE = re.compile(r'(?<=[a-z])!(?=[a-z])')
# The same word repeated immediately one or more times ("very very very").
_REPEATED_WORD_RE = re.compile(r'\b([^\W\d_]+)(\s+\1)+\b', flags=re.I)
# A character repeated two or more times in a row.
_REPEATED_CHAR_RE = re.compile(r'(.)\1+')
# A multi-word phrase (words separated by one or two arbitrary characters) that
# occurs again later in the text; everything up to and including the repeat is
# replaced by the first occurrence. Case is ignored.
_REPEATED_PHRASE_RE = re.compile(r'((\b\w+\b.{1,2}\w+\b)+).+\1', flags=re.I)
# Symbols deleted near the end of the pipeline. Because every non-word character
# has already been turned into a space by then, "_" is the only member of this
# class that can still be present.
_LEFTOVER_SYMBOL_RE = re.compile("[:|♣|'|§|♠|*|/|?|=|%|&|-|#|•|~|^|>|<|►|_]")

# Single-character "leet" substitutions. Digits are mapped to look-alike letters
# instead of being deleted, because obfuscated swear words are often written with
# them ('4r5e', '5h1t', 'a55', 'c0ck', 'd1ck', 'l3i+ch', ...). "8" is left as is.
_LEET_TABLE = str.maketrans({
    "$": "s",
    "@": "a",
    "0": "o",
    "1": "i",
    "2": "l",
    "3": "e",
    "4": "a",
    "5": "s",
    "6": "g",
    "7": "t",
    "9": "g",
    "+": "t",
    "|": "i",
})

# Literal (non-regex) substring rewrites, applied strictly in this order.
# Most entries pad a keyword with spaces so that glued-together words are split.
_LITERAL_REWRITES = (
    ("f ck", " fuck "),
    ("i88", " it is ok "),
    ("i8", " ok "),
    ("#ofc", " of fuckin course "),
    ("fggt", " faggot "),
    ("your", " your "),
    ("self", " self "),
    ("cuntbag", " cunt bag "),
    ("fartchina", " fart china "),
    ("youi", " you i "),
    ("cunti", " cunt i "),
    ("sucki", " suck i "),
    ("pagedelete", " page delete "),
    ("cuntsi", " cuntsi "),
    ("i'm", " i am "),
    ("offuck", " of fuck "),
    ("centraliststupid", " central ist stupid "),
    ("hitleri", " hitler i "),
    ("i've", " i have "),
    ("i'll", " i will "),
    ("fuck", " fuck "),
    ("f u c k", " fuck "),
    ("shit", " shit "),
    ("bunksteve", " bunk steve "),
    ("wikipedia", " social medium "),
    ("faggot", " faggot "),
    ("delanoy", " delanoy "),
    ("jewish", " jewish "),
    ("sexsex", " sex "),
    ("allii", " all ii "),
    ("i'd", " i had "),
    # Must come before the generic "'s" rule, which would otherwise turn
    # "'scuse" into " is cuse".
    ("'scuse", " excuse "),
    ("'s", " is "),
    ("youbollocks", " you bollocks "),
    ("dick", " dick "),
    ("mothjer", " mother "),
    ("cuntfranks", " cunt "),
    ("ullmann", " jewish "),
    ("mr.", " mister "),
    ("aidsaids", " aids "),
    ("njgw", " nigger "),
    ("wiki", " social medium "),
    ("administrator", " admin "),
    ("gamaliel", " jewish "),
    ("rvv", " vandalism "),
    ("admins", " admin "),
    ("pensnsnniensnsn", " penis "),
    ("pneis", " penis "),
    ("pennnis", " penis "),
    ("pov.", " point of view "),
    ("vandalising", " vandalism "),
    ("cock", " dick "),
    ("asshole", " asshole "),
    ("afd", " all fucking day "),
    ("sockpuppets", " sockpuppetry "),
    ("iiprick", " iprick "),
    ("penisi", " penis "),
    ("warrior", " warrior "),
    ("loil", " laughing out insanely loud "),
    ("vandalise", " vandalism "),
    ("helli", " helli "),
    ("lunchablesi", " lunchablesi "),
    ("special", " special "),
    ("ilol", " i lol "),
)

# Regex rewrites for slang and contractions, applied in order after the literal
# table. "can't" has to precede the generic "n't" rule.
_CONTRACTION_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r'\b[uU]\b', 'you'),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
    )
)

# Compiled stop-word regex; filled in lazily by _stopword_pattern().
_STOPWORD_RE = None


def _stopword_pattern():
    """Return the compiled English stop-word regex, building it on first use."""
    global _STOPWORD_RE
    if _STOPWORD_RE is None:
        # Reading the corpus and compiling the alternation once, instead of on
        # every call, matters because clean_text is applied to every row.
        alternatives = '|'.join(re.escape(word) for word in stopwords.words('english'))
        _STOPWORD_RE = re.compile(r'\b(?:' + alternatives + r')\b\s*')
    return _STOPWORD_RE


def clean_text(text: str) -> str:
    """Normalise one raw comment for toxicity analysis.

    The steps run in this order:

    1. lower-case; drop newlines, non-breaking spaces, "&lt"/"&gt" and backslashes;
    2. replace URLs with the token " social medium ";
    3. undo character obfuscation (digits and symbols standing in for letters);
    4. strip HTML-like tags and collapse whitespace;
    5. apply the literal rewrite table, then the contraction rules;
    6. turn every non-word character into a space and collapse repeated words,
       repeated characters (down to two) and repeated phrases;
    7. remove English stop words.

    Args:
        text: The raw comment.

    Returns:
        The cleaned comment.
    """
    text = text.lower()

    # --- remove unnecessary characters -------------------------------------
    text = text.replace('\n', ' ')
    text = text.replace('\xa0', ' ')
    text = text.replace('&lt', '')
    text = text.replace('&gt', '')
    text = text.replace("\\", "")

    # --- links --------------------------------------------------------------
    # This has to be a regex substitution: str.replace() treats its argument as
    # a literal string and would never match a pattern.
    text = _URL_RE.sub(' social medium ', text)
    # Leftover fragments that the URL pattern does not cover (e.g. a bare
    # scheme followed by whitespace, or a host name without scheme).
    for fragment in ("http://", "www.", "https://", "wikipedia.org"):
        text = text.replace(fragment, "")

    # --- de-obfuscation -----------------------------------------------------
    text = text.translate(_LEET_TABLE)
    # "!" between two letters stands for "i" (b!tch). This must happen before
    # the remaining "!" are padded with spaces, otherwise the word is split.
    # NOTE(review): a missing space after a real exclamation mark ("hi!there")
    # is also joined by this rule.
    text = _INWORD_BANG_RE.sub('i', text)
    # Any other "!" / "?" is separated from its neighbours; it is discarded
    # later together with all remaining punctuation.
    text = text.replace("!", " ! ")
    text = text.replace("?", " ? ")

    # --- tags and whitespace --------------------------------------------------
    text = _TAG_RE.sub('', text)
    text = _WHITESPACE_RE.sub(' ', text)
    # Second pass: removing an inner tag can expose an outer one ("<<b>i>").
    text = _TAG_RE.sub('', text)

    # --- vocabulary normalisation ---------------------------------------------
    for old, new in _LITERAL_REWRITES:
        text = text.replace(old, new)
    for rule, replacement in _CONTRACTION_RULES:
        text = rule.sub(replacement, text)

    # --- punctuation and repetition -------------------------------------------
    text = _NON_WORD_RE.sub(' ', text).strip()
    text = _REPEATED_WORD_RE.sub(r'\1', text)
    text = _REPEATED_CHAR_RE.sub(r'\1\1', text)
    text = _REPEATED_PHRASE_RE.sub(r'\1', text)
    text = _LEFTOVER_SYMBOL_RE.sub('', text)
    text = text.strip(' ')

    # --- stop words -----------------------------------------------------------
    return _stopword_pattern().sub('', text)

    
#     in process

In [None]:
# Run the cleaning pipeline over both comment columns of each validation pair.
for column in ('less_toxic', 'more_toxic'):
    val[column] = val[column].apply(clean_text)


In [None]:
# Inspect rows 10-19 of the cleaned data (use val.iloc[0:10] for the first ten).
val.iloc[10:20]

In [None]:
# import modules Lemmatizer
from nltk.stem import WordNetLemmatizer
 
lemmatizer = WordNetLemmatizer()

# Usage examples:
# print("rocks :", lemmatizer.lemmatize("rocks"))
# print("corpora :", lemmatizer.lemmatize("corpora"))
#
# # "a" denotes adjective in "pos"
# print("better :", lemmatizer.lemmatize("better", pos="a"))

# WordNetLemmatizer lemmatizes using WordNet's built-in morphy function and returns
# the input word unchanged if it cannot be found in WordNet.
# Lemmatization is the process of grouping together the different inflected forms of
# a word so they can be analyzed as a single item. It is similar to stemming, but it
# brings context to the words, so it links words with similar meanings to one word.
# Text preprocessing includes both stemming and lemmatization. The two terms are often
# confused or treated as the same; lemmatization is preferred over stemming because it
# does a morphological analysis of the words.


def lemmatize_text(text):
    """Lemmatize every whitespace-separated word of *text*.

    The lemmatizer works on one word at a time, so the comment is split into
    tokens, each token is lemmatized on its own, and the results are joined
    back with single spaces. Passing the whole comment in one call would treat
    it as a single (unknown) word.

    Args:
        text: A comment, expected to be already cleaned and space-separated.

    Returns:
        The comment with each word replaced by its lemma. Leading, trailing
        and repeated whitespace is collapsed by the split/join round trip.
    """
    return ' '.join(lemmatizer.lemmatize(token) for token in text.split())



In [None]:
# Lemmatize both comment columns in place, then preview the result.
toxicity_columns = ['less_toxic', 'more_toxic']
for name in toxicity_columns:
    val[name] = val[name].apply(lemmatize_text)
val.head(10)

# output list

In [None]:

# import these modules
from nltk.stem import PorterStemmer  
# Stemming is the process of producing morphological variants of a root / base word. Stemming programs are commonly referred to as stemming algorithms or stemmers. 
# A stemming algorithm reduces the words “chocolates”, “chocolatey”, “choco” to the root word, “chocolate” and “retrieval”, “retrieved”, “retrieves” reduce to the stem “retrieve”.

  
ps = PorterStemmer()


def stemmerize_text(text):
    """Stem every whitespace-separated word of *text* with the Porter stemmer.

    The stemmer works on one word at a time, so the comment is split into
    tokens, each token is stemmed on its own, and the stems are joined back
    with single spaces. Passing the whole comment in one call would treat it
    as a single word.

    Args:
        text: A comment, expected to be already cleaned and space-separated.

    Returns:
        The comment with each word replaced by its stem. Leading, trailing and
        repeated whitespace is collapsed by the split/join round trip.
    """
    return ' '.join(ps.stem(token) for token in text.split())


In [None]:
# Stem both comment columns in place, then preview the result.
for side in ('less_toxic', 'more_toxic'):
    stemmed = val[side].apply(stemmerize_text)
    val[side] = stemmed
val.head(10)

### step2

In [None]:
def clean_text(text):
    """Second-pass clean-up: lower-case *text* and repair two known spellings.

    NOTE: this definition reuses the name ``clean_text``; once this cell has
    run, it replaces the earlier, full cleaning function of the same name.

    Args:
        text: The comment to fix up.

    Returns:
        The lower-cased comment with " wo wo" rewritten to " woo woo " and
        "numbskul"/"numbskull" rewritten to " numbskull ".
    """
    text = text.lower()  # convert to lower case

    # The trailing \b keeps the rule from firing inside longer words
    # (" wo wonder" must stay untouched).
    text = re.sub(r' wo wo\b', ' woo woo ', text)
    # "l+" also consumes an already-correct double "l"; a plain substring
    # replace would turn "numbskull" into " numbskull l".
    text = re.sub(r'numbskul+', ' numbskull ', text)

    return text

