# Read and Explore Data

In [1]:
import os
import sys
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    
import numpy as np
import pandas as pd
import sklearn

# Libraries and packages for text (pre-)processing 
import string
import re
import nltk

print("Python version:", sys.version)
print("Version info.:", sys.version_info)
print("pandas version:", pd.__version__)
print("numpy version:", np.__version__)
print("skearn version:", sklearn.__version__)
print("re version:", re.__version__)
print("nltk version:", nltk.__version__)

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

Python version: 3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]
Version info.: sys.version_info(major=3, minor=9, micro=13, releaselevel='final', serial=0)
pandas version: 1.4.4
numpy version: 1.21.5
skearn version: 1.2.2
re version: 2.2.1
nltk version: 3.7


# Read Data

In [2]:
# read the csv file
train_df = pd.read_csv(r'C:\Users\Pokie\Documents\Grad_School\273P_ML_Project\jigsaw-toxic-comment-train.csv')
display(train_df.shape, train_df.head())

(223549, 8)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


# Clean Data

### 1. Remove Capitalization

Because of the variety of capitalization used to construct a sentence, capitalization or lower case is the strategy used in text cleaning most frequently. With this method, every word in the text and document will be projected into the same feature area. If mistakes, slang, acronyms, or informal abbreviations were to be replaced, the issue would only arise in rare instances like the USA or the UK.

In [3]:
train_df["text_clean"] = train_df["comment_text"].apply(lambda x: x.lower())
display(train_df.head())

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,text_clean
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation\nwhy the edits made under my usern...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,d'aww! he matches this background colour i'm s...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,"hey man, i'm really not trying to edit war. it..."
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,"""\nmore\ni can't make any real suggestions on ..."
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,"you, sir, are my hero. any chance you remember..."


### 2.Expanding Contractions

Example of contraction: We'll -> We will, or we shoudn't've -> we should not have. By using Contractions package to expand contractions in English

In [4]:
#added in this installation due to errors on my side
#!pip install contractions

In [5]:
import contractions

# Test
test_text = """
            Y'all can't expand contractions I'd think. I'd like to know how I'd done that! 
            We're going to the zoo and I don't think I'll be home for dinner.
            Theyre going to the zoo and she'll be home for dinner.
            We should've do it in here but we shouldn't've eat it
            """
print("Test: ", contractions.fix(test_text))

train_df["text_clean"] = train_df["text_clean"].apply(lambda x: contractions.fix(x))

Test:  
            You all cannot expand contractions I would think. I would like to know how I would done that! 
            We are going to the zoo and I do not think I will be home for dinner.
            They Are going to the zoo and she will be home for dinner.
            We should have do it in here but we should not have eat it
            


In [6]:
# double check
print(train_df["comment_text"][67])
print(train_df["text_clean"][67])

I went there around the same time he did, and that certainly was not the case at the time. Later on they stopped taking children from such a young age.
i went there around the same time he did, and that certainly was not the case at the time. later on they stopped taking children from such a young age.


In [7]:
print(train_df["comment_text"][12])
print(train_df["text_clean"][12])

Hey... what is it..
@ | talk .
What is it... an exclusive group of some WP TALIBANS...who are good at destroying, self-appointed purist who GANG UP any one who asks them questions abt their ANTI-SOCIAL and DESTRUCTIVE (non)-contribution at WP?

hey... what is it..
@ | talk .
what is it... an exclusive group of some wp talibans...who are good at destroying, self-appointed purist who gang up any one who asks them questions about their anti-social and destructive (non)-contribution at wp?



### 3. Noise Removal

Removing unnecessary characters or punctuation such as URLs, HTML tags, non-ASCII characters (American Standard Code for Information Interchange), or other special characters (symbols, emojis, and other grahic characters)

#### 3.1 Remove URL

In [8]:
def remove_URL(text):
    """
        Remove URLs from a sample string
    """
    return re.sub(r"https?://\S+|www\.\S+", "", text)

In [9]:
# remove urls from the text
train_df["text_clean"] = train_df["text_clean"].apply(lambda x: remove_URL(x))

# double check
print(train_df["comment_text"][101])
print(train_df["text_clean"][101])

Check the following websites:

http://www.iranchamber.com/personalities/farabi/farabi.php
http://www.islam.org.br/%C2%A0al_farabi.htm
http://www.superbeyin.com/sohbet/sohbet.htm
check the following websites:






In [10]:
print(train_df["comment_text"][91])
print(train_df["text_clean"][91])

Transliteration of Russian place names
In writing about Moscow Metro for the Malayalam Wikipedia, we are finding it difficult to correctly transliterate the Russian place names. For example, do we pronounce Park Kultury as PAARK KALTTARI or PAARK KALCHCHARI (or perhaps something completely different)? Can somebody please help by transliterating the list given in https://ml.wikipedia.org/wiki/സംവാദം:മോസ്കോ_മെട്രോ. (I am not putting the list here as I don't want to clutter up this page.) Thanks
transliteration of russian place names
in writing about moscow metro for the malayalam wikipedia, we are finding it difficult to correctly transliterate the russian place names. for example, do we pronounce park kultury as paark kalttari or paark kalchchari (or perhaps something completely different)? can somebody please help by transliterating the list given in  (i am not putting the list here as i do not want to clutter up this page.) thanks


#### 3.2 Remove HTML Tags

In [11]:
def remove_html(text):
    """
        Remove the html in sample text
    """
    html = re.compile(r"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
    return re.sub(html, "", text)

In [12]:
# remove html from the text
train_df["text_clean"] = train_df["text_clean"].apply(lambda x: remove_html(x))

# double check
print(train_df["comment_text"][117])
print(train_df["text_clean"][117])

Also see this if you cant trust Murkoth Ramunni
http://books.google.com/books?id=HHev0U1GfpEC&pg;=PA51&dq;=Thiyya+matrilineal&hl;=en&sa;=X&ei;=TlpPUd2aH8mWiQLgvIDgBA&ved;=0CDYQ6AEwAQ#v=onepage&q;=Thiyya%20matrilineal&f;=false
also see this if you cannot trust murkoth ramunni



In [13]:
print(train_df["comment_text"][213])
print(train_df["text_clean"][213])

"

Please see ref 1, 4 , 5 & 6 of this article, mentioned by Wikipedia for Youngest Patent holder of India. Moreover I mentioned about news & latest book about real life heroes, by providing notable & reliable reference. Do not you feel DNA group is reliable & notable reference as Its page exists on Wikipedia. Do you really believe that, what ever or who so ever provide information for this article is having bad intentions. In your words sock/meatpuppets. Did i asked/suggested anything from you to edit or write. I am a free man to send information & You are a free person to analyse it. Are you above Wikipedia? When ref 1, 4 , 5 & 6 of this article, mentioned by Wikipedia for Youngest Patent holder of India for the subject..Why you speak in bad words...Your words verbatim ""Any book or review which repeats the ""India's youngest patent holder and the youngest disabled patent holder in the world"" claim that Bhati and his supporters are pushing fails as a reliable source.""  How can you 

#### 3.3 Remove Non-ASCII

In [14]:
def remove_non_ascii(text):
    """
        Remove non-ASCII characters 
    """
    return re.sub(r'[^\x00-\x7f]',r'', text) # or ''.join([x for x in text if x in string.printable]) 

In [15]:
# remove non-ascii characters from the text
train_df["text_clean"] = train_df["text_clean"].apply(lambda x: remove_non_ascii(x))

# double che
print(train_df["comment_text"][6011]) #notice Côte
print(train_df["text_clean"][6011])

"
No, we don't.  The name ""Ivory Coast"" contains only English-alphabet letters, and it would sort just fine in almost all cases where ""Côte d'Ivoire"" remains missorted, often appearing after Croatia, after Cyprus, after Czech Republic.  You can see this by looking at Category:Communications by country and seeing where the subcategory Category:Communications in Côte d'Ivoire is missorted (even though the sorting of the article of the same name has been fixed—can you see the difference in order?).  Or you can see it by looking at Category:Nations at the 2004 Summer Olympics and seeing where the article Côte d'Ivoire at the 2004 Summer Olympics is missorted.
There are hundreds of those problems out there.  Why don't you make yourself useful, and go fix some of them?   

"
"
no, we do not.  the name ""ivory coast"" contains only english-alphabet letters, and it would sort just fine in almost all cases where ""cte d'ivoire"" remains missorted, often appearing after croatia, after cyprus

In [16]:
print(train_df["comment_text"][7814]) #notice eôs
print(train_df["text_clean"][7814])

amazons, after all
I think I found the source of the connection of the labrys with the amazons: sagaris 

 saga^ris , eôs Ion. ios, hê; pl. sagareis Ion. -i_s:a weapon used by the Scythian tribes, Hdt.1.215, 4.5;

 A. axinas sagaris eichon Id.7.64 ; by the Amazons, Aristarch. in 
 PAmh.2.12 ii 10; by the Persians, Amazons, Mossynoeci, etc., 
 X.An.4.4.16, 5.4.13:acc. to Hsch. single-edged, and joined by 
 X. with kopis and machaira, Cyr. 1.2.9, 2.1.9, 4.2.22; 
 double-edged acc. to AP6.94 (Phil.).

i.e. an axe-like weapon, sometimes described as single-edged, and sometimes as double-edged. This shouild probably be put in a Sagaris article.
amazons, after all
i think i found the source of the connection of the labrys with the amazons: sagaris 

 saga^ris , es ion. ios, h; pl. sagareis ion. -i_s:a weapon used by the scythian tribes, hdt.1.215, 4.5;

 a. axinas sagaris eichon id.7.64 ; by the amazons, aristarch. in 
 pamh.2.12 ii 10; by the persians, amazons, mossynoeci, etc., 
 x.an.4.4.

#### 3.4 Remove Special Characters

Special characters could be symbols, emojis, and other graphic characters. 

In [17]:
def remove_special_characters(text):
    """
        Remove special special characters, including symbols, emojis, and other graphic characters
    """
    emoji_pattern = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

In [18]:
train_df["text_clean"] = train_df["comment_text"].apply(lambda x: remove_special_characters(x))

# double check
print(train_df["comment_text"][143])
print(train_df["text_clean"][143])

"P.S. It's not polite to talk to people behind their backs, please remove your comments from Mrph's talk page.

Vaughan
You're right; I went to check your previous edit and found a page on the Marvel site that spelled it ""Vaughn"", but now I am finding many more that spell it correctly. Thanks for the edits.   (☎☓) 

"
"P.S. It's not polite to talk to people behind their backs, please remove your comments from Mrph's talk page.

Vaughan
You're right; I went to check your previous edit and found a page on the Marvel site that spelled it ""Vaughn"", but now I am finding many more that spell it correctly. Thanks for the edits.   () 

"


In [19]:
print(train_df["comment_text"][189])
print(train_df["text_clean"][189])

"

Sorry to interrupt but I'm at 1200 edits now... the first 200 were likely just on my own pages and because I was asking for help so much so maybe just 1000... or maybe less... but it still kind of counts. ♥♥Amulet♥♥ "
"

Sorry to interrupt but I'm at 1200 edits now... the first 200 were likely just on my own pages and because I was asking for help so much so maybe just 1000... or maybe less... but it still kind of counts. Amulet "


#### 3.5 Remove Punctuations

In [20]:
def remove_punct(text):
    """
        Remove the punctuation
    """
#     return re.sub(r'[]!"$%&\'()*+,./:;=#@?[\\^_`{|}~-]+', "", text)
    return text.translate(str.maketrans('', '', string.punctuation))

In [21]:
train_df["text_clean"] = train_df["text_clean"].apply(lambda x: remove_punct(x))

# double check
print(train_df["comment_text"][3])
print(train_df["text_clean"][3])

"
More
I can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of ""types of accidents""  -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.

There appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It's listed in the relevant form eg Wikipedia:Good_article_nominations#Transport  "

More
I cant make any real suggestions on improvement  I wondered if the section statistics should be later on or a subsection of types of accidents  I think the references may need tidying so that they are all in the exact same format ie date format etc I can do that later on if noone else does first  if you have any preferences for formatting style on references or want to

In [22]:
print(train_df["comment_text"][7595])
print(train_df["text_clean"][7595])

"

I don't see a problem with citing GAO/HRD-87-46 and the pdf linked there. The ""internal FDA documents"" sound like they are by definition not published.  Talk "


I dont see a problem with citing GAOHRD8746 and the pdf linked there The internal FDA documents sound like they are by definition not published  Talk 


#### 3.6 Other Manual Text Cleaning Tasks

- Replace Unicode characters with equivalent ASCII character (instead of removing)
- Replace the entity references with their actual symbols instead of removing as HTML tags
- Replace the Typos, slang, acronyms or informal abbreviations - depends on diffrent situations or main topics of the NLP such as finance or medical topics. 
- List out all the hashtags/ usernames then replace with equivalent words. 
- Replace the emoticon/ emoji with equivalent word meaning such as ":)" with "smile" 
- Spelling correction

In [23]:
def other_clean(text):
        """
            Other manual text cleaning techniques
        """
        # Typos, slang and other
        sample_typos_slang = {
                                "w/e": "whatever",
                                "usagov": "usa government",
                                "recentlu": "recently",
                                "ph0tos": "photos",
                                "amirite": "am i right",
                                "exp0sed": "exposed",
                                "<3": "love",
                                "luv": "love",
                                "amageddon": "armageddon",
                                "trfc": "traffic",
                                "16yr": "16 year"
                                }

        # Acronyms
        sample_acronyms =  { 
                            "mh370": "malaysia airlines flight 370",
                            "okwx": "oklahoma city weather",
                            "arwx": "arkansas weather",    
                            "gawx": "georgia weather",  
                            "scwx": "south carolina weather",  
                            "cawx": "california weather",
                            "tnwx": "tennessee weather",
                            "azwx": "arizona weather",  
                            "alwx": "alabama weather",
                            "usnwsgov": "united states national weather service",
                            "2mw": "tomorrow"
                            }

        # Some common abbreviations 
        sample_abbr = {
                        "$" : " dollar ",
                        "€" : " euro ",
                        "4ao" : "for adults only",
                        "a.m" : "before midday",
                        "a3" : "anytime anywhere anyplace",
                        "aamof" : "as a matter of fact",
                        "acct" : "account",
                        "adih" : "another day in hell",
                        "afaic" : "as far as i am concerned",
                        "afaict" : "as far as i can tell",
                        "afaik" : "as far as i know",
                        "afair" : "as far as i remember",
                        "afk" : "away from keyboard",
                        "app" : "application",
                        "approx" : "approximately",
                        "apps" : "applications",
                        "asap" : "as soon as possible",
                        "atk" : "at the keyboard",
                        "ave." : "avenue",
                        "aymm" : "are you my mother",
                        "ayor" : "at your own risk", 
                        "b&b" : "bed and breakfast",
                        "b+b" : "bed and breakfast",
                        "b.c" : "before christ",
                        "b2b" : "business to business",
                        "b2c" : "business to customer",
                        "b4" : "before",
                        "b4n" : "bye for now",
                        "b@u" : "back at you",
                        "bae" : "before anyone else",
                        "bak" : "back at keyboard",
                        "bbbg" : "bye bye be good",
                        "bbc" : "british broadcasting corporation",
                        "bbias" : "be back in a second",
                        "bbl" : "be back later",
                        "bbs" : "be back soon",
                        "be4" : "before",
                        "bfn" : "bye for now",
                        "blvd" : "boulevard",
                        "bout" : "about",
                        "brb" : "be right back",
                        "bros" : "brothers",
                        "brt" : "be right there",
                        "bsaaw" : "big smile and a wink",
                        "btw" : "by the way",
                        "bwl" : "bursting with laughter",
                        "c/o" : "care of",
                        "cet" : "central european time",
                        "cf" : "compare",
                        "cia" : "central intelligence agency",
                        "csl" : "can not stop laughing",
                        "cu" : "see you",
                        "cul8r" : "see you later",
                        "cv" : "curriculum vitae",
                        "cwot" : "complete waste of time",
                        "cya" : "see you",
                        "cyt" : "see you tomorrow",
                        "dae" : "does anyone else",
                        "dbmib" : "do not bother me i am busy",
                        "diy" : "do it yourself",
                        "dm" : "direct message",
                        "dwh" : "during work hours",
                        "e123" : "easy as one two three",
                        "eet" : "eastern european time",
                        "eg" : "example",
                        "embm" : "early morning business meeting",
                        "encl" : "enclosed",
                        "encl." : "enclosed",
                        "etc" : "and so on",
                        "faq" : "frequently asked questions",
                        "fawc" : "for anyone who cares",
                        "fb" : "facebook",
                        "fc" : "fingers crossed",
                        "fig" : "figure",
                        "fimh" : "forever in my heart", 
                        "ft." : "feet",
                        "ft" : "featuring",
                        "ftl" : "for the loss",
                        "ftw" : "for the win",
                        "fwiw" : "for what it is worth",
                        "fyi" : "for your information",
                        "g9" : "genius",
                        "gahoy" : "get a hold of yourself",
                        "gal" : "get a life",
                        "gcse" : "general certificate of secondary education",
                        "gfn" : "gone for now",
                        "gg" : "good game",
                        "gl" : "good luck",
                        "glhf" : "good luck have fun",
                        "gmt" : "greenwich mean time",
                        "gmta" : "great minds think alike",
                        "gn" : "good night",
                        "g.o.a.t" : "greatest of all time",
                        "goat" : "greatest of all time",
                        "goi" : "get over it",
                        "gps" : "global positioning system",
                        "gr8" : "great",
                        "gratz" : "congratulations",
                        "gyal" : "girl",
                        "h&c" : "hot and cold",
                        "hp" : "horsepower",
                        "hr" : "hour",
                        "hrh" : "his royal highness",
                        "ht" : "height",
                        "ibrb" : "i will be right back",
                        "ic" : "i see",
                        "icq" : "i seek you",
                        "icymi" : "in case you missed it",
                        "idc" : "i do not care",
                        "idgadf" : "i do not give a damn fuck",
                        "idgaf" : "i do not give a fuck",
                        "idk" : "i do not know",
                        "ie" : "that is",
                        "i.e" : "that is",
                        "ifyp" : "i feel your pain",
                        "IG" : "instagram",
                        "iirc" : "if i remember correctly",
                        "ilu" : "i love you",
                        "ily" : "i love you",
                        "imho" : "in my humble opinion",
                        "imo" : "in my opinion",
                        "imu" : "i miss you",
                        "iow" : "in other words",
                        "irl" : "in real life",
                        "j4f" : "just for fun",
                        "jic" : "just in case",
                        "jk" : "just kidding",
                        "jsyk" : "just so you know",
                        "l8r" : "later",
                        "lb" : "pound",
                        "lbs" : "pounds",
                        "ldr" : "long distance relationship",
                        "lmao" : "laugh my ass off",
                        "lmfao" : "laugh my fucking ass off",
                        "lol" : "laughing out loud",
                        "ltd" : "limited",
                        "ltns" : "long time no see",
                        "m8" : "mate",
                        "mf" : "motherfucker",
                        "mfs" : "motherfuckers",
                        "mfw" : "my face when",
                        "mofo" : "motherfucker",
                        "mph" : "miles per hour",
                        "mr" : "mister",
                        "mrw" : "my reaction when",
                        "ms" : "miss",
                        "mte" : "my thoughts exactly",
                        "nagi" : "not a good idea",
                        "nbc" : "national broadcasting company",
                        "nbd" : "not big deal",
                        "nfs" : "not for sale",
                        "ngl" : "not going to lie",
                        "nhs" : "national health service",
                        "nrn" : "no reply necessary",
                        "nsfl" : "not safe for life",
                        "nsfw" : "not safe for work",
                        "nth" : "nice to have",
                        "nvr" : "never",
                        "nyc" : "new york city",
                        "oc" : "original content",
                        "og" : "original",
                        "ohp" : "overhead projector",
                        "oic" : "oh i see",
                        "omdb" : "over my dead body",
                        "omg" : "oh my god",
                        "omw" : "on my way",
                        "p.a" : "per annum",
                        "p.m" : "after midday",
                        "pm" : "prime minister",
                        "poc" : "people of color",
                        "pov" : "point of view",
                        "pp" : "pages",
                        "ppl" : "people",
                        "prw" : "parents are watching",
                        "ps" : "postscript",
                        "pt" : "point",
                        "ptb" : "please text back",
                        "pto" : "please turn over",
                        "qpsa" : "what happens", #"que pasa",
                        "ratchet" : "rude",
                        "rbtl" : "read between the lines",
                        "rlrt" : "real life retweet", 
                        "rofl" : "rolling on the floor laughing",
                        "roflol" : "rolling on the floor laughing out loud",
                        "rotflmao" : "rolling on the floor laughing my ass off",
                        "rt" : "retweet",
                        "ruok" : "are you ok",
                        "sfw" : "safe for work",
                        "sk8" : "skate",
                        "smh" : "shake my head",
                        "sq" : "square",
                        "srsly" : "seriously", 
                        "ssdd" : "same stuff different day",
                        "tbh" : "to be honest",
                        "tbs" : "tablespooful",
                        "tbsp" : "tablespooful",
                        "tfw" : "that feeling when",
                        "thks" : "thank you",
                        "tho" : "though",
                        "thx" : "thank you",
                        "tia" : "thanks in advance",
                        "til" : "today i learned",
                        "tl;dr" : "too long i did not read",
                        "tldr" : "too long i did not read",
                        "tmb" : "tweet me back",
                        "tntl" : "trying not to laugh",
                        "ttyl" : "talk to you later",
                        "u" : "you",
                        "u2" : "you too",
                        "u4e" : "yours for ever",
                        "utc" : "coordinated universal time",
                        "w/" : "with",
                        "w/o" : "without",
                        "w8" : "wait",
                        "wassup" : "what is up",
                        "wb" : "welcome back",
                        "wtf" : "what the fuck",
                        "wtg" : "way to go",
                        "wtpa" : "where the party at",
                        "wuf" : "where are you from",
                        "wuzup" : "what is up",
                        "wywh" : "wish you were here",
                        "yd" : "yard",
                        "ygtr" : "you got that right",
                        "ynk" : "you never know",
                        "zzz" : "sleeping bored and tired"
                        }
            
        sample_typos_slang_pattern = re.compile(r'(?<!\w)(' + '|'.join(re.escape(key) for key in sample_typos_slang.keys()) + r')(?!\w)')
        sample_acronyms_pattern = re.compile(r'(?<!\w)(' + '|'.join(re.escape(key) for key in sample_acronyms.keys()) + r')(?!\w)')
        sample_abbr_pattern = re.compile(r'(?<!\w)(' + '|'.join(re.escape(key) for key in sample_abbr.keys()) + r')(?!\w)')
        
        text = sample_typos_slang_pattern.sub(lambda x: sample_typos_slang[x.group()], text)
        text = sample_acronyms_pattern.sub(lambda x: sample_acronyms[x.group()], text)
        text = sample_abbr_pattern.sub(lambda x: sample_abbr[x.group()], text)
        
        return text

In [24]:
train_df["text_clean"] = train_df["text_clean"].apply(lambda x: other_clean(x))

In [25]:
# double check
print(train_df["comment_text"][10875]) #notice omg
print(train_df["text_clean"][10875])

"   Please refrain from creating inappropriate pages such as OMG LIEK TOTALLY. It is considered vandalism. If you would like to experiment, use the sandbox.  , Recent changes patrol"
   Please refrain from creating inappropriate pages such as OMG LIEK TOTALLY It is considered vandalism If you would like to experiment use the sandbox   Recent changes patrol


In [26]:
print(train_df["comment_text"][20296]) #notice lmao
print(train_df["text_clean"][20296])

Poor Walty is blocked ( LMAO knew it would happen someday.
Poor Walty is blocked  LMAO knew it would happen someday


In [27]:
train_df.shape

(223549, 9)

In [28]:
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,text_clean
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,Explanation\nWhy the edits made under my usern...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,Daww He matches this background colour Im seem...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,Hey man Im really not trying to edit war Its j...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,\nMore\nI cant make any real suggestions on im...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,You sir are my hero Any chance you remember wh...


In [30]:
os.makedirs('Users\Pokie\Downloads', exist_ok=True)  