In [None]:
# We apply several data-cleaning steps to turn the raw data into well-formatted strings that are suitable for ML/DL models.
# The data-cleaning steps are:

# Removal of nulls
# Removal of URLs/links
# Removal of HTML tags
# Conversion to lower-case strings
# Removal of words containing numbers (TODO: no function for this step is defined in the cells below)
# Removal of punctuation
# Removal of leading/trailing whitespace
# Decontraction of words (TODO: no function for this step is defined in the cells below)
# Removal of emojis/emoticons

In [1]:
import re
import pandas as pd
import emoji

In [8]:
#creating functions
def removeLinks(text):
    """Remove URLs from ``text``.

    A URL is a run of non-whitespace characters that begins with
    ``http://``, ``https://`` or ``www.`` (case-insensitive).

    Requiring the ``://`` separator keeps ordinary words that merely start
    with "http" (e.g. "httpd", "https") intact, and the ``www.`` alternative
    catches scheme-less links. The leading ``\\b`` stops the pattern from
    firing in the middle of a word such as "awww.yeah".

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` with every URL replaced by the empty string. Surrounding
        whitespace is left untouched.
    """
    return re.sub(r'\b(?:https?://|www\.)\S+', '', text, flags=re.IGNORECASE)

def removeHTMLTags(text):
    """Delete every ``<...>`` span from ``text``.

    The match is non-greedy, so each tag is removed individually. ``.`` does
    not match a newline here, so a ``<`` and ``>`` on different lines are not
    treated as one tag.
    """
    tag_pattern = re.compile(r'<.*?>')
    return tag_pattern.sub('', text)

def convertToLowerCase(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

def removePuctuations(text):
    """Remove punctuation characters from ``text``.

    Every character that is neither a word character nor whitespace is
    deleted. The underscore is removed explicitly as well: the regex class
    ``\\w`` includes ``_``, so ``[^\\w\\s]`` alone would leave it behind.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` with punctuation deleted (not replaced by spaces), so
        "don't" becomes "dont".
    """
    return re.sub(r'[^\w\s]|_', '', text)

def remove_emoji(text):
    """Return ``text`` with every emoji replaced by the empty string.

    Delegates to ``emoji.replace_emoji`` from the ``emoji`` package.
    """
    without_emoji = emoji.replace_emoji(text, replace='')
    return without_emoji

def remove_emoticons(text):
    """Delete a fixed list of ASCII emoticons (``:)``, ``:D``, ``<3`` ...) from ``text``.

    Matching is case-sensitive and purely textual: ``:D`` is removed but
    ``:d`` is not, and an emoticon is removed wherever the character
    sequence occurs, even inside a longer token.
    """
    emoticons = r'(:\)|:\(|:D|:P|:\*|;\)|:\||:-\)|:-\(|:-D|:-P|:-\*|;-\)|:O|:\[|:\]|:\{|:\}|:@|:\$|8\)|B\)|<3|:\^|\):&|:-\||:-\$|:-&|:-o|:-O|:-@)'
    return re.sub(emoticons, '', text)
    
def removeWhiteSpaces(text):
    """Trim leading and trailing whitespace from ``text``.

    Whitespace inside the string is left as it is.
    """
    trimmed = text.strip()
    return trimmed

In [3]:
def clean_data(text):
    """Run the full cleaning pipeline on a single value and return a string.

    Steps, in order:

    1. coerce the value to ``str``
    2. remove links
    3. remove HTML tags
    4. remove emoji
    5. remove ASCII emoticons
    6. lower-case
    7. remove punctuation
    8. trim leading/trailing whitespace

    Emoji and emoticon removal run *before* lower-casing and punctuation
    stripping. The emoticon pattern is case-sensitive and made of
    punctuation, so running it afterwards could never match: ``:)`` would
    already be gone and ``:D`` would already have been reduced to a stray
    ``d`` left in the text.

    Parameters
    ----------
    text : object
        The raw value; it is converted with ``str()`` first, so non-string
        inputs are accepted.

    Returns
    -------
    str
        The cleaned text.
    """
    text = str(text)
    text = removeLinks(text)
    text = removeHTMLTags(text)
    # Must precede lower-casing/punctuation removal -- see docstring.
    text = remove_emoji(text)
    text = remove_emoticons(text)
    text = convertToLowerCase(text)
    text = removePuctuations(text)
    return removeWhiteSpaces(text)

In [7]:
# Location of the raw training CSV. This is a machine-specific absolute path;
# change this one constant to run the notebook elsewhere.
TRAIN_CSV_PATH = '/Users/jashanjeetsingh/Downloads/train-balanced-sarcasm.csv'

train = pd.read_csv(TRAIN_CSV_PATH)

# Drop rows with a missing value in any column, then derive the cleaned columns.
# .assign() builds a new DataFrame instead of writing into the frame returned by
# dropna(); assigning through .loc on that derived frame can still raise
# SettingWithCopyWarning, and it makes the cell depend on in-place mutation.
cleaned_train = (
    train
    .dropna(how='any', axis=0)
    .assign(
        comment=lambda frame: frame['comment'].apply(clean_data),
        author=lambda frame: frame['author'].apply(removeWhiteSpaces),
    )
)

print(cleaned_train)

         label                                            comment   
0            0                                          nc and nh  \
1            0  you do know west teams play against west teams...   
2            0  they were underdogs earlier today but since gr...   
3            0  this meme isnt funny none of the new york nigg...   
4            0                     i could use one of those tools   
...        ...                                                ...   
1010821      1  im sure that iran and n korea have the technol...   
1010822      1                    whatever you do dont vote green   
1010823      1  perhaps this is an atheist conspiracy to make ...   
1010824      1  the slavs got their own country  it is called ...   
1010825      1  values as in capitalism  there is good money i...   

                 author           subreddit  score  ups  downs     date   
0             Trumpbart            politics      2   -1     -1  2016-10  \
1             Shbshb9

In [9]:
#Jashan