In [25]:
#Important imports to be added

import pandas as pd
import numpy as np
import re,nltk,swifter
import matplotlib.pyplot as plt
import seaborn as sn
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn import tree
from nltk.stem import WordNetLemmatizer

In [24]:
import nltk
#downloading stopwords library from nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jasleen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [23]:
#reading csv and filling out missing values
df = pd.read_csv(r'C:\Users\Jasleen\Downloads\archive\train-balanced-sarcasm.csv')
df = df.fillna('')


In [4]:
#taking only necessary comments
df = df[['label','comment','author','score','created_utc','parent_comment']]

In [26]:
df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


**Identify Emojies inside the comments**

In [27]:
import emoji

In [7]:
# Function to check if a string contains emojis
def has_emoji(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F700-\U0001F77F"  # alchemical symbols
                               u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                               u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               u"\U00002702-\U000027B0"  # Dingbats
                               u"\U000024C2-\U0001F251" 
                               "]+", flags=re.UNICODE)
    return bool(emoji_pattern.search(text))
 
# Function to replace emojis with their meanings
def replace_emojis(text):
    return emoji.demojize(text)

Clean Comments

In [28]:
#removing some important stopwords for sarcasm detection
stops = set(stopwords.words('english')) - {'no','not','nor','against','above','below','off','own'}
lemmatizer = WordNetLemmatizer()
def clean_text(comment):
    #cleaning the text by removing links, usernames,HTML Tags,expansion of words,username removal,etc.
    text = str(comment)
    # Check for emojis
    has_emojis = has_emoji(text)
    # Replace emojis with meanings
    text = replace_emojis(text)
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',' ',text)
    text = re.sub("<.*?>", " ", text)
    text = re.sub(r"[0-9]+"," ",text)
    text = re.sub(r"@[A-Za-z0-9]+"," ",text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    text = re.sub(r"n\'t", ' not',text)
    text = text.replace('\\r', ' ')
    text = text.replace('\\"', ' ')
    text = text.replace('\\n', ' ')
    text = re.sub('[^A-Za-z0-9]+',' ', text)
    #tokenization
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    tokens = tokenizer.tokenize(text.lower())
    tokens = [token for token in tokens if token not in stops]
    #lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    text = ' '.join(lemmatized_tokens)
    text = text.lower().strip()
    return text, has_emojis

# Apply text cleaning and emoji replacement
df["cleaned_comment"], df["has_emoji"] = zip(*df.swifter.apply(lambda x: clean_text(x["comment"]), axis=1))


Pandas Apply: 100%|██████████| 1010826/1010826 [03:48<00:00, 4415.30it/s]


In [32]:
#testing df
df

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment,cleaned_comment,has_emoji
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ...",nc nh,False
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...,know west team play against west team east tea...,False
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.,underdog earlier today since gronk announcemen...,False
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz,meme not funny none new york nigga one,False
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...,could use one tool,False
...,...,...,...,...,...,...,...,...,...,...,...,...
1010821,1,I'm sure that Iran and N. Korea have the techn...,TwarkMain,reddit.com,2,2,0,2009-04,2009-04-25 00:47:52,"No one is calling this an engineered pathogen,...",sure iran n korea technology create pig bird h...,False
1010822,1,"whatever you do, don't vote green!",BCHarvey,climate,1,1,0,2009-05,2009-05-14 22:27:40,In a move typical of their recent do-nothing a...,whatever not vote green,False
1010823,1,Perhaps this is an atheist conspiracy to make ...,rebelcommander,atheism,1,1,0,2009-01,2009-01-11 00:22:57,Screw the Disabled--I've got to get to Church ...,perhaps atheist conspiracy make christian look...,False
1010824,1,The Slavs got their own country - it is called...,catsi,worldnews,1,1,0,2009-01,2009-01-23 21:12:49,I've always been unsettled by that. I hear a l...,slav got own country called kosovo,False


In [30]:
df['has_emoji'].value_counts()[False]

1010826

**This upper value is same as the total number of comments, which shows that we don't have any emojis in the dataset**

Slang detection and Mapping

In [33]:
df

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment,cleaned_comment,has_emoji
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ...",nc nh,False
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...,know west team play against west team east tea...,False
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.,underdog earlier today since gronk announcemen...,False
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz,meme not funny none new york nigga one,False
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...,could use one tool,False
...,...,...,...,...,...,...,...,...,...,...,...,...
1010821,1,I'm sure that Iran and N. Korea have the techn...,TwarkMain,reddit.com,2,2,0,2009-04,2009-04-25 00:47:52,"No one is calling this an engineered pathogen,...",sure iran n korea technology create pig bird h...,False
1010822,1,"whatever you do, don't vote green!",BCHarvey,climate,1,1,0,2009-05,2009-05-14 22:27:40,In a move typical of their recent do-nothing a...,whatever not vote green,False
1010823,1,Perhaps this is an atheist conspiracy to make ...,rebelcommander,atheism,1,1,0,2009-01,2009-01-11 00:22:57,Screw the Disabled--I've got to get to Church ...,perhaps atheist conspiracy make christian look...,False
1010824,1,The Slavs got their own country - it is called...,catsi,worldnews,1,1,0,2009-01,2009-01-23 21:12:49,I've always been unsettled by that. I hear a l...,slav got own country called kosovo,False


Named Entity Recognition (NER)

In [47]:
import spacy

# Load the English NER model from spaCy
nlp = spacy.load("en_core_web_sm")

def perform_ner(text):
    try:
        # Process the text with spaCy NER model
        doc = nlp(text)
        
        # Extract named entities from the document
        named_entities = [ent.text for ent in doc.ents]
        
        return named_entities
    except Exception as e:
        print(f"Error processing text: {text}. Error: {e}")
        return []

# Apply NER to each text entry in the DataFrame
df['named_entities'] = df['cleaned_ comment'].apply(perform_ner)

# Print DataFrame with named entities
print(df)




         label                                            comment  \
0            0                                         NC and NH.   
1            0  You do know west teams play against west teams...   
2            0  They were underdogs earlier today, but since G...   
3            0  This meme isn't funny none of the "new york ni...   
4            0                    I could use one of those tools.   
...        ...                                                ...   
1010821      1  I'm sure that Iran and N. Korea have the techn...   
1010822      1                 whatever you do, don't vote green!   
1010823      1  Perhaps this is an atheist conspiracy to make ...   
1010824      1  The Slavs got their own country - it is called...   
1010825      1  values, as in capitalism .. there is good mone...   

                 author           subreddit  score  ups  downs     date  \
0             Trumpbart            politics      2   -1     -1  2016-10   
1             Shbshb9