In [213]:
#Important imports to be added

import pandas as pd
import numpy as np
import re,nltk,swifter
import matplotlib.pyplot as plt
import seaborn as sn
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn import tree
from nltk.stem import WordNetLemmatizer

In [214]:
import nltk
# Download the NLTK 'stopwords' corpus; stopwords.words('english') is read
# from it later when the stop-word set is built.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\patel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [215]:
# Read the training CSV, keep a reproducible random sample of 10,000 rows
# (fixed random_state), and replace missing values with empty strings.
df = (
    pd.read_csv(r'train-balanced-sarcasm.csv')
    .sample(n=10000, random_state=42)
    .fillna('')
)

In [216]:
# Restrict the frame to the label, the comment text and its metadata columns
selected_columns = ['label','comment','author','score','created_utc','parent_comment']
df = df[selected_columns]

In [217]:
# Preview the first rows of the trimmed frame
df.head()

Unnamed: 0,label,comment,author,score,created_utc,parent_comment
608627,1,And he sure as hell is successful!,armiechedon,1,2015-12-09 23:41:48,Obama's smoked before? Or at least admitted it?
456977,1,"wait, you have to win the candidates to challe...",killingfeels,1,2016-02-13 00:04:43,You can say the same thing about any chess pla...
803801,1,I mean how could anyone not see that it is ant...,Gnofar,5,2014-12-25 21:59:49,That isn't Hanako?
926708,0,"Funny thing, most of the girl pants I've bough...",smischmal,2,2012-03-05 02:03:43,Girl Pants (X/post from TwoXChromosomes)
129279,0,"Also Ranch dressing, because LBJ was Texan, an...",ElScreecho,28,2016-09-19 19:20:53,Lyndon Johnson salad: A carrot and 2 beetroots...


**Identify emojis inside the comments**

In [218]:
import emoji

In [219]:
# Emoji code-point ranges, compiled once when the cell runs.
# The ranges are listed explicitly: a single span from U+24C2 to U+1F251
# would also cover CJK ideographs, Hangul, Katakana and many other
# non-emoji characters, and report ordinary text as emoji.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F700-\U0001F77F"  # alchemical symbols
                            u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                            u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                            u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                            u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                            u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                            u"\U00002702-\U000027B0"  # Dingbats
                            u"\U00002600-\U000026FF"  # Miscellaneous Symbols
                            u"\U000024C2"             # circled latin capital letter M
                            u"\U0001F1E6-\U0001F1FF"  # regional indicator symbols (flags)
                            u"\U0001F200-\U0001F251"  # Enclosed Ideographic Supplement
                            "]+", flags=re.UNICODE)


def has_emoji(text):
    """Return True when ``text`` contains at least one emoji character.

    Parameters
    ----------
    text : str
        The string to inspect.

    Returns
    -------
    bool
        True if any character falls inside one of the emoji code-point
        ranges of ``_EMOJI_PATTERN``, otherwise False.
    """
    return bool(_EMOJI_PATTERN.search(text))
 
def replace_emojis(text):
    """Return ``text`` with its emojis rewritten by ``emoji.demojize``."""
    demojized = emoji.demojize(text)
    return demojized

In [220]:
def lowercase_except_caps(sentence):
    """Lower-case every word of ``sentence`` except all-caps words.

    The sentence is split on whitespace; a word for which ``str.isupper()``
    is true is kept unchanged, any other word is lower-cased. The words are
    re-joined with single spaces.
    """
    return " ".join(
        token if token.isupper() else token.lower()
        for token in sentence.split()
    )

Clean Comments

In [221]:
# English stop words minus negations and a few positional words, so that
# those words are not dropped by stop-word filtering.
stops = set(stopwords.words('english')).difference(
    {'no','not','nor','against','above','below','off','own'}
)
lemmatizer = WordNetLemmatizer()
# Contraction rewrites, applied in this order (the specific "won't"/"can't"
# rules must run before the generic "n't" rule). Matching is
# case-insensitive so that capitalised forms such as "Won't" and "Can't"
# become "will not" / "can not" instead of "Wo not" / "Ca not".
_CONTRACTION_RULES = tuple(
    (re.compile(pattern, flags=re.IGNORECASE), replacement)
    for pattern, replacement in (
        (r"won't", "will not"),
        (r"can't", "can not"),
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),      # NOTE: this also rewrites possessives ("X's" -> "X is")
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
)


def clean_text(comment):
    """Normalise a raw comment and report whether it contained emojis.

    Steps, in order:
      1. convert the input to ``str``;
      2. record whether the raw text contains emojis (``has_emoji``);
      3. replace emojis with their textual form (``replace_emojis``);
      4. blank out http(s) URLs, HTML-like tags, digit runs and @handles;
      5. expand English contractions (case-insensitively);
      6. blank out the literal two-character sequences backslash-r,
         backslash-double-quote and backslash-n.

    Parameters
    ----------
    comment : any
        The raw comment; it is converted with ``str()`` first.

    Returns
    -------
    tuple
        ``(text, has_emojis)`` where ``text`` is the cleaned string and
        ``has_emojis`` is the boolean result of ``has_emoji`` on the raw text.
    """
    text = str(comment)
    # Emoji detection must happen before the emojis are rewritten to text.
    has_emojis = has_emoji(text)
    text = replace_emojis(text)

    # Links, HTML-like tags, digit runs and @usernames are replaced by a space.
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',' ',text)
    text = re.sub("<.*?>", " ", text)
    text = re.sub(r"[0-9]+"," ",text)
    text = re.sub(r"@[A-Za-z0-9]+"," ",text)

    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)

    # These target escaped sequences that appear literally in the text
    # (a backslash followed by a character), not real control characters.
    text = text.replace('\\r', ' ')
    text = text.replace('\\"', ' ')
    text = text.replace('\\n', ' ')
    return text, has_emojis

# Clean every comment; each row yields a (cleaned_text, has_emoji) pair,
# which is then unzipped into two new columns.
cleaned_pairs = df.swifter.apply(lambda row: clean_text(row["comment"]), axis=1)
df["cleaned_comment"], df["has_emoji"] = zip(*cleaned_pairs)


Pandas Apply: 100%|██████████| 10000/10000 [00:02<00:00, 3638.70it/s]


In [222]:
# Inspect the frame with the new cleaned_comment and has_emoji columns
df

Unnamed: 0,label,comment,author,score,created_utc,parent_comment,cleaned_comment,has_emoji
608627,1,And he sure as hell is successful!,armiechedon,1,2015-12-09 23:41:48,Obama's smoked before? Or at least admitted it?,And he sure as hell is successful!,False
456977,1,"wait, you have to win the candidates to challe...",killingfeels,1,2016-02-13 00:04:43,You can say the same thing about any chess pla...,"wait, you have to win the candidates to challe...",False
803801,1,I mean how could anyone not see that it is ant...,Gnofar,5,2014-12-25 21:59:49,That isn't Hanako?,I mean how could anyone not see that it is ant...,False
926708,0,"Funny thing, most of the girl pants I've bough...",smischmal,2,2012-03-05 02:03:43,Girl Pants (X/post from TwoXChromosomes),"Funny thing, most of the girl pants I have bou...",False
129279,0,"Also Ranch dressing, because LBJ was Texan, an...",ElScreecho,28,2016-09-19 19:20:53,Lyndon Johnson salad: A carrot and 2 beetroots...,"Also Ranch dressing, because LBJ was Texan, an...",False
...,...,...,...,...,...,...,...,...
989368,0,And the drop-bears... can't forget those fuzzy...,Bcteagirl,1,2011-10-27 20:09:03,That's why the food is expensive. The farmer's...,And the drop-bears... can not forget those fuz...,False
292333,0,o7 Hail!,AvalonTreman,4,2016-07-03 19:36:19,"Another Avalon! Hail, brother.",o Hail!,False
546581,1,3,tangemann,4,2015-08-04 19:31:59,"On a scale of one to Gaben, how empty is your ...",,False
936260,1,Repetitive fire hydrant and trash can placing ...,UsedCheese,43,2013-08-02 16:16:11,Screenshot Analysis,Repetitive fire hydrant and trash can placing ...,False


In [223]:
# Number of comments without emojis. value_counts() omits labels whose count
# is zero, so .get() falls back to 0 instead of raising KeyError when every
# comment contains an emoji.
df['has_emoji'].value_counts().get(False, 0)

10000

**The value above equals the total number of sampled comments, which shows that none of the comments in this sample contain emojis.**

**Abbreviation detection and replacement**

In [224]:
# Load the abbreviation/slang lookup table (columns: 'Abbr/Slang', 'Meaning')
abbrslang=pd.read_csv('Abbr_Slang_Final.csv')
abbrslang

Unnamed: 0,Abbr/Slang,Meaning
0,@TEOTD,At the end of the day
1,143,I love you
2,2G2BT,Too good to be true
3,404,I don't know
4,4EAE,Forever and ever
...,...,...
450,BS,Bullshit
451,NYC,New York City
452,VR,Virtual Reality
453,GTA,Grand Theft Auto


In [225]:
# Convert the lookup table into a list of
# {'Abbr/Slang': ..., 'Meaning': ...} records, one per row.
# Despite the name, this is a list, not a dict.
abbrslangDict=abbrslang.to_dict(orient='records')
abbrslangDict

[{'Abbr/Slang': '@TEOTD', 'Meaning': 'At the end of the day'},
 {'Abbr/Slang': '143', 'Meaning': 'I love you'},
 {'Abbr/Slang': '2G2BT', 'Meaning': 'Too good to be true'},
 {'Abbr/Slang': '404', 'Meaning': "I don't know"},
 {'Abbr/Slang': '4EAE', 'Meaning': 'Forever and ever'},
 {'Abbr/Slang': 'A3', 'Meaning': 'Anytime, anywhere, anyplace'},
 {'Abbr/Slang': 'AAF', 'Meaning': 'As a matter of fact'},
 {'Abbr/Slang': 'AAK', 'Meaning': 'Asleep at keyboard'},
 {'Abbr/Slang': 'AAMOI', 'Meaning': 'As a matter of interest'},
 {'Abbr/Slang': 'AAP', 'Meaning': 'Always a pleasure'},
 {'Abbr/Slang': 'AAR', 'Meaning': 'At any rate'},
 {'Abbr/Slang': 'AAS', 'Meaning': 'Alive and smiling'},
 {'Abbr/Slang': 'ACK', 'Meaning': 'Acknowledge'},
 {'Abbr/Slang': 'ADD', 'Meaning': 'Address'},
 {'Abbr/Slang': 'ADN', 'Meaning': 'Any day now'},
 {'Abbr/Slang': 'ADR', 'Meaning': 'Address'},
 {'Abbr/Slang': 'AEAP', 'Meaning': 'As early as possible'},
 {'Abbr/Slang': 'AFAIK', 'Meaning': 'As far as I know'},
 {'Abb

In [226]:
# Display the frame again before abbreviation expansion
df

Unnamed: 0,label,comment,author,score,created_utc,parent_comment,cleaned_comment,has_emoji
608627,1,And he sure as hell is successful!,armiechedon,1,2015-12-09 23:41:48,Obama's smoked before? Or at least admitted it?,And he sure as hell is successful!,False
456977,1,"wait, you have to win the candidates to challe...",killingfeels,1,2016-02-13 00:04:43,You can say the same thing about any chess pla...,"wait, you have to win the candidates to challe...",False
803801,1,I mean how could anyone not see that it is ant...,Gnofar,5,2014-12-25 21:59:49,That isn't Hanako?,I mean how could anyone not see that it is ant...,False
926708,0,"Funny thing, most of the girl pants I've bough...",smischmal,2,2012-03-05 02:03:43,Girl Pants (X/post from TwoXChromosomes),"Funny thing, most of the girl pants I have bou...",False
129279,0,"Also Ranch dressing, because LBJ was Texan, an...",ElScreecho,28,2016-09-19 19:20:53,Lyndon Johnson salad: A carrot and 2 beetroots...,"Also Ranch dressing, because LBJ was Texan, an...",False
...,...,...,...,...,...,...,...,...
989368,0,And the drop-bears... can't forget those fuzzy...,Bcteagirl,1,2011-10-27 20:09:03,That's why the food is expensive. The farmer's...,And the drop-bears... can not forget those fuz...,False
292333,0,o7 Hail!,AvalonTreman,4,2016-07-03 19:36:19,"Another Avalon! Hail, brother.",o Hail!,False
546581,1,3,tangemann,4,2015-08-04 19:31:59,"On a scale of one to Gaben, how empty is your ...",,False
936260,1,Repetitive fire hydrant and trash can placing ...,UsedCheese,43,2013-08-02 16:16:11,Screenshot Analysis,Repetitive fire hydrant and trash can placing ...,False


In [227]:
import re
def expand_abbr(text, abbr_dict):
    """Expand all-caps abbreviations found in ``text``.

    Parameters
    ----------
    text : any
        The text to process; non-string values are converted with ``str()``.
    abbr_dict : list of dict, or dict
        Either records shaped like ``{'Abbr/Slang': ..., 'Meaning': ...}``
        (as produced by ``DataFrame.to_dict(orient='records')``) or a plain
        ``{abbreviation: meaning}`` mapping. When records repeat an
        abbreviation, the first record wins.

    Returns
    -------
    tuple
        ``(expanded_text, detected_abbr)``: the text with known abbreviations
        replaced by their meaning, and the list of every matched token in
        order of appearance (whether or not it was found in ``abbr_dict``).
    """
    # A run of two or more capitals that is not followed by ".XX"
    # (a dot plus two or more capitals).
    pattern1 = r"\b([A-Z]{2,}(?!\.[A-Z]{2,}))\b"

    # Fallback tried where pattern1 fails: a run of two or more capitals,
    # optionally including a trailing dot when a word character follows it
    # (e.g. "AB." in "AB.CD"). Both patterns need at least two capitals, so
    # titles such as "Mr." are never matched.
    pattern2 = r"\b([A-Z]{2,}\.?)\b"

    combined_pattern = re.compile("|".join([pattern1, pattern2]))

    detected_abbr = []  # every matched token, in order
    lookup = None       # abbreviation -> meaning, built on the first match

    def expand_match(match):
        nonlocal lookup
        matched_abbr = match.group()
        detected_abbr.append(matched_abbr)
        if lookup is None:
            if isinstance(abbr_dict, dict):
                lookup = abbr_dict
            else:
                # setdefault keeps the first record for a repeated
                # abbreviation, as a front-to-back scan of the list would.
                lookup = {}
                for entry in abbr_dict:
                    lookup.setdefault(entry['Abbr/Slang'], entry['Meaning'])
        # Unknown tokens are left untouched.
        return lookup.get(matched_abbr.upper(), matched_abbr)

    if not isinstance(text, str):
        text = str(text)  # Convert non-string inputs to strings

    expanded_text = combined_pattern.sub(expand_match, text)

    return expanded_text, detected_abbr

# Expand abbreviations in every cleaned comment; each call yields an
# (expanded_text, detected_abbreviations) pair, unzipped into two columns.
abbr_pairs = df['cleaned_comment'].apply(expand_abbr, args=(abbrslangDict,))
df['abbr_cleaned_comment'], df['detected_abbr'] = zip(*abbr_pairs)
df


Unnamed: 0,label,comment,author,score,created_utc,parent_comment,cleaned_comment,has_emoji,abbr_cleaned_comment,detected_abbr
608627,1,And he sure as hell is successful!,armiechedon,1,2015-12-09 23:41:48,Obama's smoked before? Or at least admitted it?,And he sure as hell is successful!,False,And he sure as hell is successful!,[]
456977,1,"wait, you have to win the candidates to challe...",killingfeels,1,2016-02-13 00:04:43,You can say the same thing about any chess pla...,"wait, you have to win the candidates to challe...",False,"wait, you have to win the candidates to challe...",[]
803801,1,I mean how could anyone not see that it is ant...,Gnofar,5,2014-12-25 21:59:49,That isn't Hanako?,I mean how could anyone not see that it is ant...,False,I mean how could anyone not see that it is ant...,[]
926708,0,"Funny thing, most of the girl pants I've bough...",smischmal,2,2012-03-05 02:03:43,Girl Pants (X/post from TwoXChromosomes),"Funny thing, most of the girl pants I have bou...",False,"Funny thing, most of the girl pants I have bou...",[]
129279,0,"Also Ranch dressing, because LBJ was Texan, an...",ElScreecho,28,2016-09-19 19:20:53,Lyndon Johnson salad: A carrot and 2 beetroots...,"Also Ranch dressing, because LBJ was Texan, an...",False,"Also Ranch dressing, because LBJ was Texan, an...",[LBJ]
...,...,...,...,...,...,...,...,...,...,...
989368,0,And the drop-bears... can't forget those fuzzy...,Bcteagirl,1,2011-10-27 20:09:03,That's why the food is expensive. The farmer's...,And the drop-bears... can not forget those fuz...,False,And the drop-bears... can not forget those fuz...,[]
292333,0,o7 Hail!,AvalonTreman,4,2016-07-03 19:36:19,"Another Avalon! Hail, brother.",o Hail!,False,o Hail!,[]
546581,1,3,tangemann,4,2015-08-04 19:31:59,"On a scale of one to Gaben, how empty is your ...",,False,,[]
936260,1,Repetitive fire hydrant and trash can placing ...,UsedCheese,43,2013-08-02 16:16:11,Screenshot Analysis,Repetitive fire hydrant and trash can placing ...,False,Repetitive fire hydrant and trash can placing ...,[]


In [228]:
from collections import Counter
import pandas as pd

# Gather every detected abbreviation from all rows into one flat list
detected_abbr_flat = []
for row_abbrs in df['detected_abbr']:
    if row_abbrs:
        detected_abbr_flat.extend(row_abbrs)

# Tally how often each abbreviation occurs
word_freq = Counter(detected_abbr_flat)

# One row per abbreviation, most frequent first
word_freq_df = (
    pd.DataFrame.from_dict(word_freq, orient='index', columns=['Frequency'])
    .sort_values(by='Frequency', ascending=False)
)

# Write the frequency table to disk
word_freq_df.to_csv('word_frequency.csv')


In [229]:
# Save the processed frame; index=False leaves the row index (the original
# row ids of the sampled comments) out of the file.
df.to_csv('log1.csv', index=False)

In [230]:
# NOTE: the string literal below holds disabled post-processing steps
# (non-alphanumeric stripping, tokenization, stop-word removal,
# lemmatization, case folding). It is only evaluated as a string and is
# never executed.
'''#after
text = re.sub('[^A-Za-z0-9]+',' ', text) 
#tokenization
tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
tokens = tokenizer.tokenize(text)
tokens = [token for token in tokens if token not in stops]
#lemmatization
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
text = ' '.join(lemmatized_tokens)
#print("\n",text,"----",type(text))
text = lowercase_except_caps(text)'''

'#after\ntext = re.sub(\'[^A-Za-z0-9]+\',\' \', text) \n#tokenization\ntokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)\ntokens = tokenizer.tokenize(text)\ntokens = [token for token in tokens if token not in stops]\n#lemmatization\nlemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]\ntext = \' \'.join(lemmatized_tokens)\n#print("\n",text,"----",type(text))\ntext = lowercase_except_caps(text)'