In [1]:
# Loading packages
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
# Reading both csv files 
df_sarcastic = pd.read_csv('Sarcasm_Hindi_Tweets-SARCASTIC.csv')
df_non_sarcastic = pd.read_csv('Sarcasm_Hindi_Tweets-NON-SARCASTIC.csv')

#Providing labels as sarcastic and non-sarcastic
df_sarcastic['label'] = 'sarcastic'
df_non_sarcastic['label'] = 'non_sarcastic'

#Concatenating them
df = pd.concat([df_sarcastic, df_non_sarcastic], axis=0)

#Dropping off columns that we dont need
df = df.drop(['username','acctdesc','location','following','followers', 'totaltweets', 'usercreatedts', 'tweetcreatedts', 'retweetcount', 'hashtags'] ,axis=1)
print(df)

                                                    text          label
0      हमारे गांव में एक मास्टर जी रहते थे, वो मोहल्ल...      sarcastic
1      हमारे गांव में एक मास्टर जी रहते थे, वो मोहल्ल...      sarcastic
2      हमारे गांव में एक मास्टर जी रहते थे, वो मोहल्ल...      sarcastic
3      हमारे गांव में एक मास्टर जी रहते थे, वो मोहल्ल...      sarcastic
4      हमारे गांव में एक मास्टर जी रहते थे, वो मोहल्ल...      sarcastic
...                                                  ...            ...
10123  स्वर्गीय श्री सुशांत सिंह राजपूत ने हमें छिछोर...  non_sarcastic
10124  बॉलीवुड एक्टर #सुशांत_सिंह_राजपूत ने #फांसी लग...  non_sarcastic
10125  #SushantSinghRajput #Suicide  क्यों?? चमक धमक ...  non_sarcastic
10126  हर हाल में #खुश रहना सीख लो🙃\n        उस दिल क...  non_sarcastic
10127  इतना कुछ पाने के बाद भी अगर इंसान जिदंगी से हा...  non_sarcastic

[16179 rows x 2 columns]


In [3]:
#Reseting the index 
df = df.reset_index()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16179 entries, 0 to 16178
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   index   16179 non-null  int64 
 1   text    16179 non-null  object
 2   label   16179 non-null  object
dtypes: int64(1), object(2)
memory usage: 379.3+ KB


In [4]:
#Dropping off the index column
df = df.drop('index',axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16179 entries, 0 to 16178
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    16179 non-null  object
 1   label   16179 non-null  object
dtypes: object(2)
memory usage: 252.9+ KB


In [15]:
# Importing libraries and packages 
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [6]:
url_pattern = re.compile(r'http\S+|www\S+')
username_pattern = re.compile(r'@[^\s]+')
hashtag_pattern = re.compile(r'#\w+')
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons, which includes smileys and other face expressions.
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs like weather symbols, landscapes, buildings, etc.
        u"\U0001F680-\U0001F6FF"  # transport & map symbols such as vehicles, traffic signs, etc.
        u"\U0001F1E0-\U0001F1FF"  # regional indicator symbols used for flag representation (e.g., country flags).
                           "]+", flags=re.UNICODE) #ensures that the regular expression operates in Unicode mode.

In [7]:
# Dropping off duplicates
df = df.drop_duplicates(keep='first')# dropping duplicates except the first occurence
df.tail()

Unnamed: 0,text,label
10095,https://t.co/HnIURjsvMT अंतर्राष्ट्रीय योग दिव...,non_sarcastic
12116,@Jaihindsonu59 @SinghChunni @SATYAPR27477125 क...,non_sarcastic
14144,https://t.co/qzgPRYlS1h प्रदेश में आज से कोरोन...,non_sarcastic
14146,"Zyada achey b mat bano, yad rkho k meethey pha...",non_sarcastic
14147,"zindagi ki yhi reet hai, haar k baad hi jeet h...",non_sarcastic


In [8]:
# Concatenating both the stopwords(words which don't have significant meaning in a statement/text) 
# This combined list can then be used in subsequent text processing tasks to remove common, non-informative words from text data
stopwords_hindi = ['आपका','अंदर', 'अत', 'अपना', 'अपनी', 'अपने', 'अभी', 'आदि', 'आप',
                   'इत्यादि', 'इन', 'इनका', 'इन्हीं', 'इन्हें', 'इन्हों', 'इस', 'इसका', 'इसकी',
                   'इसके', 'इसमें', 'इसी', 'इसे', 'उन', 'उनका', 'उनकी', 'उनके', 'उनको',
                   'उन्हीं', 'उन्हें', 'उन्हों', 'उस', 'उसके', 'उसी', 'उसे', 'एक', 'एवं', 'एस',
                   'ऐसे', 'और', 'कई', 'कर','करता', 'करते', 'करना', 'करने', 'करें', 'कहते', 'कहा',
                   'का', 'काफ़ी', 'कि', 'कितना', 'किन्हें', 'किन्हों', 'किया', 'किर', 'किस', 'किसी', 'किसे',
                   'की', 'कुछ', 'कुल', 'के', 'को', 'कोई', 'कौन', 'कौनसा', 'गया', 'घर', 'जब', 'जहाँ',
                   'जा', 'जितना', 'जिन', 'जिन्हें', 'जिन्हों', 'जिस', 'जिसे', 'जीधर', 'जैसा', 'जैसे', 'जो',
                   'तक', 'तब', 'तरह', 'तिन', 'तिन्हें', 'तिन्हों', 'तिस', 'तिसे', 'तो', 'था', 'थी', 'थे',
                   'दबारा', 'दिया', 'दुसरा', 'दूसरे', 'दो', 'द्वारा', 'न', 'नहीं', 'ना', 'निहायत', 'नीचे', 'ने',
                   'पर', 'पर', 'पहले', 'पूरा', 'पे', 'फिर', 'बनी', 'बही', 'बहुत', 'बाद', 'बाला', 'बिलकुल',
                   'भी', 'भीतर', 'मगर', 'मानो', 'मे', 'में', 'यदि', 'यह', 'यहाँ', 'यही', 'या'
                  'तुम','मेरी','मुझे','क्योंकि','हम','प्रति','अबकी','आगे','माननीय','शहर','बताएं','कौनसी'
                   'क्लिक','किसकी','बड़े','मैं','and','रही','आज','लें','आपके','मिलकर','सब','मेरे','जी'
                   'श्री', 'यिह', 'ये', 'रखें', 'रहा', 'रहे', 'ऱ्वासा', 'लिए', 'लिये', 'लेकिन', 'व', 'वर्ग', 'वह',
                   'वहाँ', 'वहीं', 'वाले', 'वुह', 'वे', 'वग़ैरह', 'संग', 'सकता', 'सकते', 'सबसे', 'सभी', 'साथ',
                   'साबुत', 'साभ', 'सारा', 'से', 'सो', 'ही', 'हुआ', 'हुई', 'हुए', 'है', 'हैं', 'हो', 'होता', 'होती',
                   'होते', 'होना', 'होने', 'अपनि', 'जेसे', 'होति', 'सभि', 'तिंहों', 'इंहों', 'दवारा', 'इसि', 'किंहें',
                   'थि', 'उंहों', 'ओर', 'जिंहें', 'वहिं', 'अभि', 'बनि', 'हि', 'उंहिं', 'उंहें', 'हें', 'वगेरह', 'एसे',
                   'रवासा', 'कोन', 'निचे', 'काफि', 'उसि', 'पुरा', 'भितर', 'हे', 'बहि', 'वहां', 'कोइ', 'यहां',
                   'जिंहों', 'तिंहें', 'किसि', 'कइ', 'यहि', 'इंहिं', 'जिधर', 'इंहें', 'अदि', 'इतयादि', 'हुइ', 'कोनसा',
                   'इसकि', 'दुसरे', 'जहां', 'अप', 'किंहों', 'उनकि', 'भि', 'वरग', 'हुअ', 'जेसा', 'नहिं']

stopwords_english = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're",
                "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him',
                'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its',
                'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who',
                'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was',
                'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did',
                'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
                'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
                'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
                'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
                'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
                'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so',
                'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should',
                "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't",
                'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
                "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
                "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn',
                "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

stopwords = set(stopwords_hindi + stopwords_english)

In [9]:
# Defining cleaning functions
def clean_text(text):
    text = url_pattern.sub('', text)  # Remove URLs
    text = username_pattern.sub('', text)  # Remove usernames
    text = hashtag_pattern.sub('<Hash>', text)  # Replace hashtags
    text = emoji_pattern.sub('', text)  # Remove emojis
    words = text.split()
    words = [word.lower() for word in words if word.lower() not in stopwords]
    return ' '.join(words)

In [10]:
# Applying the cleaning function to text data
df['clean_text'] = df['text'].apply(clean_text)

In [11]:
df.tail()

Unnamed: 0,text,label,clean_text
10095,https://t.co/HnIURjsvMT अंतर्राष्ट्रीय योग दिव...,non_sarcastic,अंतर्राष्ट्रीय योग दिवस उपलक्ष फिट योग संस्थान...
12116,@Jaihindsonu59 @SinghChunni @SATYAPR27477125 क...,non_sarcastic,कितने <hash>ीरान दिन सूनी <hash>ाते... हर <has...
14144,https://t.co/qzgPRYlS1h प्रदेश में आज से कोरोन...,non_sarcastic,प्रदेश कोरोना जन जागरूकता अभियान 21 जून 30 जून...
14146,"Zyada achey b mat bano, yad rkho k meethey pha...",non_sarcastic,"zyada achey b mat bano, yad rkho k meethey pha..."
14147,"zindagi ki yhi reet hai, haar k baad hi jeet h...",non_sarcastic,"zindagi ki yhi reet hai, haar k baad hi jeet h..."


In [12]:
# Tokenizing the text
df['tokens'] = df['clean_text'].apply(lambda x: x.split())

In [13]:
# Flattening list of tokens to fit LabelEncoder input format
all_tokens = [token for sublist in df['tokens'].tolist() for token in sublist]
unique_tokens = list(set(all_tokens))

In [16]:
# Label Encoding
encoder = LabelEncoder()
encoder.fit(unique_tokens)

In [17]:
# Transforming tokens into label encoded vectors and applying them to tokens
def encode_tokens(tokens):
    return encoder.transform(tokens)
df['encoded_tokens'] = df['tokens'].apply(encode_tokens)

In [18]:
# Padding encoded tokens to ensure equal length vectors
# This function is designed to ensure that all encoded token sequences have the same length by padding shorter sequences with zeros.
# This is necessary because machine learning models typically require input data to be of uniform shape. 
max_length = df['encoded_tokens'].apply(len).max()
df['padded_encoded_tokens'] = df['encoded_tokens'].apply(lambda x: np.pad(x, (0, max_length - len(x)), 'constant'))

# Convert padded_encoded_tokens to a regular NumPy array
X = np.array(df['padded_encoded_tokens'].tolist())
y = df['label'].values

In [19]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
# Model Training
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [21]:
# Predictions
y_pred = rf.predict(X_test)

In [22]:
# Model Evaluation
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.85
