## Data Processing

In [1]:
import pandas as pd
import krippendorff
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [6]:
df = pd.read_csv("code-mixed-hindi-rh-2300-15-7.csv")

In [7]:
df

Unnamed: 0,clean_text,classification,annotation,agreement
0,secular secular bol bolkar tumlogo ne ishko...,Racial-Hoax,Racial-Hoax,1
1,Inko aatankwadi ghosit karke goli Mar deni cha...,Racial-Hoax,Racial-Hoax,1
2,Yeh Jo Na jisse Fatima Naam Ki Ladki Hai sali ...,Racial-Hoax,Racial-Hoax,1
3,Aaj bhi rajusthan key Kai geon hi jaha pura ge...,Non Racial-Hoax,Non Racial-Hoax,1
4,Sir I support you shi bol rhe hai sir sadi apn...,Non Racial-Hoax,Non Racial-Hoax,1
...,...,...,...,...
2608,Aisa teacher bachhon ko kya padayega jiski So...,Non Racial-Hoax,Non Racial-Hoax,1
2609,Modi ji Modi documentary pr nhi kuch bol diz...,Non Racial-Hoax,Non Racial-Hoax,1
2610,Mujhe Aaj pata Chala savarn itne bade jaativad...,Racial-Hoax,Racial-Hoax,1
2611,Aur tumhe ne apne channel ka naam The Lallanto...,Non Racial-Hoax,Non Racial-Hoax,1


In [8]:
df = df.drop(columns=['agreement'])

In [9]:
df.isna().sum()

clean_text        0
classification    0
annotation        0
dtype: int64

In [10]:
len(df[df['classification'].isnull()])

0

In [11]:
df[df['annotation'].isnull()]

Unnamed: 0,clean_text,classification,annotation


In [12]:
df['annotation_binary'] = df['annotation'].map({'Racial-Hoax': 1, 'Non Racial-Hoax': 0})

In [13]:
df['classification_binary'] = df['classification'].map({'Racial-Hoax': 1, 'Non Racial-Hoax': 0})

In [14]:
df

Unnamed: 0,clean_text,classification,annotation,annotation_binary,classification_binary
0,secular secular bol bolkar tumlogo ne ishko...,Racial-Hoax,Racial-Hoax,1,1
1,Inko aatankwadi ghosit karke goli Mar deni cha...,Racial-Hoax,Racial-Hoax,1,1
2,Yeh Jo Na jisse Fatima Naam Ki Ladki Hai sali ...,Racial-Hoax,Racial-Hoax,1,1
3,Aaj bhi rajusthan key Kai geon hi jaha pura ge...,Non Racial-Hoax,Non Racial-Hoax,0,0
4,Sir I support you shi bol rhe hai sir sadi apn...,Non Racial-Hoax,Non Racial-Hoax,0,0
...,...,...,...,...,...
2608,Aisa teacher bachhon ko kya padayega jiski So...,Non Racial-Hoax,Non Racial-Hoax,0,0
2609,Modi ji Modi documentary pr nhi kuch bol diz...,Non Racial-Hoax,Non Racial-Hoax,0,0
2610,Mujhe Aaj pata Chala savarn itne bade jaativad...,Racial-Hoax,Racial-Hoax,1,1
2611,Aur tumhe ne apne channel ka naam The Lallanto...,Non Racial-Hoax,Non Racial-Hoax,0,0


## Calculating Krippendorff's alpha Value for the code mixed Hindi Racial hoax Dataset

In [15]:
data = df[['annotation_binary', 'classification_binary']].to_numpy()

In [16]:
# Transpose the data so that each column represents an item, and each row represents an annotator
data_t = data.T

# Calculate Krippendorff's alpha at nominal
alpha = krippendorff.alpha(reliability_data=data_t, level_of_measurement='nominal')

print(f"Krippendorff's alpha: {alpha}")

Krippendorff's alpha: 0.7676892784552846


In [17]:
# Calculate Krippendorff's alpha at interval
alpha = krippendorff.alpha(reliability_data=data_t, level_of_measurement='interval')

print(f"Krippendorff's alpha: {alpha}")

Krippendorff's alpha: 0.7676892784552846


In [18]:
df = df.drop(columns=['annotation','annotation_binary','classification'])

In [19]:
df

Unnamed: 0,clean_text,classification_binary
0,secular secular bol bolkar tumlogo ne ishko...,1
1,Inko aatankwadi ghosit karke goli Mar deni cha...,1
2,Yeh Jo Na jisse Fatima Naam Ki Ladki Hai sali ...,1
3,Aaj bhi rajusthan key Kai geon hi jaha pura ge...,0
4,Sir I support you shi bol rhe hai sir sadi apn...,0
...,...,...
2608,Aisa teacher bachhon ko kya padayega jiski So...,0
2609,Modi ji Modi documentary pr nhi kuch bol diz...,0
2610,Mujhe Aaj pata Chala savarn itne bade jaativad...,1
2611,Aur tumhe ne apne channel ka naam The Lallanto...,0


In [20]:
df['classification_binary'].value_counts()

classification_binary
0    1898
1     715
Name: count, dtype: int64

## Calculating Tokens,Number of Posts, Number of Sentences, Average number of Tokens per post,Average number of sentences per post 

In [21]:
##Remove excessive internal whitespace
df['clean_text'] = df['clean_text'].str.split().str.join(' ')

In [22]:
# Convert to lowercase
df['clean_text'] = df['clean_text'].str.lower()

In [23]:
df['clean_text']

0       secular secular bol bolkar tumlogo ne ishko is...
1       inko aatankwadi ghosit karke goli mar deni cha...
2       yeh jo na jisse fatima naam ki ladki hai sali ...
3       aaj bhi rajusthan key kai geon hi jaha pura ge...
4       sir i support you shi bol rhe hai sir sadi apn...
                              ...                        
2608    aisa teacher bachhon ko kya padayega jiski soc...
2609    modi ji modi documentary pr nhi kuch bol dizye...
2610    mujhe aaj pata chala savarn itne bade jaativad...
2611    aur tumhe ne apne channel ka naam the lallanto...
2612    didi ka baat toh sunega hi kiu khi raheneka jy...
Name: clean_text, Length: 2613, dtype: object

In [24]:
duplicate_rows = df[df.duplicated(subset='clean_text', keep=False)]

print("Rows with duplicate 'clean_text' values:")
print(duplicate_rows)

Rows with duplicate 'clean_text' values:
                                             clean_text  classification_binary
1671  kya karan h ye jaatiyon ka bolbaala sirf bhara...                      1
1673  free ka chandan ghis mere nandan kejrudin or d...                      1
1676  kya karan h ye jaatiyon ka bolbaala sirf bhara...                      0
1677  free ka chandan ghis mere nandan kejrudin or d...                      1


In [25]:
df_no_duplicates = df.drop_duplicates(subset='clean_text', keep='first')

In [26]:
df_no_duplicates.shape

(2611, 2)

In [27]:
df_no_duplicates['clean_text'] = df_no_duplicates['clean_text'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_duplicates['clean_text'] = df_no_duplicates['clean_text'].astype(str)


In [28]:
def count_tokens(text):
    tokens = word_tokenize(text)
    return len(tokens)

def count_sentences(text):
    sentences = sent_tokenize(text)
    return len(sentences)

In [29]:
df_no_duplicates['Num_Tokens'] = df_no_duplicates['clean_text'].apply(count_tokens)
df_no_duplicates['Num_Sentences'] = df_no_duplicates['clean_text'].apply(count_sentences)

average_tokens_per_post = df_no_duplicates['Num_Tokens'].mean()
average_sentences_per_post = df_no_duplicates['Num_Sentences'].mean()
total_tokens = df_no_duplicates['Num_Tokens'].sum()
total_senetences = df_no_duplicates['Num_Sentences'].sum()

print("DataFrame with token and sentence counts:")
print(df_no_duplicates.shape)
print(f"total number of tokens : {total_tokens}")
print(f"total number of sentences : {total_senetences}")
print(f"Average number of tokens per post: {average_tokens_per_post}")
print(f"Average number of sentences per post: {average_sentences_per_post}")


DataFrame with token and sentence counts:
(2611, 4)
total number of tokens : 79304
total number of sentences : 2611
Average number of tokens per post: 30.373037150517042
Average number of sentences per post: 1.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_duplicates['Num_Tokens'] = df_no_duplicates['clean_text'].apply(count_tokens)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_duplicates['Num_Sentences'] = df_no_duplicates['clean_text'].apply(count_sentences)


In [30]:
def collect_tokens(text):
    tokens = word_tokenize(text.lower())  # Convert text to lowercase to avoid case-sensitive duplicates
    return tokens

# Apply the function to each post and collect all tokens
all_tokens = df_no_duplicates['clean_text'].apply(collect_tokens).sum()

# Get unique tokens
unique_tokens = set(all_tokens)
vocab_size = len(unique_tokens)


In [31]:
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 12859


## Stratified Train-test Split 

In [32]:
from sklearn.model_selection import train_test_split


train_val, test = train_test_split(df_no_duplicates, test_size=0.2, stratify=df_no_duplicates['classification_binary'], random_state=42)
train, val = train_test_split(train_val, test_size=0.25, stratify=train_val['classification_binary'], random_state=42)

print("Training set:")
print(train)
print("\nValidation set:")
print(val)
print("\nTest set:")
print(test)

Training set:
                                             clean_text  \
858   matlab ek naale saaf karne wale ka beta jo kab...   
823   jo marcuka he unhone vi dekhne wala he yesuko ...   
234   kaha gaye hamare thakur pandit bhai jo reserva...   
2325  salo sirf mike aur stage pe bolna jante ho ,du...   
1575  chalo andhbhakt ye to maan gye ki wo doglapan ...   
...                                                 ...   
2481  badhiya hai is pehchan mai fayda haikaas mai b...   
1574  ye chutiya or ye log apne behno ke nhi hote or...   
2126  ye gaddar hai aur hamesha rahenge jo kah lo in...   
1281  bu3yk to bhai tu ladta rah aur apne bachcho ko...   
957   jaatiwaad se bahut fayeda hai, jati samaj me p...   

      classification_binary  Num_Tokens  Num_Sentences  
858                       0          48              1  
823                       0          25              1  
234                       0          22              1  
2325                      1          26          

In [33]:
train['classification_binary'].value_counts()

classification_binary
0    1138
1     428
Name: count, dtype: int64

In [34]:
val['classification_binary'].value_counts()

classification_binary
0    379
1    143
Name: count, dtype: int64

In [35]:
test['classification_binary'].value_counts()

classification_binary
0    380
1    143
Name: count, dtype: int64

In [32]:
train.to_csv('data/train.csv', index=False, encoding='utf-8')

In [33]:
test.to_csv('data/test.csv', index=False, encoding='utf-8')

In [34]:
val.to_csv('data/val.csv', index=False, encoding='utf-8')

In [36]:
df_no_duplicates['Num_Tokens'].max()

74