In [1]:
import subprocess
import sys

# Third-party packages this notebook depends on
required_packages = [
    "pandas",
    "emoji",
    "nltk",
    "langdetect",
    "swifter"
]

# Install each package with the interpreter that runs this kernel.
# Calling a bare "pip" picks whatever is first on PATH, which may belong to a
# different environment than the one the notebook imports from.
for package in required_packages:
    result = subprocess.run([sys.executable, "-m", "pip", "install", package])
    if result.returncode != 0:
        # Surface the failure instead of discarding pip's exit status; keep
        # going so the remaining packages still get a chance to install.
        print(
            f"WARNING: 'pip install {package}' exited with code {result.returncode}",
            file=sys.stderr,
        )

# Download necessary NLTK resources
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
 # Fetch the 'punkt_tab' tokenizer tables in addition to 'punkt'.
 # NOTE(review): presumably required by word_tokenize in the installed NLTK
 # release — confirm, and consider folding this into the setup cell.
 import nltk
 nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [3]:
import pandas as pd
import re
import emoji
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from langdetect import detect, DetectorFactory
import swifter

# Pin langdetect's seed so language labels are reproducible between runs
DetectorFactory.seed = 0

# Download the NLTK resources the pipeline relies on
for nltk_resource in ("punkt", "stopwords", "wordnet"):
    nltk.download(nltk_resource)

# Column that holds the raw text to be cleaned
column_name = "text"

# Collection words removed in Step 6 (modify as needed)
collection_words = {"example", "data", "sample"}

# Read the raw dataset from disk
input_file = "mahakumbh_dataset.csv"
df = pd.read_csv(input_file)

# Function to detect language
def detect_language(text):
    """Return the detected language code for ``text``, or ``"unknown"``.

    Args:
        text: The text to classify.

    Returns:
        Whatever ``detect`` returns for ``text`` when it has at least 5
        characters; ``"unknown"`` for shorter input or when measuring or
        detecting raises an error.
    """
    try:
        return detect(text) if len(text) >= 5 else "unknown"
    except Exception:
        # Best-effort labelling: detection errors map to "unknown".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long-running apply can still be stopped.
        return "unknown"

# Normalise the text column to strings once and keep the result: missing
# values become "", so the regex-based steps below never receive a non-string
# (re.sub raises TypeError on NaN). Empty strings are labelled "unknown".
df[column_name] = df[column_name].fillna("").astype(str)

# Detect language once and store it
df["lang"] = df[column_name].swifter.apply(detect_language)

# --- Step 1: Remove Mentions, Hashtags, and Retweets ---
# Matches @mentions, #hashtags and the retweet marker "RT" followed by
# whitespace. The \b keeps "RT" from matching the tail of a longer word
# (e.g. "SMART phone" must not become "SMAphone").
_TWEET_NOISE_RE = re.compile(r"@\w+|#\w+|\bRT\s+")


def remove_tweets(text):
    """Strip @mentions, #hashtags and retweet markers from ``text``.

    Args:
        text: Raw tweet text.

    Returns:
        ``text`` with every match removed and leading/trailing whitespace
        trimmed. Whitespace left between the remaining words is not collapsed.
    """
    return _TWEET_NOISE_RE.sub("", text).strip()

df[column_name] = df[column_name].swifter.apply(remove_tweets)

# --- Step 2: Remove URLs ---
def _strip_urls(text):
    """Delete every run of non-space characters that starts like a URL."""
    return re.sub(r"http\S+|www\S+|https\S+", "", text)

df[column_name] = df[column_name].swifter.apply(_strip_urls)

# --- Step 3: Convert Emojis to Text ---
def _emojis_to_text(text):
    """Replace each emoji with its name, padded by a space on both sides."""
    return emoji.demojize(text, delimiters=(" ", " "))

df[column_name] = df[column_name].swifter.apply(_emojis_to_text)

# --- Step 4: Remove Stopwords (Before Tokenization) ---
# The detector labels rows with ISO language codes, while NLTK names its
# stopword lists by English language name. Codes absent from this table are
# passed through unchanged, so callers may also supply an NLTK name directly.
_STOPWORD_LANG_NAMES = {
    "ar": "arabic",
    "bn": "bengali",
    "ca": "catalan",
    "da": "danish",
    "de": "german",
    "el": "greek",
    "en": "english",
    "es": "spanish",
    "fi": "finnish",
    "fr": "french",
    "he": "hebrew",
    "hu": "hungarian",
    "id": "indonesian",
    "it": "italian",
    "ne": "nepali",
    "nl": "dutch",
    "no": "norwegian",
    "pt": "portuguese",
    "ro": "romanian",
    "ru": "russian",
    "sl": "slovene",
    "sv": "swedish",
    "tr": "turkish",
}

# Stopword sets already loaded, keyed by NLTK language name, so the corpus
# file is read once per language rather than once per row.
_STOPWORD_CACHE = {}


def _stopwords_for(lang):
    """Return the stopword set for ``lang`` (empty if none can be loaded)."""
    try:
        corpus_name = _STOPWORD_LANG_NAMES.get(lang, lang)
        if corpus_name not in _STOPWORD_CACHE:
            try:
                loaded = frozenset(stopwords.words(corpus_name))
            except Exception:
                # No list for this language (or corpus not downloaded).
                loaded = frozenset()
            _STOPWORD_CACHE[corpus_name] = loaded
        return _STOPWORD_CACHE[corpus_name]
    except Exception:
        # e.g. an unhashable or otherwise unusable language label
        return frozenset()


def remove_stopwords(text, lang):
    """Drop stopwords of language ``lang`` from ``text``.

    Args:
        text: Text to filter; split on whitespace before matching.
        lang: Language label for the row: an ISO code such as "en"/"fr", an
            NLTK stopword-list name such as "french", or "unknown".

    Returns:
        The remaining words joined by single spaces. ``text`` is returned
        unchanged when ``lang`` is "unknown", when no stopword list is
        available for the language, or when ``text`` is not a string.
    """
    if lang == "unknown" or not isinstance(text, str):
        return text
    stop_words = _stopwords_for(lang)
    if not stop_words:
        return text  # Return original text if stopwords unavailable
    # Temporary whitespace split; real tokenization happens later.
    return " ".join(word for word in text.split() if word.lower() not in stop_words)

def _row_without_stopwords(row):
    """Filter one row's text using that row's detected language."""
    return remove_stopwords(row[column_name], row["lang"])

df[column_name] = df.swifter.apply(_row_without_stopwords, axis=1)

# --- Step 5: Stemming & Lemmatization ---
# Single WordNet lemmatizer instance shared by apply_stemming_lemmatization
lemmatizer = WordNetLemmatizer()

# SnowballStemmer is constructed with a language *name* ("english"), but rows
# are labelled with ISO codes ("en"). Without this translation the constructor
# raises for every row and the text is returned unstemmed. Codes absent from
# this table are passed through unchanged, so a Snowball name also works.
_SNOWBALL_LANG_NAMES = {
    "ar": "arabic",
    "da": "danish",
    "de": "german",
    "en": "english",
    "es": "spanish",
    "fi": "finnish",
    "fr": "french",
    "hu": "hungarian",
    "it": "italian",
    "nl": "dutch",
    "no": "norwegian",
    "pt": "portuguese",
    "ro": "romanian",
    "ru": "russian",
    "sv": "swedish",
}

# One stemmer per language, built on first use instead of once per row.
_STEMMER_CACHE = {}


def apply_stemming_lemmatization(text, lang):
    """Stem, then lemmatize, every whitespace-separated word of ``text``.

    Args:
        text: Text to normalise.
        lang: Language label for the row: an ISO code such as "en", a
            Snowball language name such as "english", or "unknown".

    Returns:
        The processed words joined by single spaces. ``text`` is returned
        unchanged when ``lang`` is "unknown" or when stemming is not possible
        (for example, no Snowball stemmer exists for the language).
    """
    if lang == "unknown":
        return text
    try:
        stemmer_name = _SNOWBALL_LANG_NAMES.get(lang, lang)
        stemmer = _STEMMER_CACHE.get(stemmer_name)
        if stemmer is None:
            stemmer = SnowballStemmer(stemmer_name)
            _STEMMER_CACHE[stemmer_name] = stemmer
        # NOTE(review): the lemmatizer runs on the stemmed form for every
        # language, as before — confirm that order is what is wanted.
        return " ".join(lemmatizer.lemmatize(stemmer.stem(word)) for word in text.split())
    except Exception:
        return text  # Return original text if stemming fails

df[column_name] = df.swifter.apply(lambda row: apply_stemming_lemmatization(row[column_name], row["lang"]), axis=1)

# --- Step 6: Remove Collection Words ---
# Match case-insensitively, the same way the stopword filter lowercases each
# word before comparing; otherwise "Data" or "SAMPLE" would slip through.
# Words are compared in the form they have after Step 5, so entries in
# collection_words must be given in that form.
_collection_words_lower = {word.lower() for word in collection_words}

def _drop_collection_words(text):
    """Remove every whitespace-separated word found in collection_words."""
    return " ".join(word for word in text.split() if word.lower() not in _collection_words_lower)

df[column_name] = df[column_name].swifter.apply(_drop_collection_words)

# --- Step 7: Tokenization (Moved to End) ---
df[column_name] = df[column_name].swifter.apply(word_tokenize)

# --- Save Final Preprocessed Dataset ---
# NOTE(review): the text column now holds token lists; CSV has no list type,
# so confirm downstream readers expect their string form.
df.to_csv("preprocessed_mahakumbh_data.csv", index=False)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Pandas Apply:   0%|          | 0/11000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/11000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/11000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/11000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/11000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/11000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/11000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/11000 [00:00<?, ?it/s]

In [4]:
import pandas as pd

# Load the final preprocessed dataset
df = pd.read_csv("preprocessed_mahakumbh_data.csv")

# Keep only rows where 'lang' is 'en'
df = df[df["lang"] == "en"]

# Save the filtered dataset and confirm where it went; the cell's recorded
# output shows this message, so print it to keep a re-run reproducible.
output_file = "english_mahakumbh_data.csv"
df.to_csv(output_file, index=False)
print(f"Filtered dataset saved as '{output_file}'.")

Filtered dataset saved as 'english_mahakumbh_data.csv'.
