In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import string 
import emoji
import re

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords

In [3]:
# Load the sample dataset from a CSV in the working directory and display it.
df = pd.read_csv("sample_df.csv")
df

Unnamed: 0,text,target,Word Count
0,NEW DELHI: The Andhra Pradesh Public Service C...,academic interests,239
1,PUNE: Two weeks after the new academic year ha...,academic interests,500
2,GUWAHATI: The results of the CBSE Class X exam...,academic interests,470
3,"admission into the IIMs,"" said Kapoor. Across ...",academic interests,108
4,Mangaluru: The Mangalore Institute of Technolo...,academic interests,336
...,...,...,...
519995,NAGPUR: Akshay Zadgaonkar is a child prodigy. ...,video gaming,500
519996,\nBayonetta lead Hideki Kamiya reckons the lat...,video gaming,290
519997,\nAl Pacino thinks the original Godfather is b...,video gaming,259
519998,The latest episode of Imlie begins with Aryan ...,video gaming,251


In [4]:
# Number of distinct values in the `target` column.
print(len(pd.unique(df.target)))

26


In [5]:
# List the distinct values of the `target` column.
pd.unique(df.target)

array(['academic interests', 'arts and culture', 'automotives',
       'books and literature', 'business and finance', 'careers',
       'family and relationships', 'food and drinks', 'health',
       'healthy living', 'hobbies and interests', 'home and garden',
       'movies', 'music and audio', 'news and politics',
       'personal finance', 'pets',
       'pharmaceuticals, conditions, and symptoms', 'real estate',
       'shopping', 'sports', 'style and fashion',
       'technology and computing', 'television', 'travel', 'video gaming'],
      dtype=object)

### Lowercase the dataset

In [6]:
# Cast to str *before* lowercasing: the .str accessor returns NaN for any
# non-string cell, and a later astype(str) would turn that NaN into the
# literal token "nan". Missing texts are filled with "" for the same reason.
df["text"] = df["text"].fillna("").astype(str).str.lower()
df.dtypes

text          object
target        object
Word Count     int64
dtype: object

### Removing Punctuation

In [7]:
# Removing punctuations using string module

def remove_punct(text):
    """Delete every ``string.punctuation`` character from ``text``.

    Args:
        text: Value taken from the text column.

    Returns:
        For a ``str``: the same string without punctuation characters.
        For a non-string, non-missing value: ``str(text)`` (returned as-is,
        punctuation is not stripped from it).
        For a missing value (``pd.notna`` is false): the value unchanged.
    """
    if not isinstance(text, str):
        # Missing values pass through untouched; anything else is stringified.
        return str(text) if pd.notna(text) else text

    strip_table = str.maketrans("", "", string.punctuation)
    return text.translate(strip_table)

# Strip punctuation from every row of the text column, then display the frame.
df['text'] = df['text'].apply(remove_punct)
df

# Removing punctuations using regex 

# import re
# res = ''
# for i in ex:
#     word = re.sub(r'([^\w\s]|_)', '', i)
#     res += word
# res

Unnamed: 0,text,target,Word Count
0,new delhi the andhra pradesh public service co...,academic interests,239
1,pune two weeks after the new academic year has...,academic interests,500
2,guwahati the results of the cbse class x exams...,academic interests,470
3,admission into the iims said kapoor across 13 ...,academic interests,108
4,mangaluru the mangalore institute of technolog...,academic interests,336
...,...,...,...
519995,nagpur akshay zadgaonkar is a child prodigy hi...,video gaming,500
519996,\nbayonetta lead hideki kamiya reckons the lat...,video gaming,290
519997,\nal pacino thinks the original godfather is b...,video gaming,259
519998,the latest episode of imlie begins with aryan ...,video gaming,251


### Removing URLs and Emojis

In [8]:
def removeURLandEmoji(text):
    """Strip URL-like tokens and emojis from ``text``.

    Args:
        text (str): Input text.

    Returns:
        str: ``text`` with every whitespace-delimited run starting at
        ``http`` or ``www`` removed, and every emoji removed.
    """
    # `https\S+` is already covered by `http\S+`, and re.MULTILINE has no
    # effect without ^/$ anchors, so both were dropped.
    text = re.sub(r"http\S+|www\S+", "", text)

    # Turn each emoji into its ":alias:" text form so a regex can delete it.
    text = emoji.demojize(text)

    # Only delete whitespace-free ":alias:" tokens. The previous pattern
    # (":[^:]+:") also swallowed ordinary prose sitting between two colons,
    # e.g. "note: some words: more" lost " some words".
    # NOTE(review): assumes demojize aliases never contain whitespace
    # (they are written with underscores) - confirm for the installed version.
    text = re.sub(r":[^:\s]+:", "", text)

    return text
    
# Remove URLs and emojis from every row of the text column, then display the frame.
df.text = df.text.apply(removeURLandEmoji)
df

Unnamed: 0,text,target,Word Count
0,new delhi the andhra pradesh public service co...,academic interests,239
1,pune two weeks after the new academic year has...,academic interests,500
2,guwahati the results of the cbse class x exams...,academic interests,470
3,admission into the iims said kapoor across 13 ...,academic interests,108
4,mangaluru the mangalore institute of technolog...,academic interests,336
...,...,...,...
519995,nagpur akshay zadgaonkar is a child prodigy hi...,video gaming,500
519996,\nbayonetta lead hideki kamiya reckons the lat...,video gaming,290
519997,\nal pacino thinks the original godfather is b...,video gaming,259
519998,the latest episode of imlie begins with aryan ...,video gaming,251


In [9]:
def remove_Emails_and_Numbers(text):
    """Delete e-mail-like tokens and standalone numbers from ``text``.

    Args:
        text (str): Input text.

    Returns:
        str: ``text`` with e-mail-shaped tokens (``something@something.tld``)
        removed first, then every run of digits that is delimited by word
        boundaries. Digits glued to letters (e.g. ``a1``) are kept, and the
        surrounding whitespace is left in place.
    """
    without_emails = re.sub(r'\S+@\S+\.\S+', '', text)
    return re.sub(r'\b\d+\b', '', without_emails)

# Remove e-mail addresses and standalone numbers from every row, then display the frame.
df.text = df.text.apply(remove_Emails_and_Numbers)
df

Unnamed: 0,text,target,Word Count
0,new delhi the andhra pradesh public service co...,academic interests,239
1,pune two weeks after the new academic year has...,academic interests,500
2,guwahati the results of the cbse class x exams...,academic interests,470
3,admission into the iims said kapoor across ii...,academic interests,108
4,mangaluru the mangalore institute of technolog...,academic interests,336
...,...,...,...
519995,nagpur akshay zadgaonkar is a child prodigy hi...,video gaming,500
519996,\nbayonetta lead hideki kamiya reckons the lat...,video gaming,290
519997,\nal pacino thinks the original godfather is b...,video gaming,259
519998,the latest episode of imlie begins with aryan ...,video gaming,251


In [11]:
import codecs

def removeSpecialChar(text):
    """Keep only ASCII letters and whitespace in ``text``.

    Args:
        text (str): Input text.

    Returns:
        str: ``text`` with every character that is neither an ASCII letter
        nor whitespace removed, and with newlines and non-breaking spaces
        (U+00A0) replaced by a regular space.
    """
    pattern = r'[^a-zA-Z\s]'

    cleaned_text = re.sub(pattern, '', text)
    cleaned_text = cleaned_text.replace('\n', ' ')  # replace newline with a space
    # The former codecs.decode(..., 'unicode_escape') step was dropped: no
    # backslash survives the regex above, so it never unescaped anything, and
    # on a str it round-trips through UTF-8/Latin-1, turning a surviving
    # U+00A0 (which `\s` matches) into a stray "Â" followed by U+00A0.
    cleaned_text = cleaned_text.replace('\xa0', ' ')  # replace non-breaking space with a space
    return cleaned_text

# Drop non-letter characters from every row of the text column, then display the frame.
df.text = df.text.apply(removeSpecialChar)
df

Unnamed: 0,text,target,Word Count
0,new delhi the andhra pradesh public service co...,academic interests,239
1,pune two weeks after the new academic year has...,academic interests,500
2,guwahati the results of the cbse class x exams...,academic interests,470
3,admission into the iims said kapoor across ii...,academic interests,108
4,mangaluru the mangalore institute of technolog...,academic interests,336
...,...,...,...
519995,nagpur akshay zadgaonkar is a child prodigy hi...,video gaming,500
519996,bayonetta lead hideki kamiya reckons the late...,video gaming,290
519997,al pacino thinks the original godfather is be...,video gaming,259
519998,the latest episode of imlie begins with aryan ...,video gaming,251


In [12]:
def has_html_elements(text):
    """Report whether ``text`` parses to at least one HTML tag.

    Args:
        text: Markup or plain text handed to BeautifulSoup's ``html.parser``.

    Returns:
        bool: True if the parsed document contains any tag, or if parsing
        raised an ordinary exception; False otherwise.

    NOTE(review): in this notebook the text column has already had
    punctuation (including "<" and ">") stripped before this runs, so tags
    can presumably no longer be detected at this point - confirm whether
    this check should run earlier in the pipeline.
    """
    try:
        soup = BeautifulSoup(text, 'html.parser')
        # Only existence matters, so stop at the first tag instead of
        # collecting every tag in the document.
        return soup.find(True) is not None
    except Exception:
        # Best-effort behaviour kept: unparseable input is reported as HTML.
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
        # no longer swallowed while this runs over every row.
        return True

# Flag rows whose text parses to HTML tags and keep only the remaining rows.
# A standalone boolean mask replaces the temporary 'has_html' column; the
# explicit copy keeps `df` an independent frame for later column assignments.
html_mask = df['text'].apply(has_html_elements)
df = df[~html_mask].copy()
df

Unnamed: 0,text,target,Word Count
0,new delhi the andhra pradesh public service co...,academic interests,239
1,pune two weeks after the new academic year has...,academic interests,500
2,guwahati the results of the cbse class x exams...,academic interests,470
3,admission into the iims said kapoor across ii...,academic interests,108
4,mangaluru the mangalore institute of technolog...,academic interests,336
...,...,...,...
519995,nagpur akshay zadgaonkar is a child prodigy hi...,video gaming,500
519996,bayonetta lead hideki kamiya reckons the late...,video gaming,290
519997,al pacino thinks the original godfather is be...,video gaming,259
519998,the latest episode of imlie begins with aryan ...,video gaming,251


### Tokenization

In [13]:
def tokens(text):
    """Split ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
    return word_tokenize(text)

# Tokenize every row of the text column into a new 'tokens' column, then display the frame.
df['tokens'] = df['text'].apply(tokens)
df

Unnamed: 0,text,target,Word Count,tokens
0,new delhi the andhra pradesh public service co...,academic interests,239,"[new, delhi, the, andhra, pradesh, public, ser..."
1,pune two weeks after the new academic year has...,academic interests,500,"[pune, two, weeks, after, the, new, academic, ..."
2,guwahati the results of the cbse class x exams...,academic interests,470,"[guwahati, the, results, of, the, cbse, class,..."
3,admission into the iims said kapoor across ii...,academic interests,108,"[admission, into, the, iims, said, kapoor, acr..."
4,mangaluru the mangalore institute of technolog...,academic interests,336,"[mangaluru, the, mangalore, institute, of, tec..."
...,...,...,...,...
519995,nagpur akshay zadgaonkar is a child prodigy hi...,video gaming,500,"[nagpur, akshay, zadgaonkar, is, a, child, pro..."
519996,bayonetta lead hideki kamiya reckons the late...,video gaming,290,"[bayonetta, lead, hideki, kamiya, reckons, the..."
519997,al pacino thinks the original godfather is be...,video gaming,259,"[al, pacino, thinks, the, original, godfather,..."
519998,the latest episode of imlie begins with aryan ...,video gaming,251,"[the, latest, episode, of, imlie, begins, with..."


### Removing stopwords

In [14]:
# removing stopwords

# English stopword list from NLTK, held as a set for fast membership tests.
stops = set(stopwords.words('english'))

def remove_stopwords(tokens):
    """Return the tokens whose lowercased form is not an English stopword.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        list: The surviving tokens, in their original order and casing.
    """
    kept = []
    for token in tokens:
        if token.lower() in stops:
            continue
        kept.append(token)
    return kept

# Filter stopwords out of every token list, then display the frame.
df['tokens'] = df['tokens'].apply(remove_stopwords)
df

Unnamed: 0,text,target,Word Count,tokens
0,new delhi the andhra pradesh public service co...,academic interests,239,"[new, delhi, andhra, pradesh, public, service,..."
1,pune two weeks after the new academic year has...,academic interests,500,"[pune, two, weeks, new, academic, year, begun,..."
2,guwahati the results of the cbse class x exams...,academic interests,470,"[guwahati, results, cbse, class, x, exams, ann..."
3,admission into the iims said kapoor across ii...,academic interests,108,"[admission, iims, said, kapoor, across, iims, ..."
4,mangaluru the mangalore institute of technolog...,academic interests,336,"[mangaluru, mangalore, institute, technology, ..."
...,...,...,...,...
519995,nagpur akshay zadgaonkar is a child prodigy hi...,video gaming,500,"[nagpur, akshay, zadgaonkar, child, prodigy, c..."
519996,bayonetta lead hideki kamiya reckons the late...,video gaming,290,"[bayonetta, lead, hideki, kamiya, reckons, lat..."
519997,al pacino thinks the original godfather is be...,video gaming,259,"[al, pacino, thinks, original, godfather, bett..."
519998,the latest episode of imlie begins with aryan ...,video gaming,251,"[latest, episode, imlie, begins, aryan, receiv..."


In [16]:
# Drop the raw text and word-count columns, then display the frame.
df = df.drop(columns = ['text', 'Word Count'])
df

Unnamed: 0,target,tokens
0,academic interests,"[new, delhi, andhra, pradesh, public, service,..."
1,academic interests,"[pune, two, weeks, new, academic, year, begun,..."
2,academic interests,"[guwahati, results, cbse, class, x, exams, ann..."
3,academic interests,"[admission, iims, said, kapoor, across, iims, ..."
4,academic interests,"[mangaluru, mangalore, institute, technology, ..."
...,...,...
519995,video gaming,"[nagpur, akshay, zadgaonkar, child, prodigy, c..."
519996,video gaming,"[bayonetta, lead, hideki, kamiya, reckons, lat..."
519997,video gaming,"[al, pacino, thinks, original, godfather, bett..."
519998,video gaming,"[latest, episode, imlie, begins, aryan, receiv..."


### Lemmatization

In [18]:
# WORDNET LEMMATIZER (with appropriate pos tags)

# nltk.download('averaged_perceptron_tagger')

# Single WordNet lemmatizer instance, used by lemmatize_text.
lemmatizer = WordNetLemmatizer()

def pos_tagger(nltk_tag):
    """Map a POS tag from ``nltk.pos_tag`` to a WordNet POS constant.

    Args:
        nltk_tag (str): Tag string; only its leading letter is inspected.

    Returns:
        ``wordnet.ADJ`` for tags starting with "J", ``wordnet.VERB`` for "V",
        ``wordnet.NOUN`` for "N", ``wordnet.ADV`` for "R", otherwise ``None``.
    """
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    if nltk_tag.startswith('V'):
        return wordnet.VERB
    if nltk_tag.startswith('N'):
        return wordnet.NOUN
    if nltk_tag.startswith('R'):
        return wordnet.ADV
    # Any other tag has no WordNet counterpart here.
    return None

def lemmatize_text(tokens):
    """Lemmatize a token sequence using POS-aware WordNet lookups.

    Args:
        tokens: Sequence of string tokens from one document.

    Returns:
        list: One entry per input token, in order. Tokens whose POS tag maps
        to a WordNet category are lemmatized with that category; all other
        tokens are passed through unchanged.
    """
    lemmas = []
    for word, treebank_tag in nltk.pos_tag(tokens):
        wn_pos = pos_tagger(treebank_tag)
        if wn_pos is None:
            # No usable WordNet category: keep the surface form.
            lemmas.append(word)
            continue
        lemmas.append(lemmatizer.lemmatize(word, wn_pos))
    return lemmas

# Lemmatize every token list, then display the frame.
df['tokens'] = df['tokens'].apply(lemmatize_text)
df

Unnamed: 0,target,tokens
0,academic interests,"[new, delhi, andhra, pradesh, public, service,..."
1,academic interests,"[pune, two, week, new, academic, year, begin, ..."
2,academic interests,"[guwahati, result, cbse, class, x, exam, annou..."
3,academic interests,"[admission, iims, say, kapoor, across, iims, s..."
4,academic interests,"[mangaluru, mangalore, institute, technology, ..."
...,...,...
519995,video gaming,"[nagpur, akshay, zadgaonkar, child, prodigy, c..."
519996,video gaming,"[bayonetta, lead, hideki, kamiya, reckons, lat..."
519997,video gaming,"[al, pacino, think, original, godfather, well,..."
519998,video gaming,"[late, episode, imlie, begin, aryan, receive, ..."


In [25]:
# Lists are unhashable, so convert each token list to a tuple before
# de-duplicating. Duplicates are judged on the tokens alone (the target is
# not part of the key); the first occurrence is kept and the index is rebuilt.
df['tokens'] = df['tokens'].apply(tuple)
df = df.drop_duplicates(subset='tokens', ignore_index=True)
df

Unnamed: 0,target,tokens
0,academic interests,"(new, delhi, andhra, pradesh, public, service,..."
1,academic interests,"(pune, two, week, new, academic, year, begin, ..."
2,academic interests,"(guwahati, result, cbse, class, x, exam, annou..."
3,academic interests,"(admission, iims, say, kapoor, across, iims, s..."
4,academic interests,"(mangaluru, mangalore, institute, technology, ..."
...,...,...
518330,video gaming,"(nagpur, akshay, zadgaonkar, child, prodigy, c..."
518331,video gaming,"(bayonetta, lead, hideki, kamiya, reckons, lat..."
518332,video gaming,"(al, pacino, think, original, godfather, well,..."
518333,video gaming,"(late, episode, imlie, begin, aryan, receive, ..."


In [27]:
 # Rows remaining per target label after de-duplication.
 df.target.value_counts()

pharmaceuticals, conditions, and symptoms    20000
academic interests                           19993
movies                                       19985
healthy living                               19984
television                                   19980
family and relationships                     19975
style and fashion                            19973
news and politics                            19973
automotives                                  19965
sports                                       19964
music and audio                              19961
video gaming                                 19957
arts and culture                             19947
pets                                         19944
food and drinks                              19942
hobbies and interests                        19939
travel                                       19930
shopping                                     19929
home and garden                              19928
health                         

In [28]:
# Write the preprocessed frame to disk without the row index.
# NOTE(review): the 'tokens' column holds tuples at this point, so the CSV
# will presumably store their string representation - confirm how the
# downstream reader parses it.
df.to_csv('preprocessed_df.csv', index = False)