In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import string 
import emoji
import re

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords

In [3]:
# Load the sample corpus and preview the first rows.
DATA_PATH = "sample_df.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,text,target
0,"Last summer my granddaughter, Brooke, played i...",academic interests
1,NEW DELHI: Uttar Pradesh Police Recruitment Bo...,academic interests
2,BELGAUM: Karnataka Higher Education minister ...,academic interests
3,Lincoln University has welcomed the governmen...,academic interests
4,NEW DELHI: The Uttar Pradesh Public Service Co...,academic interests


In [4]:
# Number of distinct labels in the target column.
n_categories = len(df["target"].unique())
print(n_categories)

26


In [5]:
# List the distinct target labels themselves.
df["target"].unique()

array(['academic interests', 'arts and culture', 'automotive',
       'books and literature', 'business and finance', 'careers',
       'family and relationships', 'food and drink', 'health',
       'healthy living', 'hobbies and interests', 'home and garden',
       'movies', 'music and audio', 'news and politics',
       'personal finance', 'pets',
       'pharmaceuticals, conditions, and symptoms', 'real estate',
       'shopping', 'sports', 'style and fashion',
       'technology and computing', 'television', 'travel', 'video gaming'],
      dtype=object)

### Lowercase the dataset

In [6]:
# Normalise the text column before any other cleaning step.
# Order matters: `.str.lower()` on an object column yields NaN for every
# non-string cell, and a following `.astype(str)` would then turn those (and
# genuinely missing values) into the literal word "nan". Fill missing values
# with an empty string and cast to str first, then lowercase.
df['text'] = df['text'].fillna('').astype(str).str.lower()
df.dtypes

text      object
target    object
dtype: object

### Removing Punctuation

In [7]:
# Removing punctuations using string module

# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def remove_punct(text):
    """Delete ASCII punctuation from ``text``.

    Parameters
    ----------
    text : object
        Usually a ``str``. Non-string, non-missing values are converted with
        ``str()`` first so that they are cleaned the same way as strings.

    Returns
    -------
    str or the original missing value
        The text without any ``string.punctuation`` characters. Missing values
        (``None``/NaN, as judged by ``pd.notna``) are returned unchanged.

    Notes
    -----
    Characters are removed, not replaced by a space, so hyphenated or dotted
    tokens are fused together (``"38-year-old"`` becomes ``"38yearold"``).
    """
    if isinstance(text, str):
        return text.translate(_PUNCT_TABLE)
    if pd.notna(text):
        # Previously this branch returned str(text) with its punctuation intact.
        return str(text).translate(_PUNCT_TABLE)
    return text

# Strip punctuation from every document in the text column.
df['text'] = df['text'].apply(remove_punct)
df
# Alternative (not used): punctuation could also be removed with a regex
# instead of the string module, e.g.
#     re.sub(r'([^\w\s]|_)', '', text)
# which drops every character that is neither a word character nor
# whitespace, and additionally drops underscores.

Unnamed: 0,text,target
0,last summer my granddaughter brooke played in ...,academic interests
1,new delhi uttar pradesh police recruitment boa...,academic interests
2,belgaum karnataka higher education minister c...,academic interests
3,lincoln university has welcomed the governmen...,academic interests
4,new delhi the uttar pradesh public service com...,academic interests
...,...,...
517707,\ndeath stranding 2 has been announced at the ...,video gaming
517708,raj kundra dons multiple hats but one thing th...,video gaming
517709,\nmario rabbids sparks of hope one of the bes...,video gaming
517710,ahmedabad a 38yearold man from adalaj who marr...,video gaming


### Removing URLs and Emojis

In [8]:
def removeURLandEmoji(text):
    """Remove URL-like tokens and emojis from ``text``.

    Parameters
    ----------
    text : str
        Input document.

    Returns
    -------
    str
        ``text`` without tokens starting at ``http`` or ``www`` and without
        emojis or ``:shortcode:`` style emoji names.

    Notes
    -----
    The URL pattern is deliberately loose: it removes any run of
    non-whitespace characters that begins with ``http`` or ``www``.
    """
    # `http\S+` already covers `https...`, so the separate alternative was
    # redundant; re.MULTILINE had no effect because the pattern has no ^ or $.
    text = re.sub(r"http\S+|www\S+", "", text)

    # Convert each emoji to its ":name:" shortcode so a regex can delete it.
    text = emoji.demojize(text)

    # Shortcodes such as ":thumbs_up:" contain no whitespace. Forbidding
    # whitespace inside the match stops the pattern from deleting ordinary
    # text that merely sits between two colons ("note: keep this: please").
    text = re.sub(r":[^:\s]+:", "", text)

    return text
    
# Drop URLs and emojis from every document.
df['text'] = df['text'].apply(removeURLandEmoji)
df

Unnamed: 0,text,target
0,last summer my granddaughter brooke played in ...,academic interests
1,new delhi uttar pradesh police recruitment boa...,academic interests
2,belgaum karnataka higher education minister c...,academic interests
3,lincoln university has welcomed the governmen...,academic interests
4,new delhi the uttar pradesh public service com...,academic interests
...,...,...
517707,\ndeath stranding 2 has been announced at the ...,video gaming
517708,raj kundra dons multiple hats but one thing th...,video gaming
517709,\nmario rabbids sparks of hope one of the bes...,video gaming
517710,ahmedabad a 38yearold man from adalaj who marr...,video gaming


In [9]:
def remove_Emails_and_Numbers(text):
    """Delete e-mail addresses and standalone numbers from ``text``.

    An e-mail is any whitespace-free run of the form ``x@y.z``. A number is a
    run of digits with a word boundary on both sides, so digits glued to
    letters (``"38yearold"``) are left alone. Matches are replaced by the empty
    string; surrounding whitespace is not collapsed.
    """
    without_emails = re.sub(r'\S+@\S+\.\S+', '', text)
    return re.sub(r'\b\d+\b', '', without_emails)

# Remove e-mail addresses and standalone numbers from every document.
df['text'] = df['text'].apply(remove_Emails_and_Numbers)
df

Unnamed: 0,text,target
0,last summer my granddaughter brooke played in ...,academic interests
1,new delhi uttar pradesh police recruitment boa...,academic interests
2,belgaum karnataka higher education minister c...,academic interests
3,lincoln university has welcomed the governmen...,academic interests
4,new delhi the uttar pradesh public service com...,academic interests
...,...,...
517707,\ndeath stranding has been announced at the g...,video gaming
517708,raj kundra dons multiple hats but one thing th...,video gaming
517709,\nmario rabbids sparks of hope one of the bes...,video gaming
517710,ahmedabad a 38yearold man from adalaj who marr...,video gaming


In [10]:
def removeSpecialChar(text):
    """Keep only ASCII letters and whitespace, then flatten line breaks.

    Every character that is not ``a-z``, ``A-Z`` or whitespace is deleted.
    Afterwards newlines and non-breaking spaces (``\\xa0``) are each replaced
    by a single regular space.
    """
    letters_and_space = re.sub(r'[^a-zA-Z\s]', '', text)
    # One translate pass performs both single-character replacements.
    to_plain_space = {ord('\n'): ' ', ord('\xa0'): ' '}
    return letters_and_space.translate(to_plain_space)

# Keep only letters and whitespace in every document.
df['text'] = df['text'].apply(removeSpecialChar)
df

Unnamed: 0,text,target
0,last summer my granddaughter brooke played in ...,academic interests
1,new delhi uttar pradesh police recruitment boa...,academic interests
2,belgaum karnataka higher education minister c...,academic interests
3,lincoln university has welcomed the governmen...,academic interests
4,new delhi the uttar pradesh public service com...,academic interests
...,...,...
517707,death stranding has been announced at the ga...,video gaming
517708,raj kundra dons multiple hats but one thing th...,video gaming
517709,mario rabbids sparks of hope one of the best...,video gaming
517710,ahmedabad a yearold man from adalaj who marrie...,video gaming


In [11]:
def has_html_elements(text):
    """Report whether ``text`` contains at least one HTML element.

    Parameters
    ----------
    text : str
        Document to inspect; it is parsed with BeautifulSoup's ``html.parser``.

    Returns
    -------
    bool
        ``True`` if the parsed document has at least one tag, or if parsing
        raised an error (such rows are treated as HTML so the caller drops
        them). ``False`` otherwise.

    Notes
    -----
    NOTE(review): earlier cells in this notebook already delete punctuation
    and every non-letter character, so ``<`` and ``>`` are gone by the time
    this runs and no tag can be detected. Consider running this check on the
    raw text instead.
    """
    try:
        soup = BeautifulSoup(text, 'html.parser')
        return len(soup.find_all()) > 0
    except Exception:
        # Catch ordinary errors only: a bare `except:` also swallowed
        # KeyboardInterrupt/SystemExit, making a long run impossible to stop.
        return True

# Flag rows whose text parses to at least one HTML element and keep the rest.
# A boolean mask is used directly, so no helper column has to be added and
# dropped again; .copy() gives a standalone frame for the later column writes.
html_mask = df['text'].apply(has_html_elements)
df = df.loc[~html_mask].copy()
df

Unnamed: 0,text,target
0,last summer my granddaughter brooke played in ...,academic interests
1,new delhi uttar pradesh police recruitment boa...,academic interests
2,belgaum karnataka higher education minister c...,academic interests
3,lincoln university has welcomed the governmen...,academic interests
4,new delhi the uttar pradesh public service com...,academic interests
...,...,...
517707,death stranding has been announced at the ga...,video gaming
517708,raj kundra dons multiple hats but one thing th...,video gaming
517709,mario rabbids sparks of hope one of the best...,video gaming
517710,ahmedabad a yearold man from adalaj who marrie...,video gaming


### Tokenization

In [12]:
def tokens(text):
    """Split ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
    token_list = word_tokenize(text)
    return token_list

# Tokenize every document and store the token lists in a new column.
token_lists = df['text'].apply(tokens)
df['tokens'] = token_lists
df.head(10)

Unnamed: 0,text,target,tokens
0,last summer my granddaughter brooke played in ...,academic interests,"[last, summer, my, granddaughter, brooke, play..."
1,new delhi uttar pradesh police recruitment boa...,academic interests,"[new, delhi, uttar, pradesh, police, recruitme..."
2,belgaum karnataka higher education minister c...,academic interests,"[belgaum, karnataka, higher, education, minist..."
3,lincoln university has welcomed the governmen...,academic interests,"[lincoln, university, has, welcomed, the, gove..."
4,new delhi the uttar pradesh public service com...,academic interests,"[new, delhi, the, uttar, pradesh, public, serv..."
5,now im saluting frances myers the teacher that...,academic interests,"[now, im, saluting, frances, myers, the, teach..."
6,odisha public service commission opsc today ac...,academic interests,"[odisha, public, service, commission, opsc, to..."
7,politics as academicians would like to call it...,academic interests,"[politics, as, academicians, would, like, to, ..."
8,cbse board will close the correction window fo...,academic interests,"[cbse, board, will, close, the, correction, wi..."
9,nagpur in what is probably a first in maharash...,academic interests,"[nagpur, in, what, is, probably, a, first, in,..."


### Removing stopwords

In [13]:
# Stopword removal: build the English stopword lookup once.
# A set is used so each membership test in remove_stopwords is a hash lookup.
english_stopwords = stopwords.words('english')
stops = set(english_stopwords)

def remove_stopwords(tokens):
    """Return the tokens that are not in ``stops``, in their original order.

    Each token is lowercased for the comparison only; kept tokens are returned
    exactly as they were passed in.
    """
    kept = []
    for word in tokens:
        if word.lower() in stops:
            continue
        kept.append(word)
    return kept

# Filter stopwords out of every token list.
filtered_tokens = df['tokens'].apply(remove_stopwords)
df['tokens'] = filtered_tokens
df

Unnamed: 0,text,target,tokens
0,last summer my granddaughter brooke played in ...,academic interests,"[last, summer, granddaughter, brooke, played, ..."
1,new delhi uttar pradesh police recruitment boa...,academic interests,"[new, delhi, uttar, pradesh, police, recruitme..."
2,belgaum karnataka higher education minister c...,academic interests,"[belgaum, karnataka, higher, education, minist..."
3,lincoln university has welcomed the governmen...,academic interests,"[lincoln, university, welcomed, governments, d..."
4,new delhi the uttar pradesh public service com...,academic interests,"[new, delhi, uttar, pradesh, public, service, ..."
...,...,...,...
517707,death stranding has been announced at the ga...,video gaming,"[death, stranding, announced, game, awards, th..."
517708,raj kundra dons multiple hats but one thing th...,video gaming,"[raj, kundra, dons, multiple, hats, one, thing..."
517709,mario rabbids sparks of hope one of the best...,video gaming,"[mario, rabbids, sparks, hope, one, best, nint..."
517710,ahmedabad a yearold man from adalaj who marrie...,video gaming,"[ahmedabad, yearold, man, adalaj, married, wid..."


### Lemmatization

In [14]:
# WORDNET LEMMATIZER (with appropriate pos tags)
#
# nltk.pos_tag needs the perceptron tagger model and WordNetLemmatizer needs
# the WordNet corpus. The original cell fetched only the tagger, so
# lemmatization failed on any machine without a pre-installed WordNet corpus.
# NOTE(review): word_tokenize and stopwords.words in earlier cells also need
# NLTK data ('punkt', 'stopwords'); consider downloading those near the
# imports so the notebook survives Restart & Run All on a fresh environment.
for resource in ('averaged_perceptron_tagger', 'wordnet'):
    nltk.download(resource)

lemmatizer = WordNetLemmatizer()

def pos_tagger(nltk_tag):
    """Translate a tag from ``nltk.pos_tag`` into a WordNet POS constant.

    Only the first letter of the tag is inspected:
    ``J`` -> ``wordnet.ADJ``, ``V`` -> ``wordnet.VERB``,
    ``N`` -> ``wordnet.NOUN``, ``R`` -> ``wordnet.ADV``.
    Any other tag (including an empty one) yields ``None``.
    """
    # Built inside the function so the lookup uses the module-level `wordnet`
    # at call time, exactly like the if/elif chain it replaces.
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return first_letter_to_pos.get(nltk_tag[:1])

def lemmatize_text(tokens):
    """Lemmatize a list of tokens using their part-of-speech tags.

    The tokens are tagged with ``nltk.pos_tag``, each tag is mapped through
    ``pos_tagger``, and tokens with a usable WordNet POS are passed to
    ``lemmatizer.lemmatize``. Tokens whose tag maps to ``None`` are kept
    unchanged. The result has one entry per input token, in the same order.
    """
    result = []
    for word, raw_tag in nltk.pos_tag(tokens):
        wn_pos = pos_tagger(raw_tag)
        result.append(word if wn_pos is None else lemmatizer.lemmatize(word, wn_pos))
    return result

# Replace every token list with its lemmatized version.
lemmatized_tokens = df['tokens'].apply(lemmatize_text)
df['tokens'] = lemmatized_tokens
df

Unnamed: 0,text,target,tokens
0,last summer my granddaughter brooke played in ...,academic interests,"[last, summer, granddaughter, brooke, play, us..."
1,new delhi uttar pradesh police recruitment boa...,academic interests,"[new, delhi, uttar, pradesh, police, recruitme..."
2,belgaum karnataka higher education minister c...,academic interests,"[belgaum, karnataka, high, education, minister..."
3,lincoln university has welcomed the governmen...,academic interests,"[lincoln, university, welcome, government, dec..."
4,new delhi the uttar pradesh public service com...,academic interests,"[new, delhi, uttar, pradesh, public, service, ..."
...,...,...,...
517707,death stranding has been announced at the ga...,video gaming,"[death, stranding, announce, game, award, thou..."
517708,raj kundra dons multiple hats but one thing th...,video gaming,"[raj, kundra, don, multiple, hat, one, thing, ..."
517709,mario rabbids sparks of hope one of the best...,video gaming,"[mario, rabbids, spark, hope, one, best, ninte..."
517710,ahmedabad a yearold man from adalaj who marrie...,video gaming,"[ahmedabad, yearold, man, adalaj, married, wid..."


In [15]:
# Persist the cleaned frame without the index column.
# NOTE(review): the `tokens` column holds Python lists, which CSV stores as
# their text representation; a reader will get strings back and must parse
# them (e.g. with ast.literal_eval) - confirm downstream code expects that.
OUTPUT_PATH = 'preprocessed_sample_df.csv'
df.to_csv(OUTPUT_PATH, index=False)