# Natural Language Processing

# Loading Dataset

In [1]:
import numpy as np
import pandas as pd
import nltk

In [2]:
df = pd.read_csv('tweets.csv')
df.head()

Unnamed: 0,id,keyword,location,text,target
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Ston...",1
1,1,ablaze,,Telangana: Section 144 has been imposed in Bha...,1
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https:...,1
3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https:...,1
4,4,ablaze,,"""Lord Jesus, your love brings freedom and pard...",0


In [3]:
df = df.drop(['id', 'keyword', 'location'], axis=1)

In [4]:
# Getting Value Counts

df['target'].value_counts()

target
0    9256
1    2114
Name: count, dtype: int64

In [5]:
pd.set_option('display.max_colwidth', 1)
df.head()

Unnamed: 0,text,target
0,"Communal violence in Bhainsa, Telangana. ""Stones were pelted on Muslims' houses and some houses and vehicles were set ablaze…",1
1,"Telangana: Section 144 has been imposed in Bhainsa from January 13 to 15, after clash erupted between two groups on January 12. Po…",1
2,Arsonist sets cars ablaze at dealership https://t.co/gOQvyJbpVI,1
3,Arsonist sets cars ablaze at dealership https://t.co/0gL7NUCPlb https://t.co/u1CcBhOWh9,1
4,"""Lord Jesus, your love brings freedom and pardon. Fill me with your Holy Spirit and set my heart ablaze with your l… https://t.co/VlTznnPNi8",0


## Lower Casing

In [6]:
method1 = df['text'].str.lower()
method2 = df['text'].apply(str.lower)
method3 = df['text'].apply(lambda x : x.lower())
method4 = df['text'].map(str.lower)

## Removing Punctuations

In [7]:
# Method 1
punc1 = df['text'].str.replace(r'[^\w\s]', '', regex=True)

# Method 2
import string
translator = str.maketrans('', '', string.punctuation)
punc2 = df['text'].apply(lambda x: x.translate(translator))

# Method 3
def remove_punctuation(text):
    return ''.join([char for char in text if char not in string.punctuation])
punc3 = df['text'].apply(remove_punctuation)

# Method 4
import re
def remove_punctuation_with_re(text):
    return re.sub(r'^[\w\s]','', text)
punc4 = df['text'].apply(remove_punctuation_with_re)

# Method 5
def remove_numbers_with_filter(text):
    return ''.join(filter(lambda x: x not in string.punctuation, text))
punc5 = df['text'].apply(remove_numbers_with_filter)

## Removing Numbers

In [8]:
# Method 1
rem_num1 = df['text'].str.replace(r'\d+', '', regex=True)

# Method 2
def remove_numbers_with_isdigits(text):
    return ''.join([char for char in text if not char.isdigit()])
rem_num2 = df['text'].apply(remove_numbers_with_isdigits)

# Method 3
import string
translation_table = str.maketrans('', '', string.digits)
rem_num3 = df['text'].apply(lambda x: x.translate(translation_table))

# Method 4
import re
def remove_numbers_with_re(text):
    return re.sub(r'\d+', '', text)
rem_num4 = df['text'].apply(remove_numbers_with_re)

# Method 5
def remove_numbers_with_filter(text):
    return ''.join(filter(lambda x: not x.isdigit(), text))
rem_num5 = df['text'].apply(remove_numbers_with_filter)

## Removal of Extra Spaces

In [9]:
df['text'].head()

0    Communal violence in Bhainsa, Telangana. "Stones were pelted on Muslims' houses and some houses and vehicles were set ablaze…               
1    Telangana: Section 144 has been imposed in Bhainsa from January 13 to 15, after clash erupted between two groups on January 12. Po…         
2    Arsonist sets cars ablaze at dealership https://t.co/gOQvyJbpVI                                                                             
3    Arsonist sets cars ablaze at dealership https://t.co/0gL7NUCPlb https://t.co/u1CcBhOWh9                                                     
4    "Lord Jesus, your love brings freedom and pardon. Fill me with your Holy Spirit and set my heart ablaze with your l… https://t.co/VlTznnPNi8
Name: text, dtype: object

In [10]:
# Method 1
rem_space1 = df['text'].str.strip().str.replace(r'\s+', '', regex=True)

# Method 2
def remove_extra_spaces(text):
    return ' '.join(text.split())
rem_space2 = df['text'].apply(remove_extra_spaces)

# Method 3
rem_space3 = df['text'].str.replace(r'\s+', '', regex=True)

# Method 4
import re
def remove_extra_spaces_with_re(text):
    return re.sub(r'\s+', ' ', text.strip())

rem_space4 = df['text'].apply(remove_extra_spaces_with_re)

# Method 5
def remove_extra_spaces_filter(text):
    return ' '.join(filter(lambda x: x.strip(), text.split()))
rem_space5 = df['text'].apply(remove_extra_spaces_filter)

## Replacing Repeated Punctuation

In [11]:
df['text'].head()

0    Communal violence in Bhainsa, Telangana. "Stones were pelted on Muslims' houses and some houses and vehicles were set ablaze…               
1    Telangana: Section 144 has been imposed in Bhainsa from January 13 to 15, after clash erupted between two groups on January 12. Po…         
2    Arsonist sets cars ablaze at dealership https://t.co/gOQvyJbpVI                                                                             
3    Arsonist sets cars ablaze at dealership https://t.co/0gL7NUCPlb https://t.co/u1CcBhOWh9                                                     
4    "Lord Jesus, your love brings freedom and pardon. Fill me with your Holy Spirit and set my heart ablaze with your l… https://t.co/VlTznnPNi8
Name: text, dtype: object

In [12]:
# Method 1
punc_rep1 = df['text'].str.replace(r'([!?./\@])\1+', r'\1', regex=True)

# Method 2
import re
def replace_repeated_puncs(text):
    return re.sub(r'([!?/\.])\1+', r'\1', text)
punc_rep2 = df['text'].apply(replace_repeated_puncs)

# Method 3
def remove_repeated_punc_lists(text):
    result = []
    for char in text:
        if result and char ==  result[-1] and char in "!?/\.":
            continue
        result.append(char)
    return ''.join(result)
punc_rep3 = df['text'].apply(remove_repeated_punc_lists)

# Method 4
def replace_repeated_puncs_translate(text):
    punctuations = r'!/\?.@'
    for p in punctuations:
        text = text.replace(p * 2, p)
    return text
punc_rep4 = df['text'].apply(replace_repeated_puncs_translate)

## Removing Emojis

In [13]:
# Dataset containing emojis
df_emoji = pd.DataFrame({
    'text': [
        "Hello there! 😀 How are you? 🤔",
        "I love programming! 💻✨",
        "Let's remove emojis! 🎉🎈",
        "No emojis here."
    ]
})
df

Unnamed: 0,text,target
0,"Communal violence in Bhainsa, Telangana. ""Stones were pelted on Muslims' houses and some houses and vehicles were set ablaze…",1
1,"Telangana: Section 144 has been imposed in Bhainsa from January 13 to 15, after clash erupted between two groups on January 12. Po…",1
2,Arsonist sets cars ablaze at dealership https://t.co/gOQvyJbpVI,1
3,Arsonist sets cars ablaze at dealership https://t.co/0gL7NUCPlb https://t.co/u1CcBhOWh9,1
4,"""Lord Jesus, your love brings freedom and pardon. Fill me with your Holy Spirit and set my heart ablaze with your l… https://t.co/VlTznnPNi8",0
...,...,...
11365,Media should have warned us well in advance. This wrecked my whole night. I refuse to watch…,0
11366,i feel directly attacked 💀 i consider moonbin &amp; jinjin as my bias and im currently wrecked by rocky i hate this,0
11367,i feel directly attacked 💀 i consider moonbin &amp; jinjin as my bias and im currently wrecked by rocky i hate this https://t.co/psLBecS7hI,0
11368,"ok who remember ""outcast"" nd the ""dora"" au?? THOSE AU WRECKED OUR NERVES ND BRAINCELLS JDKSHSSJHS LEGENDS",0


In [18]:
# pip install clean-text
%pip install clean-text demoji


Collecting demoji
  Downloading demoji-1.1.0-py3-none-any.whl.metadata (9.2 kB)
Downloading demoji-1.1.0-py3-none-any.whl (42 kB)
Installing collected packages: demoji
Successfully installed demoji-1.1.0


In [22]:
# Method 1
from cleantext import clean
emoji_rem1 = df_emoji['text'].apply(lambda x: clean(x, no_emoji=True))

# Method 2
def remove_emojis(text):
    emoji_pattern = re.compile("[\U0001F600-\U0001F64F"  # emoticons
                                 "\U0001F300-\U0001F5FF"  # symbols & pictographs
                                 "\U0001F680-\U0001F6FF"  # transport & map symbols
                                 "\U0001F700-\U0001F77F"  # alchemical symbols
                                 "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
                                 "\U00002702-\U000027B0"  # dingbats
                                 "\U000024C2-\U0001F251"  # enclosed characters
                                 "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
emoji_rem2 = df_emoji['text'].apply(remove_emojis)

# Method 3
import demoji
demoji.download_codes()
emoji_rem3 = df_emoji['text'].apply(lambda x: demoji.replace(x, ""))

# Method 4
emoji_rem4 = df_emoji['text'].apply(lambda x: x.encode('ascii', 'ignore').decode('ascii'))

  demoji.download_codes()


## Removing Emoticons

In [31]:
# Dataset for Emoticons
df_emoticons = pd.DataFrame({
    'text': [
        "Hello there! :) How are you? :D",
        "I love programming! <3",
        "Let's remove emoticons! :P :o",
        "No emoticons here."
    ]
})
df_emoticons.head()

Unnamed: 0,text
0,Hello there! :) How are you? :D
1,I love programming! <3
2,Let's remove emoticons! :P :o
3,No emoticons here.


In [41]:
# Method 1
def remove_emoticons(text):
    emoticon_pattern = re.compile(r'[:;=][)DdpP\(\[<3]')
    return emoticon_pattern.sub('', text)
emoticon_rem1 = df_emoticons['text'].apply(remove_emoticons)

# Method 2
emoticon_rem2 = df_emoticons['text'].str.replace(r'[:;=][)DdoOpP\(\[<]', '', regex=True)

# Method 3
emoticons = [':)', ':(', ':D', ':P', '<3', ':o']

def remove_custom_emoticons(text):
    for emoticon in emoticons:
        text = text.replace(emoticon, '')
    return text
emoticon_rem3 = df_emoticons['text'].apply(remove_custom_emoticons)

## Handling Contractions

In [42]:
%pip install contractions




In [44]:
# Dataset for contractions
df_con = pd.DataFrame({
    'text': [
        "I'll be there within 5 min.",
        "She'd like to know how I'd done that!",
        "It's awesome to meet new friends.",
        "We've been waiting for this day for so long."
    ]
})

df_con.head()

Unnamed: 0,text
0,I'll be there within 5 min.
1,She'd like to know how I'd done that!
2,It's awesome to meet new friends.
3,We've been waiting for this day for so long.


In [48]:
# Method 1
import contractions
def expand_contractions(text):
    return ' '.join([contractions.fix(word) for word in text.split()])
con1 = df_con['text'].apply(expand_contractions)

# Method 2
contraction_mapping = {
    "I'll": "I will",
    "She'd": "She would",
    "It's": "It is",
    "We've": "We have",
    "I'd": "I would",
    "don't": "do not",
    "can't": "cannot"
}
def expand_contractions_regex(text):
    pattern = re.compile(r'\b(' + '|'.join(contraction_mapping.keys()) + r')\b')
    return pattern.sub(lambda x: contraction_mapping[x.group(0)], text)
con2 = df_con['text'].apply(expand_contractions_regex)

# Method 3
contraction_mapping = {
    "I'll": "I will",
    "She'd": "She would",
    "It's": "It is",
    "We've": "We have",
    "I'd": "I would",
}

# Function to expand contractions using manual mapping
def manual_expand_contractions(text):
    for contraction, expansion in contraction_mapping.items():
        text = text.replace(contraction, expansion)
    return text
con3 = df_con['text'].apply(manual_expand_contractions)

## Finding and Removing Html Tags

In [49]:
# Dataset
df_html = pd.DataFrame({
    'text': [
        "<h1>Hello World!</h1>",
        "<p>This is a <strong>test</strong> string.</p>",
        "<div>Another <em>example</em> with <a href='#'>links</a>.</div>",
        "No HTML tags here."
    ]
})

df_html.head()

Unnamed: 0,text
0,<h1>Hello World!</h1>
1,<p>This is a <strong>test</strong> string.</p>
2,<div>Another <em>example</em> with <a href='#'>links</a>.</div>
3,No HTML tags here.


In [51]:
%pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [55]:
# Method 1
def remove_html_tags(text):
    return re.sub(r'<.*?>', '', text)
html_rem1 = df_html['text'].apply(remove_html_tags)

# Method 2
from bs4 import BeautifulSoup
def remove_html_tags_bs(text):
    return BeautifulSoup(text, "html.parser").get_text()
html_rem2 = df_html['text'].apply(remove_html_tags_bs)


## Finding and Removing URL's