# Importing Libraries

In [62]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import contractions

# Loading Dataset

In [63]:
df = pd.read_csv('tweets.csv')
df.head()

Unnamed: 0,id,keyword,location,text,target
0,0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Stones were pelted on Muslims' houses and some houses and vehicles were set ablaze…",1
1,1,ablaze,,"Telangana: Section 144 has been imposed in Bhainsa from January 13 to 15, after clash erupted between two groups on January 12. Po…",1
2,2,ablaze,New York City,Arsonist sets cars ablaze at dealership https://t.co/gOQvyJbpVI,1
3,3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https://t.co/0gL7NUCPlb https://t.co/u1CcBhOWh9,1
4,4,ablaze,,"""Lord Jesus, your love brings freedom and pardon. Fill me with your Holy Spirit and set my heart ablaze with your l… https://t.co/VlTznnPNi8",0


# Basic Information

In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11370 entries, 0 to 11369
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        11370 non-null  int64 
 1   keyword   11370 non-null  object
 2   location  7952 non-null   object
 3   text      11370 non-null  object
 4   target    11370 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 444.3+ KB


In [65]:
df.describe()

Unnamed: 0,id,target
count,11370.0,11370.0
mean,5684.5,0.185928
std,3282.380615,0.389066
min,0.0,0.0
25%,2842.25,0.0
50%,5684.5,0.0
75%,8526.75,0.0
max,11369.0,1.0


In [66]:
df.describe(exclude='number')

Unnamed: 0,keyword,location,text
count,11370,7952,11370
unique,219,4504,11223
top,thunderstorm,United States,I want to help you with my project to save the Caribbean Sea from floods and hurricanes https://t.co/qD8Om9NqQK
freq,93,96,3


In [67]:
# Getting numerical columns
df.select_dtypes('number').columns

Index(['id', 'target'], dtype='object')

In [68]:
# Getting Categorical Columns
df.select_dtypes(exclude='number').columns

Index(['keyword', 'location', 'text'], dtype='object')

In [69]:
# Knowing Shape of the Dataset
df.shape

(11370, 5)

# Dropping unwanted columns

In [70]:
df.drop('id', axis=1, inplace=True)

In [71]:
df.head()

Unnamed: 0,keyword,location,text,target
0,ablaze,,"Communal violence in Bhainsa, Telangana. ""Stones were pelted on Muslims' houses and some houses and vehicles were set ablaze…",1
1,ablaze,,"Telangana: Section 144 has been imposed in Bhainsa from January 13 to 15, after clash erupted between two groups on January 12. Po…",1
2,ablaze,New York City,Arsonist sets cars ablaze at dealership https://t.co/gOQvyJbpVI,1
3,ablaze,"Morgantown, WV",Arsonist sets cars ablaze at dealership https://t.co/0gL7NUCPlb https://t.co/u1CcBhOWh9,1
4,ablaze,,"""Lord Jesus, your love brings freedom and pardon. Fill me with your Holy Spirit and set my heart ablaze with your l… https://t.co/VlTznnPNi8",0


# Handling Text Data

In [72]:
df['text'].head()

0    Communal violence in Bhainsa, Telangana. "Stones were pelted on Muslims' houses and some houses and vehicles were set ablaze…               
1    Telangana: Section 144 has been imposed in Bhainsa from January 13 to 15, after clash erupted between two groups on January 12. Po…         
2    Arsonist sets cars ablaze at dealership https://t.co/gOQvyJbpVI                                                                             
3    Arsonist sets cars ablaze at dealership https://t.co/0gL7NUCPlb https://t.co/u1CcBhOWh9                                                     
4    "Lord Jesus, your love brings freedom and pardon. Fill me with your Holy Spirit and set my heart ablaze with your l… https://t.co/VlTznnPNi8
Name: text, dtype: object

In [73]:
pd.set_option('display.max_colwidth', 1)

In [74]:
df = df[['text', 'target']]
df.head()

Unnamed: 0,text,target
0,"Communal violence in Bhainsa, Telangana. ""Stones were pelted on Muslims' houses and some houses and vehicles were set ablaze…",1
1,"Telangana: Section 144 has been imposed in Bhainsa from January 13 to 15, after clash erupted between two groups on January 12. Po…",1
2,Arsonist sets cars ablaze at dealership https://t.co/gOQvyJbpVI,1
3,Arsonist sets cars ablaze at dealership https://t.co/0gL7NUCPlb https://t.co/u1CcBhOWh9,1
4,"""Lord Jesus, your love brings freedom and pardon. Fill me with your Holy Spirit and set my heart ablaze with your l… https://t.co/VlTznnPNi8",0


In [75]:
df = df.drop_duplicates(subset='text', keep='first')
df.shape

(11223, 2)

In [76]:
df.isna().sum()

text      0
target    0
dtype: int64

# Text Cleaning

#### Removing URL's

In [77]:

def remove_urls(text):
    # Regular expression pattern to match URLs
    url_pattern = r'http[s]?://\S+|www\.\S+'
    return re.sub(url_pattern, '', text)

df['text'] = df['text'].apply(remove_urls)

#### Lower Casing Text

In [78]:
# Way 1
df['text'] = df['text'].str.lower()

# Way 2
# df['text'] = df['text'].apply(str.lower)

# Way 3
# df['text'] = df['text'].map(str.lower)

# Way 4
# df['text'] = df['text'].apply(lambda x: x.lower())

df.head(5)

Unnamed: 0,text,target
0,"communal violence in bhainsa, telangana. ""stones were pelted on muslims' houses and some houses and vehicles were set ablaze…",1
1,"telangana: section 144 has been imposed in bhainsa from january 13 to 15, after clash erupted between two groups on january 12. po…",1
2,arsonist sets cars ablaze at dealership,1
3,arsonist sets cars ablaze at dealership,1
4,"""lord jesus, your love brings freedom and pardon. fill me with your holy spirit and set my heart ablaze with your l…",0


#### Removing Numbers

In [79]:
df['text'] = df['text'].str.replace(r'[0-9]', '', regex=True)
df.head(5)

Unnamed: 0,text,target
0,"communal violence in bhainsa, telangana. ""stones were pelted on muslims' houses and some houses and vehicles were set ablaze…",1
1,"telangana: section has been imposed in bhainsa from january to , after clash erupted between two groups on january . po…",1
2,arsonist sets cars ablaze at dealership,1
3,arsonist sets cars ablaze at dealership,1
4,"""lord jesus, your love brings freedom and pardon. fill me with your holy spirit and set my heart ablaze with your l…",0


#### Changing Contractions

In [80]:
import contractions

def expand_contractions(text):
    return contractions.fix(text)


df['text'] = df['text'].apply(expand_contractions)
df.head()

Unnamed: 0,text,target
0,"communal violence in bhainsa, telangana. ""stones were pelted on muslims' houses and some houses and vehicles were set ablaze…",1
1,"telangana: section has been imposed in bhainsa from january to , after clash erupted between two groups on january . po…",1
2,arsonist sets cars ablaze at dealership,1
3,arsonist sets cars ablaze at dealership,1
4,"""lord jesus, your love brings freedom and pardon. fill me with your holy spirit and set my heart ablaze with your l…",0


#### Punctuation & Special Characters Removal

In [81]:
# Way 1
df['text'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)
df.head(5)

Unnamed: 0,text,target
0,communal violence in bhainsa telangana stones were pelted on muslims houses and some houses and vehicles were set ablaze,1
1,telangana section has been imposed in bhainsa from january to after clash erupted between two groups on january po,1
2,arsonist sets cars ablaze at dealership,1
3,arsonist sets cars ablaze at dealership,1
4,lord jesus your love brings freedom and pardon fill me with your holy spirit and set my heart ablaze with your l,0


# Tokenization

#### Word-Level Tokenization

In [82]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [83]:
nltk.download('punkt_tab')
df['word_token'] = df['text'].apply(word_tokenize)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [84]:
df.head()

Unnamed: 0,text,target,word_token
0,communal violence in bhainsa telangana stones were pelted on muslims houses and some houses and vehicles were set ablaze,1,"[communal, violence, in, bhainsa, telangana, stones, were, pelted, on, muslims, houses, and, some, houses, and, vehicles, were, set, ablaze]"
1,telangana section has been imposed in bhainsa from january to after clash erupted between two groups on january po,1,"[telangana, section, has, been, imposed, in, bhainsa, from, january, to, after, clash, erupted, between, two, groups, on, january, po]"
2,arsonist sets cars ablaze at dealership,1,"[arsonist, sets, cars, ablaze, at, dealership]"
3,arsonist sets cars ablaze at dealership,1,"[arsonist, sets, cars, ablaze, at, dealership]"
4,lord jesus your love brings freedom and pardon fill me with your holy spirit and set my heart ablaze with your l,0,"[lord, jesus, your, love, brings, freedom, and, pardon, fill, me, with, your, holy, spirit, and, set, my, heart, ablaze, with, your, l]"


#### Sentence-Level Tokenization

In [85]:

# df['sent_token'] = df['text'].apply(sent_tokenize)
# df.head()

#### Stop words Removal

In [86]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [87]:
stop_words = set(stopwords.words('english'))

def remove_stop_words(sentence):
    return ' '.join([word for word in sentence.split() if word.lower() not in stop_words])

df['Cleaned_Text'] = df['text'].apply(remove_stop_words)
df.head()

Unnamed: 0,text,target,word_token,Cleaned_Text
0,communal violence in bhainsa telangana stones were pelted on muslims houses and some houses and vehicles were set ablaze,1,"[communal, violence, in, bhainsa, telangana, stones, were, pelted, on, muslims, houses, and, some, houses, and, vehicles, were, set, ablaze]",communal violence bhainsa telangana stones pelted muslims houses houses vehicles set ablaze
1,telangana section has been imposed in bhainsa from january to after clash erupted between two groups on january po,1,"[telangana, section, has, been, imposed, in, bhainsa, from, january, to, after, clash, erupted, between, two, groups, on, january, po]",telangana section imposed bhainsa january clash erupted two groups january po
2,arsonist sets cars ablaze at dealership,1,"[arsonist, sets, cars, ablaze, at, dealership]",arsonist sets cars ablaze dealership
3,arsonist sets cars ablaze at dealership,1,"[arsonist, sets, cars, ablaze, at, dealership]",arsonist sets cars ablaze dealership
4,lord jesus your love brings freedom and pardon fill me with your holy spirit and set my heart ablaze with your l,0,"[lord, jesus, your, love, brings, freedom, and, pardon, fill, me, with, your, holy, spirit, and, set, my, heart, ablaze, with, your, l]",lord jesus love brings freedom pardon fill holy spirit set heart ablaze l


#### Lemmatization

In [88]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [89]:
lemmatizer = WordNetLemmatizer()

def lemmatize_words(sentence):
    words = nltk.word_tokenize(sentence)  # Tokenize the sentence into words
    return ' '.join([lemmatizer.lemmatize(word) for word in words])  # Lemmatize each word and join them back into a string

df['Lemmatized_Text'] = df['Cleaned_Text'].apply(lemmatize_words)
df.head()

Unnamed: 0,text,target,word_token,Cleaned_Text,Lemmatized_Text
0,communal violence in bhainsa telangana stones were pelted on muslims houses and some houses and vehicles were set ablaze,1,"[communal, violence, in, bhainsa, telangana, stones, were, pelted, on, muslims, houses, and, some, houses, and, vehicles, were, set, ablaze]",communal violence bhainsa telangana stones pelted muslims houses houses vehicles set ablaze,communal violence bhainsa telangana stone pelted muslim house house vehicle set ablaze
1,telangana section has been imposed in bhainsa from january to after clash erupted between two groups on january po,1,"[telangana, section, has, been, imposed, in, bhainsa, from, january, to, after, clash, erupted, between, two, groups, on, january, po]",telangana section imposed bhainsa january clash erupted two groups january po,telangana section imposed bhainsa january clash erupted two group january po
2,arsonist sets cars ablaze at dealership,1,"[arsonist, sets, cars, ablaze, at, dealership]",arsonist sets cars ablaze dealership,arsonist set car ablaze dealership
3,arsonist sets cars ablaze at dealership,1,"[arsonist, sets, cars, ablaze, at, dealership]",arsonist sets cars ablaze dealership,arsonist set car ablaze dealership
4,lord jesus your love brings freedom and pardon fill me with your holy spirit and set my heart ablaze with your l,0,"[lord, jesus, your, love, brings, freedom, and, pardon, fill, me, with, your, holy, spirit, and, set, my, heart, ablaze, with, your, l]",lord jesus love brings freedom pardon fill holy spirit set heart ablaze l,lord jesus love brings freedom pardon fill holy spirit set heart ablaze l


#### POS Tagging

In [90]:

nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [91]:
def pos_tag_sentence(sentence):
    words = nltk.word_tokenize(sentence)  # Tokenize the sentence into words
    return nltk.pos_tag(words)  # Perform POS tagging

# Apply the function to the 'Text' column
df['POS_Tags'] = df['Lemmatized_Text'].apply(pos_tag_sentence)
df.head()

Unnamed: 0,text,target,word_token,Cleaned_Text,Lemmatized_Text,POS_Tags
0,communal violence in bhainsa telangana stones were pelted on muslims houses and some houses and vehicles were set ablaze,1,"[communal, violence, in, bhainsa, telangana, stones, were, pelted, on, muslims, houses, and, some, houses, and, vehicles, were, set, ablaze]",communal violence bhainsa telangana stones pelted muslims houses houses vehicles set ablaze,communal violence bhainsa telangana stone pelted muslim house house vehicle set ablaze,"[(communal, JJ), (violence, NN), (bhainsa, NN), (telangana, NN), (stone, NN), (pelted, VBD), (muslim, JJ), (house, NN), (house, NN), (vehicle, NN), (set, VBN), (ablaze, NN)]"
1,telangana section has been imposed in bhainsa from january to after clash erupted between two groups on january po,1,"[telangana, section, has, been, imposed, in, bhainsa, from, january, to, after, clash, erupted, between, two, groups, on, january, po]",telangana section imposed bhainsa january clash erupted two groups january po,telangana section imposed bhainsa january clash erupted two group january po,"[(telangana, JJ), (section, NN), (imposed, VBN), (bhainsa, IN), (january, JJ), (clash, NN), (erupted, VBD), (two, CD), (group, NN), (january, JJ), (po, NN)]"
2,arsonist sets cars ablaze at dealership,1,"[arsonist, sets, cars, ablaze, at, dealership]",arsonist sets cars ablaze dealership,arsonist set car ablaze dealership,"[(arsonist, NN), (set, NN), (car, NN), (ablaze, NN), (dealership, NN)]"
3,arsonist sets cars ablaze at dealership,1,"[arsonist, sets, cars, ablaze, at, dealership]",arsonist sets cars ablaze dealership,arsonist set car ablaze dealership,"[(arsonist, NN), (set, NN), (car, NN), (ablaze, NN), (dealership, NN)]"
4,lord jesus your love brings freedom and pardon fill me with your holy spirit and set my heart ablaze with your l,0,"[lord, jesus, your, love, brings, freedom, and, pardon, fill, me, with, your, holy, spirit, and, set, my, heart, ablaze, with, your, l]",lord jesus love brings freedom pardon fill holy spirit set heart ablaze l,lord jesus love brings freedom pardon fill holy spirit set heart ablaze l,"[(lord, NN), (jesus, NN), (love, VBP), (brings, NNS), (freedom, NN), (pardon, NN), (fill, NN), (holy, JJ), (spirit, NN), (set, VBN), (heart, NN), (ablaze, NN), (l, NN)]"


#### Named Entity Recognition

In [92]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [93]:
def extract_entities(text):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

df['Named_Entities'] = df['Lemmatized_Text'].apply(extract_entities)
df.head()

Unnamed: 0,text,target,word_token,Cleaned_Text,Lemmatized_Text,POS_Tags,Named_Entities
0,communal violence in bhainsa telangana stones were pelted on muslims houses and some houses and vehicles were set ablaze,1,"[communal, violence, in, bhainsa, telangana, stones, were, pelted, on, muslims, houses, and, some, houses, and, vehicles, were, set, ablaze]",communal violence bhainsa telangana stones pelted muslims houses houses vehicles set ablaze,communal violence bhainsa telangana stone pelted muslim house house vehicle set ablaze,"[(communal, JJ), (violence, NN), (bhainsa, NN), (telangana, NN), (stone, NN), (pelted, VBD), (muslim, JJ), (house, NN), (house, NN), (vehicle, NN), (set, VBN), (ablaze, NN)]","[(bhainsa telangana stone, PERSON), (muslim house, ORG)]"
1,telangana section has been imposed in bhainsa from january to after clash erupted between two groups on january po,1,"[telangana, section, has, been, imposed, in, bhainsa, from, january, to, after, clash, erupted, between, two, groups, on, january, po]",telangana section imposed bhainsa january clash erupted two groups january po,telangana section imposed bhainsa january clash erupted two group january po,"[(telangana, JJ), (section, NN), (imposed, VBN), (bhainsa, IN), (january, JJ), (clash, NN), (erupted, VBD), (two, CD), (group, NN), (january, JJ), (po, NN)]","[(telangana section, PERSON), (january, DATE), (two, CARDINAL), (january, DATE)]"
2,arsonist sets cars ablaze at dealership,1,"[arsonist, sets, cars, ablaze, at, dealership]",arsonist sets cars ablaze dealership,arsonist set car ablaze dealership,"[(arsonist, NN), (set, NN), (car, NN), (ablaze, NN), (dealership, NN)]",[]
3,arsonist sets cars ablaze at dealership,1,"[arsonist, sets, cars, ablaze, at, dealership]",arsonist sets cars ablaze dealership,arsonist set car ablaze dealership,"[(arsonist, NN), (set, NN), (car, NN), (ablaze, NN), (dealership, NN)]",[]
4,lord jesus your love brings freedom and pardon fill me with your holy spirit and set my heart ablaze with your l,0,"[lord, jesus, your, love, brings, freedom, and, pardon, fill, me, with, your, holy, spirit, and, set, my, heart, ablaze, with, your, l]",lord jesus love brings freedom pardon fill holy spirit set heart ablaze l,lord jesus love brings freedom pardon fill holy spirit set heart ablaze l,"[(lord, NN), (jesus, NN), (love, VBP), (brings, NNS), (freedom, NN), (pardon, NN), (fill, NN), (holy, JJ), (spirit, NN), (set, VBN), (heart, NN), (ablaze, NN), (l, NN)]","[(jesus, PERSON)]"
