In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# --- Standard library ---
import os
import warnings
import re
import string
# Silence every warning for the rest of the notebook
warnings.filterwarnings("ignore")
# --- spaCy: load the small English pipeline; `nlp` is used by the parsing cells below ---
import spacy
nlp = spacy.load('en_core_web_sm')
# --- NLTK: download the stopword and WordNet corpora, then build the helpers ---
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# `stop` is the list of English stopwords
stop= stopwords.words('english')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
# `lemm` is a shared WordNet lemmatizer instance
lemm= WordNetLemmatizer()
from textblob import TextBlob
import bs4
import requests
# --- spaCy helpers, graph and plotting libraries ---
from spacy import displacy
from spacy.matcher import Matcher 
from spacy.tokens import Span 
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm
# Render matplotlib figures inside the notebook
%matplotlib inline

In [None]:
%%time
# Load the train and test CSV files with pandas
train = pd.read_csv(
    '/kaggle/input/twitter-sentiment-analysis-hatred-speech/train.csv',
    encoding="ISO-8859-1",
)
test = pd.read_csv(
    '/kaggle/input/twitter-sentiment-analysis-hatred-speech/test.csv',
    encoding="ISO-8859-1",
)
# Stack both splits into a single frame
df = pd.concat([train, test])
print("Train data frame shape:", train.shape)
print("Test data frame shape:", test.shape)
print("Complete data frame shape:", df.shape)

In [None]:
df.head()

In [None]:
train["label"].value_counts() # number of tweets per label; the author observes a class-imbalance problem here

# Text Cleaning

I am going to cover text cleaning techniques one by one in detail.

The first step of text analytics is to clean the text of noise. You may be wondering what noise in text is. A simple definition: noise is anything that will not be useful in our analysis. If I try to classify noise, it falls into 4 types:
1. Common Entities- Things like stopwords(is, the etc.), URLs, Hashtags, Punctuations, Numbers etc.
2. Slangs- Commonly used words that are not part of dictionaries
3. Grammatical and spelling errors
4. Keyword Variations

Text cleaning not only helps us get rid of noise or repetitive information but also reduces dimension of data and makes machine learning model simpler.
We will now remove noise one by one. I will show you how you can clean text at once by creating user defined functions.

**Maintain uniformity**
1. *Fixing encoding*- As you may have noticed, we already read the data using the 'ISO-8859-1' encoding because we are dealing with tweets here. Text in different languages is often stored in different encodings, such as 'ASCII' for English, 'BIG5' for Chinese and 'Latin-1' (ISO-8859-1) for Western European languages. It is always good practice to convert text into one standard format. One of the most commonly used formats is 'UTF-8'.
2. *Change casing*- To maintain uniformity, it is always advisable to convert the text to lower case. 

In [None]:
# Lower-case every tweet so that casing differences do not produce distinct tokens
lowered = df["tweet"].str.lower()
df = df.assign(tweet=lowered)
df.head()

3. Removal of HTML Noise/characters- 

If you are extracting text from the web, it might contain some HTML noise such as tags (`<...>`) etc.

In [None]:
# Strip anything that looks like an HTML tag (<...>) from the tweets
tag_pattern = "<[^<]+?>"
df['tweet'] = df['tweet'].str.replace(tag_pattern, "", regex=True)
df.head()

4. Contractions

Because of the character limit on Twitter, people often use contracted forms of words to fit more text in. I have managed to get a dictionary of such words. Let's look them up and expand them to normalize the text.

In [None]:
contractions = { 
"ain't": "am not / are not / is not / has not / have not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is / how does",
"i'd": "i had / i would",
"i'd've": "i would have",
"i'll": "i shall / I will",
"i'll've": "i shall have / i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",  
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}

def _contraction_pattern(mapping):
    """Compile a regex that matches any key of ``mapping`` as a whole word.

    Keys are tried longest-first so that a long contraction such as
    "can't've" wins over its own prefix "can't" (otherwise the prefix is
    expanded and a stray "'ve" is left behind). The look-arounds keep a key
    from matching inside a longer word (e.g. "it's" inside "spirit's"), and
    ``re.escape`` protects against keys containing regex metacharacters.
    """
    keys = sorted(mapping, key=len, reverse=True)
    alternation = '|'.join(re.escape(key) for key in keys)
    return re.compile(r"(?<!\w)(%s)(?!\w)" % alternation)


cont_re = _contraction_pattern(contractions)


def expand(s, contractions = contractions):
    """Replace every contraction found in ``s`` by its expanded form.

    Parameters
    ----------
    s : str
        Text to normalize (expected to be lower-cased already, because the
        dictionary keys are lower case).
    contractions : dict
        Mapping from contraction to expansion. Matching always uses the
        module-level ``cont_re``; a matched word missing from a custom
        mapping is left unchanged instead of raising ``KeyError``.

    Returns
    -------
    str
        ``s`` with all known contractions expanded.
    """
    def replace(match):
        found = match.group(0)
        return contractions.get(found, found)
    return cont_re.sub(replace, s)

df['tweet'] =df['tweet'].apply(expand)
df.head()

5. Removal of URLs, Hashtag, Mentions-

URLs, punctuation, newline characters etc. are very common in text data. In most analyses these characters rarely carry any importance.

In [None]:
# Remove URLs. The pattern is a raw string so that \S and \. reach the regex
# engine unchanged instead of being treated as (invalid) string escapes.
df['tweet']=df['tweet'].str.replace(r'https?://\S+|www\.\S+', '',regex=True)
# Remove punctuation; this also strips the '#' of hashtags and the '@' of mentions
df['tweet']=df['tweet'].str.replace('[%s]' % re.escape(string.punctuation), '', regex=True)
# Replace newline characters with a space so that the words on either side
# of the line break are not glued together
df['tweet']=df['tweet'].str.replace('\n', ' ', regex=False)
df.head()

6. Remove Emojis

Emojis are common in tweets or any social media data. We can remove them or analyze separately.

In [None]:
#for this analysis, I am removing the emojis
# Compiled once at definition time instead of on every call.
# The original catch-all range U+24C2..U+1F251 spanned most of the Basic
# Multilingual Plane and therefore also deleted ordinary text such as CJK or
# Hangul characters; it is replaced by the symbol blocks it was meant to cover.
_EMOJI_RE = re.compile("["
                       "\U0001F600-\U0001F64F"  # emoticons
                       "\U0001F300-\U0001F5FF"  # symbols & pictographs
                       "\U0001F680-\U0001F6FF"  # transport & map symbols
                       "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       "\U00002702-\U000027B0"  # dingbats
                       "\U000024C2"             # circled letter M
                       "\U000025A0-\U000025FF"  # geometric shapes
                       "\U00002600-\U000026FF"  # miscellaneous symbols
                       "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
                       "\U0000FE0F"             # emoji variation selector
                       "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs
                       "]+", flags=re.UNICODE)


def emojis(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        ``text`` without the characters matched by ``_EMOJI_RE``; all other
        characters, including non-Latin scripts, are left untouched.
    """
    return _EMOJI_RE.sub(r'', text)

# Run the emoji stripper over every tweet
df['tweet'] = df['tweet'].apply(emojis)
df.head()

7. Stopwords Removal

Stopwords are common and frequently used words in sentence. They add very little value to the analysis and increase dimensionality of the data 

In [None]:
# Drop stopwords word by word and store the result back on the frame.
# The previous version iterated over the *characters* of each tweet and threw
# the result away, so no stopword was ever removed.
# A set gives constant-time membership tests instead of scanning the list.
stop_words = set(stop)
df['tweet'] = df['tweet'].apply(
    lambda x: " ".join(word for word in x.split() if word not in stop_words)
)
# NOTE(review): the dependency parsing cells further down now receive tweets
# without function words - confirm this is the intended input for them.
df.head()

8. Normalization (Lemmatization)

Inflectional forms of a word, such as better and best for good, are often found in text data. They increase the dimensionality of the data and add little value.

In [None]:
def lemmatization(df):
    """Return a copy of ``df`` whose ``tweet`` column is lemmatized word by word.

    Each tweet is split on whitespace, every word is passed through the shared
    WordNet lemmatizer ``lemm`` and the words are joined back with single
    spaces. ``lemm.lemmatize`` is called without a part-of-speech argument, so
    NLTK's default (noun) applies.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``tweet``.

    Returns
    -------
    pandas.DataFrame
        New frame with the lemmatized ``tweet`` column; ``df`` itself is not
        modified.
    """
    lemmatized = df['tweet'].apply(
        lambda x: " ".join(lemm.lemmatize(word) for word in x.split())
    )
    return df.assign(tweet=lemmatized)

# The function used to be defined but never called, so no lemmatization happened
df = lemmatization(df)
df.head()

9. Additional Repetitive word

We know from the problem statement that names have been replaced by the word "user". Therefore it is right to exclude it as well.

In [None]:
# Remove the placeholder token "user" as a whole word only, so that words that
# merely contain it (e.g. "users") are left intact, then collapse the
# whitespace left behind by the removal.
df['tweet'] = (
    df['tweet']
    .str.replace(r'\buser\b', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)
df.head()

In [None]:
# Keep only the first 5000 tweets (by position) for the graph
tweets = df['tweet'].iloc[:5000]

In semantic web terms, a sentence consists of a subject, a predicate and an object, together known as a triple. With the code below, we are going to extract the subject and object from each sentence and form a knowledge graph. If you want to read more about the code below,
please see the attached [link](https://www.analyticsvidhya.com/blog/2019/10/how-to-build-knowledge-graph-text-using-spacy/)- an article by Prateek Joshi

In [None]:
def get_entities(sent):
    """Extract a ``[subject, object]`` pair from a sentence via its dependency parse.

    The sentence is parsed with the module-level spaCy pipeline ``nlp`` and its
    tokens are scanned left to right. "compound" tokens and modifier tokens
    (dependency label ending in "mod") are remembered and prepended to the
    next subject or object token that is found.

    Parameters
    ----------
    sent : str
        Sentence to analyse.

    Returns
    -------
    list of str
        ``[subject, object]``. An entry is the empty string when no matching
        token was found; when several tokens match, the last one wins.
    """
    ent1 = ""
    ent2 = ""

    prv_tok_dep = ""    # dependency tag of the previous token
    prv_tok_text = ""   # text of the previous token

    prefix = ""
    modifier = ""

    def _phrase(*parts):
        # Join only the non-empty parts so that a missing modifier or prefix
        # does not leave doubled spaces inside the entity name
        return " ".join(part for part in parts if part)

    for tok in nlp(sent):
        # punctuation carries no information here
        if tok.dep_ == "punct":
            continue

        # compound word: remember it, merged with a directly preceding compound
        if tok.dep_ == "compound":
            prefix = tok.text
            if prv_tok_dep == "compound":
                prefix = prv_tok_text + " " + tok.text

        # modifier: remember it, merged with a directly preceding compound
        if tok.dep_.endswith("mod"):
            modifier = tok.text
            if prv_tok_dep == "compound":
                modifier = prv_tok_text + " " + tok.text

        # subject. A plain substring test replaces `find("subj") == True`,
        # which only matched when the substring started at index 1.
        if "subj" in tok.dep_:
            ent1 = _phrase(modifier, prefix, tok.text)
            prefix = ""
            modifier = ""

        # object (same substring test as for the subject)
        if "obj" in tok.dep_:
            ent2 = _phrase(modifier, prefix, tok.text)

        # remember this token for the next iteration
        prv_tok_dep = tok.dep_
        prv_tok_text = tok.text

    return [ent1.strip(), ent2.strip()]

In [None]:
# Run the subject/object extractor over every tweet (tqdm shows a progress bar)
entity_pairs = [get_entities(i) for i in tqdm(tweets)]

entity_pairs[10:20]

Similar to subject and object, let's extract predicates as well

In [None]:
def predicates(sent):
    """Return the predicate (root verb phrase) of a sentence.

    The sentence is parsed with the module-level spaCy pipeline ``nlp`` and a
    ``Matcher`` looks for the ROOT token optionally followed by a preposition,
    an agent and an adjective. The text of the last match is returned.

    Parameters
    ----------
    sent : str
        Sentence to analyse.

    Returns
    -------
    str
        Text of the last matching span, or the empty string when nothing
        matches (for example for an empty sentence).
    """
    doc = nlp(sent)
    # Matcher class object
    matcher = Matcher(nlp.vocab)

    # define the pattern
    pattern = [{'DEP':'ROOT'},
            {'DEP':'prep','OP':"?"},
            {'DEP':'agent','OP':"?"},
            {'POS':'ADJ','OP':"?"}]

    # spaCy v3 changed the signature to add(key, patterns); the v2 form
    # add(key, on_match, *patterns) is kept for older installations
    if int(spacy.__version__.split(".")[0]) >= 3:
        matcher.add("matching_1", [pattern])
    else:
        matcher.add("matching_1", None, pattern)

    matches = matcher(doc)
    # Without this guard an empty match list raised IndexError below
    if not matches:
        return ""

    _, start, end = matches[-1]
    span = doc[start:end]

    return(span.text)

In [None]:
# This cell rebinds the name `predicates` from the extractor function to the
# list of results. Keep a handle on the function so that re-running the cell
# still works instead of failing with "'list' object is not callable".
if callable(predicates):
    _predicates_fn = predicates
predicates = [_predicates_fn(i) for i in tqdm(tweets)]

In [None]:
# Ten most frequent predicates
predicate_counts = pd.Series(predicates).value_counts()
predicate_counts.head(10)

Let's now create a data frame with subject predicate and object

In [None]:
# Split every [subject, object] pair into two parallel lists
subject = [pair[0] for pair in entity_pairs]
object1 = [pair[1] for pair in entity_pairs]

# One row per tweet: subject, predicate, object
graph_df = pd.DataFrame(
    {
        'subject': subject,
        'predicate': predicates,
        'object': object1,
    }
)

We will now create a multi directed knowledge graph using networkx library. If you want to read more about the library, please click the below link

https://networkx.github.io/

In [None]:
# Build a directed multigraph from the frame: one edge per row, pointing from
# the subject to the object; edge_attr=True attaches the other columns to the edge
graph = nx.from_pandas_edgelist(
    graph_df,
    source="subject",
    target="object",
    edge_attr=True,
    create_using=nx.MultiDiGraph(),
)

In [None]:
# Plot the graph with networkx and matplotlib
plt.figure(figsize=(12,12))

# spring_layout starts from random positions; a fixed seed makes the figure
# identical on every run
pos = nx.spring_layout(graph, seed=42)
nx.draw(graph, with_labels=True, node_color='Cyan', edge_cmap=plt.cm.Blues, pos = pos)
plt.show()

Let's perform some graph analytics and see what people are referring to with "love" in the dataset 

In [None]:
# Sub-graph of the triples whose predicate is exactly "love". It gets its own
# name so that the full `graph` built earlier is not overwritten and the
# earlier plotting cell still shows the full graph when re-run.
love_graph=nx.from_pandas_edgelist(graph_df[graph_df['predicate']=="love"], "subject", "object",
                          edge_attr=True, create_using=nx.MultiDiGraph())

plt.figure(figsize=(12,12))
# k regulates the distance between nodes; the fixed seed makes the layout
# identical on every run
pos = nx.spring_layout(love_graph, k = 0.5, seed=42)
nx.draw(love_graph, with_labels=True, node_color='skyblue', node_size=1500, edge_cmap=plt.cm.Blues, pos = pos)
plt.show()

I hope it was helpful. Please upvote for support.
Also, refer to my other notebooks if you are looking for word embedding techniques Word2Vec, GloVe etc. and text classification.