In [1]:
# author: Shril Kumar [shril.iitdhn@gmail.com]

# Tokenization - process of converting a text into tokens
# Tokens       - words or entities present in the text (entities: Paragraph, Sentences)
# Text object  – a sentence or a phrase or a word or an article

In [2]:
# Text Preprocessing
# 1. Noise Removal - Stopwords, URLs, Punctuations, mentions etc.
# 2. Lexicon Normalization - Tokenization, Lemmatization, Stemming
# 3. Object Standardization - Regular Expressions, Lookup Tables

In [9]:
# Noise Removal

input_text = "the problem of fake news is important and it should be taken care of as soon as possible"
noise_list = ["is", "a", "the", "and", "of", "it"]

def remove_noise(text, noise_words=None):
    """Return `text` with every noise word removed.

    The text is split on whitespace, tokens that exactly match a noise word
    (the comparison is case-sensitive) are dropped, and the remaining tokens
    are joined back together with single spaces.

    Args:
        text: The string to clean.
        noise_words: Optional iterable of words to drop. When omitted, the
            module-level `noise_list` is used.

    Returns:
        The cleaned string; empty if nothing survives.
    """
    if noise_words is None:
        noise_words = noise_list
    # A set gives constant-time membership tests regardless of list length.
    noise = set(noise_words)
    return " ".join(word for word in text.split() if word not in noise)

print(remove_noise(input_text))

problem fake news important should be taken care as soon as possible


In [18]:
# Lexicon Normalization Practices
# 1. Stemming - Removing suffixes (“ing”, “ly”, “es”, “s” etc) from a word.
# 2. Lemmatization - step by step procedure of obtaining the root form of the word

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# One lemmatizer and one stemmer instance, reused for every word below.
lem = WordNetLemmatizer()
stem = PorterStemmer()

words = ["running","lying","carrying"]
for w in words:
    # The second argument is the part of speech of the word ("v").
    lemma = lem.lemmatize(w, "v")
    print("Lemmatizing: ", lemma)
    stemmed = stem.stem(w)
    print("Stemming   : ", stemmed)

Lemmatizing:  run
Stemming   :  run
Lemmatizing:  lie
Stemming   :  lie
Lemmatizing:  carry
Stemming   :  carri


In [20]:
# Object Standardization
# Text data often contains words or phrases which are not present in any standard lexical dictionaries.
# Examples include acronyms, hashtags with attached words, and colloquial slang.

lookup_dict = {'rt':'Retweet', 'dm':'direct message', 'awsm' : 'awesome', 'luv' :'love'}

def lookup_words(text, lookup=None):
    """Replace known shorthand words in `text` with their standard form.

    The text is split on whitespace; each token is lower-cased and looked up
    in the table. Matching tokens are replaced by the table value, all other
    tokens are kept exactly as written. Tokens are re-joined with single
    spaces.

    Args:
        text: The string to standardize.
        lookup: Optional mapping from lower-case shorthand to replacement
            text. When omitted, the module-level `lookup_dict` is used.

    Returns:
        The standardized string.
    """
    if lookup is None:
        lookup = lookup_dict
    # dict.get falls back to the original token when there is no entry.
    return " ".join(lookup.get(word.lower(), word) for word in text.split())

print(lookup_words("DM me this is a rt tweet by Shril"))

direct message me this is a Retweet tweet by Shril


In [29]:
# Text to Features
# POS = Parts of Speech

from nltk import word_tokenize, pos_tag

text = "I am learning Natural Language Processing on Analytics Vidhya"
tokens = word_tokenize(text)

# Tag the tokens, then print one tagged pair per line.
tagged_tokens = pos_tag(tokens)
for pos in tagged_tokens:
    print(pos)

('I', 'PRP')
('am', 'VBP')
('learning', 'VBG')
('Natural', 'NNP')
('Language', 'NNP')
('Processing', 'NNP')
('on', 'IN')
('Analytics', 'NNP')
('Vidhya', 'NNP')


In [39]:
# Entity Detection Methods
# 1. Named Entity Recognition (NER)
# 2. Topic Modeling

# The following code implements topic modeling using LDA in Python.

doc1 = "Sugar is bad to consume. My sister likes to have sugar, but not my father."
doc2 = "My father spends a lot of time driving my sister around to dance practice."
doc3 = "Doctors suggest that driving may cause increased stress and blood pressure."
doc_complete = [doc1, doc2, doc3]

# NOTE: tokens come from a plain whitespace split, so case and attached
# punctuation are preserved ("Sugar" and "sugar," become distinct terms).
doc_clean = [doc.split() for doc in doc_complete]

import gensim
import gensim.corpora as corpora

# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and Training LDA model on the document term matrix.
# LDA training is randomized; fixing random_state makes the topics
# reproducible across kernel restarts.
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50, random_state=42)

print(ldamodel.print_topics())

[(0, '0.029*"driving" + 0.029*"father" + 0.029*"practice." + 0.029*"lot" + 0.029*"time" + 0.029*"spends" + 0.029*"of" + 0.029*"a" + 0.029*"dance" + 0.029*"around"'), (1, '0.060*"driving" + 0.060*"cause" + 0.060*"Doctors" + 0.060*"and" + 0.060*"that" + 0.060*"blood" + 0.060*"increased" + 0.060*"stress" + 0.060*"pressure." + 0.060*"may"'), (2, '0.083*"to" + 0.058*"My" + 0.058*"sister" + 0.058*"my" + 0.033*"is" + 0.033*"sugar," + 0.033*"not" + 0.033*"Sugar" + 0.033*"consume." + 0.033*"but"')]


In [40]:
# N-Grams as Features

def generate_ngrams(text, n):
    """Split `text` on whitespace and return its contiguous word n-grams.

    Args:
        text: The input string.
        n: Number of words per n-gram; must be at least 1.

    Returns:
        A list of n-grams in order of appearance, each one a list of `n`
        words. The list is empty when the text has fewer than `n` words.

    Raises:
        ValueError: If `n` is smaller than 1.
    """
    if n < 1:
        # Without this guard, n == 0 yields a run of empty lists and a
        # negative n yields arbitrary slices instead of n-grams.
        raise ValueError("n must be a positive integer, got %r" % (n,))
    words = text.split()
    return [words[i:i + n] for i in range(len(words) - n + 1)]

print(generate_ngrams('this is a sample text', 2))

[['this', 'is'], ['is', 'a'], ['a', 'sample'], ['sample', 'text']]


In [43]:
# Statistical Features
# TF-IDF Link: https://www.analyticsvidhya.com/blog/2015/04/information-retrieval-system-explained/

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'This is sample document.',
    'another random document.',
    'third sample document text',
]
obj = TfidfVectorizer()
X = obj.fit_transform(corpus)

print(X)

# Fitting the vectorizer builds a vocabulary dictionary that gives every word an index.
# Each printed row is a tuple (i, j) followed by the tf-idf value of the word with index j in document i.

  (0, 7)	0.58448290102
  (0, 2)	0.58448290102
  (0, 4)	0.444514311537
  (0, 1)	0.345205016865
  (1, 1)	0.385371627466
  (1, 0)	0.652490884513
  (1, 3)	0.652490884513
  (2, 4)	0.444514311537
  (2, 1)	0.345205016865
  (2, 6)	0.58448290102
  (2, 5)	0.58448290102


In [44]:
# Text Classification

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn import svm

# Labelled examples as (sentence, class label) pairs.
training_corpus = [
    ('I am exhausted of this work.', 'Class_B'),
    ("I can't cooperate with this", 'Class_B'),
    ('He is my badest enemy!', 'Class_B'),
    ('My management is poor.', 'Class_B'),
    ('I love this burger.', 'Class_A'),
    ('This is an brilliant place!', 'Class_A'),
    ('I feel very good about these dates.', 'Class_A'),
    ('This is my best work.', 'Class_A'),
    ("What an awesome view", 'Class_A'),
    ('I do not like this dish', 'Class_B'),
]
test_corpus = [
    ("I am not feeling well today.", 'Class_B'),
    ("I feel brilliant!", 'Class_A'),
    ('Gary is a friend of mine.', 'Class_A'),
    ("I can't believe I'm doing this.", 'Class_B'),
    ('The date was good.', 'Class_A'),
    ('I do not enjoy my job', 'Class_B'),
]

# Separate every corpus into a list of sentences and a parallel list of labels.
train_data = [sentence for sentence, _label in training_corpus]
train_labels = [label for _sentence, label in training_corpus]

test_data = [sentence for sentence, _label in test_corpus]
test_labels = [label for _sentence, label in test_corpus]

# Fit the TF-IDF vectorizer on the training sentences and vectorize them in one step.
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train_data)
# Vectorize the test sentences with the already-fitted vectorizer.
test_vectors = vectorizer.transform(test_data)

# Fit an SVM with a linear kernel on the training vectors.
model = svm.SVC(kernel='linear')
model.fit(train_vectors, train_labels)

# Predict the test labels and report the per-class metrics.
prediction = model.predict(test_vectors)
print(classification_report(test_labels, prediction))

             precision    recall  f1-score   support

    Class_A       1.00      0.67      0.80         3
    Class_B       0.75      1.00      0.86         3

avg / total       0.88      0.83      0.83         6

