<a href="https://colab.research.google.com/github/rajuiit/NLP-TF-IDF-and-Bigram-Text-Processing/blob/main/nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
# Download NLTK's 'punkt' tokenizer data so the tokenization calls in later cells can run.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import nltk
from nltk.util import ngrams
 
# Function to generate n-grams from sentences.
def extract_ngrams(data, num):
    """Return the word-level n-grams of a text as space-joined strings.

    :param data: the text to split; it is tokenized with ``nltk.word_tokenize``.
    :param num: the n-gram size (1 = unigrams, 2 = bigrams, ...).
    :return: list with one string per n-gram, its tokens joined by a single space.
    """
    words = nltk.word_tokenize(data)
    joined = []
    for gram in ngrams(words, num):
        joined.append(' '.join(gram))
    return joined
 
data = 'must be the truth'

# Print every n-gram size from unigrams up to the 4-gram that spans the whole
# four-word sample.
for n in range(1, 5):
    print(f"{n}-gram: ", extract_ngrams(data, n))

1-gram:  ['must', 'be', 'the', 'truth']
2-gram:  ['must be', 'be the', 'the truth']
3-gram:  ['must be the', 'be the truth']
4-gram:  ['must be the truth']


# TF-IDF

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from collections import Counter
# Toy corpus: four short documents whose "words" are single letters separated by spaces.
docs = np.array([
    'a a b c',
    'a c c c d e f',
    'a c d d d',
    'a d f'
])

In [None]:
# Count whole words rather than characters. analyzer='char' also counts the
# spaces between words as a feature, which adds a spurious "term" to the matrix.
# CountVectorizer's default token_pattern ignores one-character tokens, so pass a
# pattern that accepts the single-letter words used in this toy corpus.
countVectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
# Dense (n_documents x n_terms) matrix of raw term counts.
tf = countVectorizer.fit_transform(docs).toarray()
tf

array([[3, 2, 1, 1, 0, 0, 0],
       [6, 1, 0, 3, 1, 1, 1],
       [4, 1, 0, 1, 3, 0, 0],
       [2, 1, 0, 0, 1, 0, 1]])

In [None]:
# Manually compute the tf-idf scores for the third document (row index 2 of tf).
n_docs = len(docs)
# Document frequency: the number of documents in which each feature has a non-zero count.
df = np.sum(tf != 0, axis = 0)
# Unsmoothed idf using the natural logarithm, shifted by one: ln(N / df) + 1.
idf = np.log(n_docs / df) + 1
# Element-wise product of the document's raw counts and the per-feature idf weights.
tf_idf = tf[2] * idf
print(tf_idf)

[4.         1.         0.         1.28768207 3.86304622 0.
 0.        ]


In [None]:
import pandas as pd
import sklearn as sk
import math
import nltk
from nltk.corpus import stopwords

In [None]:
def computeTF(wordDict, doc):
    """
    Compute the term frequency of every word for a single document.

        tf(t,d) = count of t in d / number of words in d

    :param wordDict: dict mapping each word to its raw count in ``doc``.
    :param doc: the tokenized document; only its length is used, as the
        total number of words.
    :return: dict mapping each word in ``wordDict`` to its relative frequency.
        For an empty ``doc`` every frequency is 0.0 (previously this raised
        ``ZeroDivisionError``).
    """
    totalWords = len(doc)
    if totalWords == 0:
        # An empty document contains no terms, so every frequency is zero.
        return {word: 0.0 for word in wordDict}
    return {word: count / float(totalWords) for word, count in wordDict.items()}

def computeIDF(docList):
    """
    Compute the inverse document frequency of every word in the corpus.

        idf(t) = log10(N / (df(t) + 1))

    where N is the number of documents and df(t) is the number of documents
    in which word t has a non-zero count. Because of the "+ 1" in the
    denominator, a word that occurs in every document gets a slightly
    negative idf.

    :param docList: list of per-document dicts mapping word -> raw count.
    :return: dict mapping every word seen in any document to its idf value;
        an empty dict when ``docList`` is empty.
    """
    N = len(docList)

    # Count, for each word, how many documents actually contain it. Words with
    # a zero count still get an entry so the vocabulary is preserved.
    docFreq = {}
    for wordDict in docList:
        for word, count in wordDict.items():
            docFreq[word] = docFreq.get(word, 0) + (1 if count > 0 else 0)

    return {word: math.log10(N / (float(freq) + 1)) for word, freq in docFreq.items()}

def computeTFIDF(tfBow, idfs):
    """
    Weight one document's term frequencies by the corpus idf values.

        tf-idf(t, d) = tf(t, d) * idf(t)

    :param tfBow: dict mapping word -> term frequency for a single document.
    :param idfs: dict mapping word -> idf; it must contain every word of ``tfBow``.
    :return: dict mapping each word of ``tfBow`` to its tf-idf score.
    """
    return {term: freq * idfs[term] for term, freq in tfBow.items()}

In [None]:
# The four toy documents as plain strings (same text as the `docs` array above),
# used as input for the from-scratch TF-IDF helpers.
sentence1 = "a a b c"
sentence2 = "a c c c d e f"
sentence3 = "a c d d d"
sentence4 = "a d f"

In [None]:
# Tokenize every sentence into a list of word strings.
sentence1_list, sentence2_list, sentence3_list, sentence4_list = [
    nltk.word_tokenize(text)
    for text in (sentence1, sentence2, sentence3, sentence4)
]
# Vocabulary: every distinct token that occurs in any of the four sentences.
total = set().union(sentence1_list, sentence2_list, sentence3_list, sentence4_list)
print(total)

{'e', 'd', 'a', 'f', 'b', 'c'}


In [None]:
def create_word_dict(total, sentence):
    """Count how often each vocabulary word occurs in a tokenized sentence.

    :param total: iterable of all vocabulary words; each becomes a key with count 0.
    :param sentence: iterable of tokens to count; every token must be in ``total``,
        otherwise a ``KeyError`` is raised.
    :return: dict mapping every vocabulary word to its number of occurrences.
    """
    counts = {term: 0 for term in total}
    for token in sentence:
        counts[token] = counts[token] + 1
    return counts

# One count dict per sentence, all sharing the same vocabulary keys.
wordDictA, wordDictB, wordDictC, wordDictD = [
    create_word_dict(total, token_list)
    for token_list in (sentence1_list, sentence2_list, sentence3_list, sentence4_list)
]

# Show the four dicts separated by blank lines.
print(wordDictA, wordDictB, wordDictC, wordDictD, sep="\n\n")

{'e': 0, 'd': 0, 'a': 2, 'f': 0, 'b': 1, 'c': 1}

{'e': 1, 'd': 1, 'a': 1, 'f': 1, 'b': 0, 'c': 3}

{'e': 0, 'd': 3, 'a': 1, 'f': 0, 'b': 0, 'c': 1}

{'e': 0, 'd': 1, 'a': 1, 'f': 1, 'b': 0, 'c': 0}


In [None]:
# Term frequencies for each sentence: raw counts divided by the sentence length.
tfFirst, tfSecond, tfThird, tfFour = [
    computeTF(counts, words)
    for counts, words in (
        (wordDictA, sentence1_list),
        (wordDictB, sentence2_list),
        (wordDictC, sentence3_list),
        (wordDictD, sentence4_list),
    )
]

# end="\n\n" leaves a blank line between consecutive reports.
print("TERM FREQUENCY OF SENTENCE1:\n", tfFirst, end="\n\n")
print("TERM FREQUENCY OF SENTENCE2:\n", tfSecond, end="\n\n")
print("TERM FREQUENCY OF SENTENCE3:\n", tfThird, end="\n\n")
print("TERM FREQUENCY OF SENTENCE4:\n", tfFour)

TERM FREQUENCY OF SENTENCE1:
 {'e': 0.0, 'd': 0.0, 'a': 0.5, 'f': 0.0, 'b': 0.25, 'c': 0.25}

TERM FREQUENCY OF SENTENCE2:
 {'e': 0.14285714285714285, 'd': 0.14285714285714285, 'a': 0.14285714285714285, 'f': 0.14285714285714285, 'b': 0.0, 'c': 0.42857142857142855}

TERM FREQUENCY OF SENTENCE3:
 {'e': 0.0, 'd': 0.6, 'a': 0.2, 'f': 0.0, 'b': 0.0, 'c': 0.2}

TERM FREQUENCY OF SENTENCE4:
 {'e': 0.0, 'd': 0.3333333333333333, 'a': 0.3333333333333333, 'f': 0.3333333333333333, 'b': 0.0, 'c': 0.0}


In [None]:
# Compute one idf value per vocabulary word from the four per-sentence count dicts.
idfs = computeIDF([wordDictA, wordDictB, wordDictC, wordDictD])

# Bare expression so the notebook displays the resulting dict.
idfs

{'a': 0.6020599913279624,
 'b': 0.6020599913279624,
 'c': 0.6020599913279624,
 'd': 0.6020599913279624,
 'e': 0.6020599913279624,
 'f': 0.6020599913279624}

In [None]:
# Weight each of the four sentences' term frequencies by the shared idf values.
# Despite the "idf" prefix, these dicts hold tf-idf scores.
idfFirst, idfSecond, idfThird, idfFour = [
    computeTFIDF(tfBow, idfs)
    for tfBow in (tfFirst, tfSecond, tfThird, tfFour)
]

# Show the four results separated by blank lines.
print(idfFirst, idfSecond, idfThird, idfFour, sep="\n\n")

{'e': 0.0, 'd': 0.0, 'a': 0.3010299956639812, 'f': 0.0, 'b': 0.1505149978319906, 'c': 0.1505149978319906}

{'e': 0.08600857018970891, 'd': 0.08600857018970891, 'a': 0.08600857018970891, 'f': 0.08600857018970891, 'b': 0.0, 'c': 0.2580257105691267}

{'e': 0.0, 'd': 0.3612359947967774, 'a': 0.12041199826559248, 'f': 0.0, 'b': 0.0, 'c': 0.12041199826559248}

{'e': 0.0, 'd': 0.20068666377598746, 'a': 0.20068666377598746, 'f': 0.20068666377598746, 'b': 0.0, 'c': 0.0}


In [None]:
# Collect the four per-sentence tf-idf dicts into a DataFrame: one row per
# sentence, one column per word.
# NOTE(review): the name `idf` is reused here. In the earlier manual computation
# it was a NumPy array of idf weights, and this frame actually holds tf-idf scores.
idf = pd.DataFrame([idfFirst, idfSecond, idfThird,idfFour ])
idf.head()

Unnamed: 0,e,d,a,f,b,c
0,0.0,0.0,0.30103,0.0,0.150515,0.150515
1,0.086009,0.086009,0.086009,0.086009,0.0,0.258026
2,0.0,0.361236,0.120412,0.0,0.0,0.120412
3,0.0,0.200687,0.200687,0.200687,0.0,0.0


# Bi-Gram Text Preprocessing

In [None]:
import pandas as pd

from google.colab import drive
# Mount Google Drive at /content/gdrive so files stored there can be read (Colab only).
drive.mount('/content/gdrive')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Read the whole story into memory. The context manager closes the file handle as
# soon as the text has been read instead of leaving it open for the kernel's lifetime.
# NOTE: the path is relative; it assumes the working directory is /content, where
# Drive was mounted.
with open('gdrive/My Drive/Colab Notebooks/NLP/nlp-exercise/Chopin.txt') as story_file:
    input_str = story_file.read()
print(input_str)

"The Story of An Hour"
Kate Chopin (1894)

Knowing that Mrs. Mallard was afflicted with a heart trouble, great care was taken to break to her as gently as possible the news of her husband's death.
It was her sister Josephine who told her, in broken sentences; veiled hints that revealed in half concealing. Her husband's friend Richards was there, too, near her. It was he who had been in the newspaper office when intelligence of the railroad disaster was received, with Brently Mallard's name leading the list of "killed." He had only taken the time to assure himself of its truth by a second telegram, and had hastened to forestall any less careful, less tender friend in bearing the sad message.

She did not hear the story as many women have heard the same, with a paralyzed inability to accept its significance. She wept at once, with sudden, wild abandonment, in her sister's arms. When the storm of grief had spent itself she went away to her room alone. She would have no one follow her.

Th

In [None]:
import re
#input_str = re.sub(r'\d+', '', input_str)

In [None]:
import nltk
# Download the NLTK 'stopwords' and 'wordnet' data packages
# (presumably for the stopwords corpus and WordNetLemmatizer imported in the next cell).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re
# Instantiate a WordNet lemmatizer and a Porter stemmer.
# NOTE(review): in the visible cells these are only referenced from the
# commented-out preprocess() draft below — confirm they are still needed.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 
#def preprocess(sentence):
    #sentence=str(sentence)
    #sentence = sentence.lower()
    #sentence=sentence.replace('{html}',"") 
    #rem_num = re.sub('[0-9]+', '', sentence)
    #tokenizer = RegexpTokenizer(r'\w+')
    #tokens = tokenizer.tokenize(rem_num)  
    #filtered_words = [w for w in rem_num if len(w) > 2 if not w in stopwords.words('english')]
    #stem_words=[stemmer.stem(w) for w in filtered_words]
    #lemma_words=[lemmatizer.lemmatize(w) for w in stem_words]
    #return rem_num

# = preprocess(input_str);


In [None]:
# Work on a lower-cased copy of the story so later matching is case-insensitive.
sentence = str(input_str).lower()

In [None]:
# Show the lower-cased text to check the conversion.
print(sentence)

"the story of an hour"
kate chopin (1894)

knowing that mrs. mallard was afflicted with a heart trouble, great care was taken to break to her as gently as possible the news of her husband's death.
it was her sister josephine who told her, in broken sentences; veiled hints that revealed in half concealing. her husband's friend richards was there, too, near her. it was he who had been in the newspaper office when intelligence of the railroad disaster was received, with brently mallard's name leading the list of "killed." he had only taken the time to assure himself of its truth by a second telegram, and had hastened to forestall any less careful, less tender friend in bearing the sad message.

she did not hear the story as many women have heard the same, with a paralyzed inability to accept its significance. she wept at once, with sudden, wild abandonment, in her sister's arms. when the storm of grief had spent itself she went away to her room alone. she would have no one follow her.

th

In [None]:
def remove_special_character(phrase, remove_number=False):
  """Replace runs of special characters in ``phrase`` with a single space.

  The text is stripped of leading/trailing whitespace first, so a special
  character at either end still leaves a single space in the result.

  :param phrase: the text to clean.
  :param remove_number: when True, every whitespace-delimited token that
      contains a digit (e.g. "(1894)", "abc4") is dropped and only the letters
      A-Z/a-z are kept. When False, digits are kept alongside letters.
      Previously digit-bearing tokens were removed unconditionally, which made
      this flag a no-op.
  :return: the cleaned text.

  ref: https://stackoverflow.com/a/18082370/4084039
  """
  if remove_number:
    # Raw strings keep the regex escapes (\S, \d) from being read as invalid
    # Python string escapes.
    phrase = re.sub(r"\S*\d\S*", "", phrase)
    disallowed = r"[^A-Za-z]+"
  else:
    disallowed = r"[^A-Za-z0-9]+"
  return re.sub(disallowed, " ", phrase.strip())

# Clean the lower-cased story; remove_number=True asks for digits to be removed as well.
processed_text = remove_special_character(sentence, True)
print(processed_text)

 the story of an hour kate chopin knowing that mrs mallard was afflicted with a heart trouble great care was taken to break to her as gently as possible the news of her husband s death it was her sister josephine who told her in broken sentences veiled hints that revealed in half concealing her husband s friend richards was there too near her it was he who had been in the newspaper office when intelligence of the railroad disaster was received with brently mallard s name leading the list of killed he had only taken the time to assure himself of its truth by a second telegram and had hastened to forestall any less careful less tender friend in bearing the sad message she did not hear the story as many women have heard the same with a paralyzed inability to accept its significance she wept at once with sudden wild abandonment in her sister s arms when the storm of grief had spent itself she went away to her room alone she would have no one follow her there stood facing the open window a 

In [None]:
# Delete any remaining runs of digits.
# NOTE(review): the previous cell was called with remove_number=True, which replaces
# every non-letter run with a space, so this presumably changes nothing — confirm
# and drop if redundant.
processed_text = re.sub('[0-9]+', '', processed_text)
print(processed_text)

 the story of an hour kate chopin knowing that mrs mallard was afflicted with a heart trouble great care was taken to break to her as gently as possible the news of her husband s death it was her sister josephine who told her in broken sentences veiled hints that revealed in half concealing her husband s friend richards was there too near her it was he who had been in the newspaper office when intelligence of the railroad disaster was received with brently mallard s name leading the list of killed he had only taken the time to assure himself of its truth by a second telegram and had hastened to forestall any less careful less tender friend in bearing the sad message she did not hear the story as many women have heard the same with a paralyzed inability to accept its significance she wept at once with sudden wild abandonment in her sister s arms when the storm of grief had spent itself she went away to her room alone she would have no one follow her there stood facing the open window a 

In [None]:
# Small hand-picked stop-word list (articles, conjunctions and common prepositions).
# NOTE(review): this name shadows the `stopwords` object imported from nltk.corpus
# in earlier cells; after this cell runs, `stopwords` is a plain set.
stopwords = {'a', 'an', 'and', 'as', 'at', 'for', 'from', 'in', 'into', 'of', 'on', 'or', 'the', 'to'}

# ref: https://gist.github.com/sebleier/554280

# Lower-case every whitespace-separated word and keep only the non-stop-words.
processed_text = ' '.join(
    lowered
    for lowered in map(str.lower, processed_text.split())
    if lowered not in stopwords
)
print(processed_text)

story hour kate chopin knowing that mrs mallard was afflicted with heart trouble great care was taken break her gently possible news her husband s death it was her sister josephine who told her broken sentences veiled hints that revealed half concealing her husband s friend richards was there too near her it was he who had been newspaper office when intelligence railroad disaster was received with brently mallard s name leading list killed he had only taken time assure himself its truth by second telegram had hastened forestall any less careful less tender friend bearing sad message she did not hear story many women have heard same with paralyzed inability accept its significance she wept once with sudden wild abandonment her sister s arms when storm grief had spent itself she went away her room alone she would have no one follow her there stood facing open window comfortable roomy armchair this she sank pressed down by physical exhaustion that haunted her body seemed reach her soul sh

In [None]:
# Make sure the 'punkt' tokenizer data is present (the first cell of the notebook
# already requests it, so on a full run this is a repeat).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import nltk
from nltk.util import ngrams
 
# Function to generate n-grams from sentences.
def extract_ngrams(data, num):
    """Split ``data`` into word tokens and return its n-grams as strings.

    Redefines the helper of the same name from the n-gram demo cell earlier in
    the notebook, with the same behavior.

    :param data: text to tokenize with ``nltk.word_tokenize``.
    :param num: number of tokens per n-gram.
    :return: list of n-grams, each one a space-joined string of ``num`` tokens.
    """
    token_grams = ngrams(nltk.word_tokenize(data), num)
    return list(map(' '.join, token_grams))

# Bigrams of the cleaned story text.
print(extract_ngrams(processed_text, 2))

['story hour', 'hour kate', 'kate chopin', 'chopin knowing', 'knowing that', 'that mrs', 'mrs mallard', 'mallard was', 'was afflicted', 'afflicted with', 'with heart', 'heart trouble', 'trouble great', 'great care', 'care was', 'was taken', 'taken break', 'break her', 'her gently', 'gently possible', 'possible news', 'news her', 'her husband', 'husband s', 's death', 'death it', 'it was', 'was her', 'her sister', 'sister josephine', 'josephine who', 'who told', 'told her', 'her broken', 'broken sentences', 'sentences veiled', 'veiled hints', 'hints that', 'that revealed', 'revealed half', 'half concealing', 'concealing her', 'her husband', 'husband s', 's friend', 'friend richards', 'richards was', 'was there', 'there too', 'too near', 'near her', 'her it', 'it was', 'was he', 'he who', 'who had', 'had been', 'been newspaper', 'newspaper office', 'office when', 'when intelligence', 'intelligence railroad', 'railroad disaster', 'disaster was', 'was received', 'received with', 'with bren

In [None]:
from collections import Counter
from nltk.util import ngrams

In [None]:
# Split the cleaned, stop-word-filtered text into a list of word tokens.
tokens = nltk.word_tokenize(processed_text)

In [None]:
# Show the token list to check the tokenization.
print(tokens)

['story', 'hour', 'kate', 'chopin', 'knowing', 'that', 'mrs', 'mallard', 'was', 'afflicted', 'with', 'heart', 'trouble', 'great', 'care', 'was', 'taken', 'break', 'her', 'gently', 'possible', 'news', 'her', 'husband', 's', 'death', 'it', 'was', 'her', 'sister', 'josephine', 'who', 'told', 'her', 'broken', 'sentences', 'veiled', 'hints', 'that', 'revealed', 'half', 'concealing', 'her', 'husband', 's', 'friend', 'richards', 'was', 'there', 'too', 'near', 'her', 'it', 'was', 'he', 'who', 'had', 'been', 'newspaper', 'office', 'when', 'intelligence', 'railroad', 'disaster', 'was', 'received', 'with', 'brently', 'mallard', 's', 'name', 'leading', 'list', 'killed', 'he', 'had', 'only', 'taken', 'time', 'assure', 'himself', 'its', 'truth', 'by', 'second', 'telegram', 'had', 'hastened', 'forestall', 'any', 'less', 'careful', 'less', 'tender', 'friend', 'bearing', 'sad', 'message', 'she', 'did', 'not', 'hear', 'story', 'many', 'women', 'have', 'heard', 'same', 'with', 'paralyzed', 'inability', '

In [None]:
# Keep only tokens of two or more characters. This drops single-letter leftovers
# such as the stray 's' produced when apostrophes were replaced by spaces.
filtered_words = list(filter(lambda word: len(word) > 1, tokens))

In [None]:
# Show the tokens that survived the length filter.
print(filtered_words)

['story', 'hour', 'kate', 'chopin', 'knowing', 'that', 'mrs', 'mallard', 'was', 'afflicted', 'with', 'heart', 'trouble', 'great', 'care', 'was', 'taken', 'break', 'her', 'gently', 'possible', 'news', 'her', 'husband', 'death', 'it', 'was', 'her', 'sister', 'josephine', 'who', 'told', 'her', 'broken', 'sentences', 'veiled', 'hints', 'that', 'revealed', 'half', 'concealing', 'her', 'husband', 'friend', 'richards', 'was', 'there', 'too', 'near', 'her', 'it', 'was', 'he', 'who', 'had', 'been', 'newspaper', 'office', 'when', 'intelligence', 'railroad', 'disaster', 'was', 'received', 'with', 'brently', 'mallard', 'name', 'leading', 'list', 'killed', 'he', 'had', 'only', 'taken', 'time', 'assure', 'himself', 'its', 'truth', 'by', 'second', 'telegram', 'had', 'hastened', 'forestall', 'any', 'less', 'careful', 'less', 'tender', 'friend', 'bearing', 'sad', 'message', 'she', 'did', 'not', 'hear', 'story', 'many', 'women', 'have', 'heard', 'same', 'with', 'paralyzed', 'inability', 'accept', 'its',

In [None]:
# Build every pair of adjacent tokens in the filtered word list as a (first, second) tuple.
bi_grams = list(ngrams(filtered_words, 2)) 
# Tally how often each distinct bigram occurs.
counter = Counter(bi_grams)
# Display all bigrams ordered from most to least frequent.
counter.most_common()

[(('it', 'was'), 6),
 (('she', 'was'), 5),
 (('her', 'sister'), 4),
 (('did', 'not'), 4),
 (('she', 'had'), 4),
 (('had', 'been'), 3),
 (('she', 'did'), 3),
 (('she', 'would'), 3),
 (('that', 'had'), 3),
 (('with', 'her'), 3),
 (('there', 'was'), 3),
 (('her', 'eyes'), 3),
 (('would', 'be'), 3),
 (('open', 'door'), 3),
 (('her', 'husband'), 2),
 (('brently', 'mallard'), 2),
 (('he', 'had'), 2),
 (('would', 'have'), 2),
 (('no', 'one'), 2),
 (('open', 'window'), 2),
 (('her', 'body'), 2),
 (('some', 'one'), 2),
 (('one', 'was'), 2),
 (('patches', 'blue'), 2),
 (('blue', 'sky'), 2),
 (('her', 'she'), 2),
 (('but', 'she'), 2),
 (('when', 'she'), 2),
 (('free', 'free'), 2),
 (('joy', 'that'), 2),
 (('she', 'saw'), 2),
 (('that', 'would'), 2),
 (('there', 'would'), 2),
 (('be', 'no'), 2),
 (('door', 'with'), 2),
 (('that', 'life'), 2),
 (('life', 'might'), 2),
 (('might', 'be'), 2),
 (('be', 'long'), 2),
 (('story', 'hour'), 1),
 (('hour', 'kate'), 1),
 (('kate', 'chopin'), 1),
 (('chopin',