In [2]:
import io
 
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage

def extract_text_from_pdf(pdf_path):
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)
 
    with open(pdf_path, 'rb') as fh:
        for page in PDFPage.get_pages(fh, 
                                      caching=True,
                                      check_extractable=True):
            page_interpreter.process_page(page)
 
        text = fake_file_handle.getvalue()
    #print(text)
    # close open handles
    converter.close()
    fake_file_handle.close()
 
    if text:
        return text

In [3]:
text1=extract_text_from_pdf('word.pdf')
text1

' 1. What is Data? Data is the information which is stored by a computer. This data can be of any form i.e., text documents, images, audios, videos etc. This data can be processessed from one computer to another computer or devices using internet.    2. How are Data generating? Data generating is the process of creating data from the sampled collected data. It is done once the data collection process is completed. It analysis the collected data and creates the processed data from it. 3. What is Big Data? Big data consists of a vast data with a voluminous storage. This collected from different data sources which is comparatively larger than the traditional data processing softwares. This vast data can be used in order to access for different purpose that could solve business problems.  \x0c'

In [4]:
import nltk
from nltk.tokenize import sent_tokenize,word_tokenize

In [5]:
text_s= nltk.sent_tokenize(text1)
text_s

[' 1.',
 'What is Data?',
 'Data is the information which is stored by a computer.',
 'This data can be of any form i.e., text documents, images, audios, videos etc.',
 'This data can be processessed from one computer to another computer or devices using internet.',
 '2.',
 'How are Data generating?',
 'Data generating is the process of creating data from the sampled collected data.',
 'It is done once the data collection process is completed.',
 'It analysis the collected data and creates the processed data from it.',
 '3.',
 'What is Big Data?',
 'Big data consists of a vast data with a voluminous storage.',
 'This collected from different data sources which is comparatively larger than the traditional data processing softwares.',
 'This vast data can be used in order to access for different purpose that could solve business problems.']

In [6]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chaithu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
import re
punctuation=re.compile(r'[-.@#%^*<>=&$?!/\,:;()|0-9]')

In [8]:
text_punc=[]
for i in text_s:
    i=punctuation.sub("",i)
    if len(i)>0:
        text_punc.append(i)

In [9]:
text_punc

[' ',
 'What is Data',
 'Data is the information which is stored by a computer',
 'This data can be of any form ie text documents images audios videos etc',
 'This data can be processessed from one computer to another computer or devices using internet',
 'How are Data generating',
 'Data generating is the process of creating data from the sampled collected data',
 'It is done once the data collection process is completed',
 'It analysis the collected data and creates the processed data from it',
 'What is Big Data',
 'Big data consists of a vast data with a voluminous storage',
 'This collected from different data sources which is comparatively larger than the traditional data processing softwares',
 'This vast data can be used in order to access for different purpose that could solve business problems']

In [10]:
text_word=[]
for i in text_punc:
    text_word.append(word_tokenize(i))

In [11]:
text_word

[[],
 ['What', 'is', 'Data'],
 ['Data',
  'is',
  'the',
  'information',
  'which',
  'is',
  'stored',
  'by',
  'a',
  'computer'],
 ['This',
  'data',
  'can',
  'be',
  'of',
  'any',
  'form',
  'ie',
  'text',
  'documents',
  'images',
  'audios',
  'videos',
  'etc'],
 ['This',
  'data',
  'can',
  'be',
  'processessed',
  'from',
  'one',
  'computer',
  'to',
  'another',
  'computer',
  'or',
  'devices',
  'using',
  'internet'],
 ['How', 'are', 'Data', 'generating'],
 ['Data',
  'generating',
  'is',
  'the',
  'process',
  'of',
  'creating',
  'data',
  'from',
  'the',
  'sampled',
  'collected',
  'data'],
 ['It',
  'is',
  'done',
  'once',
  'the',
  'data',
  'collection',
  'process',
  'is',
  'completed'],
 ['It',
  'analysis',
  'the',
  'collected',
  'data',
  'and',
  'creates',
  'the',
  'processed',
  'data',
  'from',
  'it'],
 ['What', 'is', 'Big', 'Data'],
 ['Big',
  'data',
  'consists',
  'of',
  'a',
  'vast',
  'data',
  'with',
  'a',
  'volumino

In [12]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
  
#example_sent = "This is a sample sentence, showing off the stop words filtration."
  
stop_words = set(stopwords.words('english')) 
  
#word_tokens = word_tokenize(text1) 
  
text_sw = [w for w in text_punc if not w in stop_words] 
  
text_sw= [] 
  
for w in text_punc: 
    if w not in text_sw: 
        text_sw.append(w) 
  
#print(word_tokens) 
print(text_sw) 
#len(filtered_sentence)

[' ', 'What is Data', 'Data is the information which is stored by a computer', 'This data can be of any form ie text documents images audios videos etc', 'This data can be processessed from one computer to another computer or devices using internet', 'How are Data generating', 'Data generating is the process of creating data from the sampled collected data', 'It is done once the data collection process is completed', 'It analysis the collected data and creates the processed data from it', 'What is Big Data', 'Big data consists of a vast data with a voluminous storage', 'This collected from different data sources which is comparatively larger than the traditional data processing softwares', 'This vast data can be used in order to access for different purpose that could solve business problems']


In [13]:
from nltk.stem import PorterStemmer
pst= PorterStemmer()
sub_stem=[]
for words in text_word:
    for i in words:
        sub_stem.append(i+ ":" +pst.stem(i))
sub_stem

['What:what',
 'is:is',
 'Data:data',
 'Data:data',
 'is:is',
 'the:the',
 'information:inform',
 'which:which',
 'is:is',
 'stored:store',
 'by:by',
 'a:a',
 'computer:comput',
 'This:thi',
 'data:data',
 'can:can',
 'be:be',
 'of:of',
 'any:ani',
 'form:form',
 'ie:ie',
 'text:text',
 'documents:document',
 'images:imag',
 'audios:audio',
 'videos:video',
 'etc:etc',
 'This:thi',
 'data:data',
 'can:can',
 'be:be',
 'processessed:processess',
 'from:from',
 'one:one',
 'computer:comput',
 'to:to',
 'another:anoth',
 'computer:comput',
 'or:or',
 'devices:devic',
 'using:use',
 'internet:internet',
 'How:how',
 'are:are',
 'Data:data',
 'generating:gener',
 'Data:data',
 'generating:gener',
 'is:is',
 'the:the',
 'process:process',
 'of:of',
 'creating:creat',
 'data:data',
 'from:from',
 'the:the',
 'sampled:sampl',
 'collected:collect',
 'data:data',
 'It:It',
 'is:is',
 'done:done',
 'once:onc',
 'the:the',
 'data:data',
 'collection:collect',
 'process:process',
 'is:is',
 'comple

In [14]:
from nltk.stem import WordNetLemmatizer
lem=WordNetLemmatizer()
lem_stem=[]
for words in text_word:
    for i in words:
        lem_stem.append(i+ ":" +lem.lemmatize(i))
lem_stem

['What:What',
 'is:is',
 'Data:Data',
 'Data:Data',
 'is:is',
 'the:the',
 'information:information',
 'which:which',
 'is:is',
 'stored:stored',
 'by:by',
 'a:a',
 'computer:computer',
 'This:This',
 'data:data',
 'can:can',
 'be:be',
 'of:of',
 'any:any',
 'form:form',
 'ie:ie',
 'text:text',
 'documents:document',
 'images:image',
 'audios:audio',
 'videos:video',
 'etc:etc',
 'This:This',
 'data:data',
 'can:can',
 'be:be',
 'processessed:processessed',
 'from:from',
 'one:one',
 'computer:computer',
 'to:to',
 'another:another',
 'computer:computer',
 'or:or',
 'devices:device',
 'using:using',
 'internet:internet',
 'How:How',
 'are:are',
 'Data:Data',
 'generating:generating',
 'Data:Data',
 'generating:generating',
 'is:is',
 'the:the',
 'process:process',
 'of:of',
 'creating:creating',
 'data:data',
 'from:from',
 'the:the',
 'sampled:sampled',
 'collected:collected',
 'data:data',
 'It:It',
 'is:is',
 'done:done',
 'once:once',
 'the:the',
 'data:data',
 'collection:collecti

In [15]:
text_low=[x.lower() for x in text_punc]
text_low

[' ',
 'what is data',
 'data is the information which is stored by a computer',
 'this data can be of any form ie text documents images audios videos etc',
 'this data can be processessed from one computer to another computer or devices using internet',
 'how are data generating',
 'data generating is the process of creating data from the sampled collected data',
 'it is done once the data collection process is completed',
 'it analysis the collected data and creates the processed data from it',
 'what is big data',
 'big data consists of a vast data with a voluminous storage',
 'this collected from different data sources which is comparatively larger than the traditional data processing softwares',
 'this vast data can be used in order to access for different purpose that could solve business problems']

In [16]:
nltk.download('stopwords')
from nltk.corpus import stopwords
sw = stopwords.words('english')
for i in range(len(text_s)):
    words = nltk.word_tokenize(text_s[i])
    words = [word for word in words if word not in stopwords.words('english')]
    text_s[i] = ' '.join(words) 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chaithu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
  
#example_sent = "This is a sample sentence, showing off the stop words filtration."
  
stop_words = set(stopwords.words('english')) 
  
#word_tokens = word_tokenize(text1) 
  
filtered_sentence = [w for w in sub_stem if not w in stop_words] 
  
filtered_sentence = [] 
  
for w in sub_stem: 
    if w not in stop_words: 
        filtered_sentence.append(w) 
  
#print(word_tokens) 
print(filtered_sentence) 
len(filtered_sentence)

['What:what', 'is:is', 'Data:data', 'Data:data', 'is:is', 'the:the', 'information:inform', 'which:which', 'is:is', 'stored:store', 'by:by', 'a:a', 'computer:comput', 'This:thi', 'data:data', 'can:can', 'be:be', 'of:of', 'any:ani', 'form:form', 'ie:ie', 'text:text', 'documents:document', 'images:imag', 'audios:audio', 'videos:video', 'etc:etc', 'This:thi', 'data:data', 'can:can', 'be:be', 'processessed:processess', 'from:from', 'one:one', 'computer:comput', 'to:to', 'another:anoth', 'computer:comput', 'or:or', 'devices:devic', 'using:use', 'internet:internet', 'How:how', 'are:are', 'Data:data', 'generating:gener', 'Data:data', 'generating:gener', 'is:is', 'the:the', 'process:process', 'of:of', 'creating:creat', 'data:data', 'from:from', 'the:the', 'sampled:sampl', 'collected:collect', 'data:data', 'It:It', 'is:is', 'done:done', 'once:onc', 'the:the', 'data:data', 'collection:collect', 'process:process', 'is:is', 'completed:complet', 'It:It', 'analysis:analysi', 'the:the', 'collected:col

130

In [18]:
from nltk.tag import DefaultTagger
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\chaithu\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [19]:
tagging = DefaultTagger('NN')

In [20]:
text_pos=[]
for words in text_word:
    words=nltk.pos_tag(words)
    text_pos.append(words)
    

In [21]:
text_pos

[[],
 [('What', 'WP'), ('is', 'VBZ'), ('Data', 'NNP')],
 [('Data', 'NNP'),
  ('is', 'VBZ'),
  ('the', 'DT'),
  ('information', 'NN'),
  ('which', 'WDT'),
  ('is', 'VBZ'),
  ('stored', 'VBN'),
  ('by', 'IN'),
  ('a', 'DT'),
  ('computer', 'NN')],
 [('This', 'DT'),
  ('data', 'NN'),
  ('can', 'MD'),
  ('be', 'VB'),
  ('of', 'IN'),
  ('any', 'DT'),
  ('form', 'NN'),
  ('ie', 'NN'),
  ('text', 'NN'),
  ('documents', 'NNS'),
  ('images', 'VBZ'),
  ('audios', 'NNS'),
  ('videos', 'NNS'),
  ('etc', 'VBP')],
 [('This', 'DT'),
  ('data', 'NN'),
  ('can', 'MD'),
  ('be', 'VB'),
  ('processessed', 'VBN'),
  ('from', 'IN'),
  ('one', 'CD'),
  ('computer', 'NN'),
  ('to', 'TO'),
  ('another', 'DT'),
  ('computer', 'NN'),
  ('or', 'CC'),
  ('devices', 'NNS'),
  ('using', 'VBG'),
  ('internet', 'NN')],
 [('How', 'WRB'), ('are', 'VBP'), ('Data', 'NNP'), ('generating', 'NN')],
 [('Data', 'NNP'),
  ('generating', 'NN'),
  ('is', 'VBZ'),
  ('the', 'DT'),
  ('process', 'NN'),
  ('of', 'IN'),
  ('creating'

In [None]:
tw1= nltk.pos_tag(text_word)


word_tags = []
for tw in tw1:
    word_tags.append(tw[0]+"_"+tw[1])

tagged_paragraph = ' '.join(tw1)

tagged_paragraph