In [1]:
import spacy

In [2]:
nlp = spacy.load("en_core_web_lg")

By using a user input

In [3]:
sentence = input("Enter a sentence.")

In [4]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [5]:
text_tokens = word_tokenize(sentence)

In [11]:
sw = [word for word in text_tokens if not word in stopwords.words()]

print(sw)

['spaCy', 'open-source', 'software', 'library', 'advanced', 'natural', 'language', 'processing', ',', 'written', 'programming', 'languages', 'Python', 'Cython', "'", ',', "'", 'The', 'library', 'published', 'the\\nMIT', 'license', 'developers', 'Matthew', 'Honnibal', 'Ines', 'Montani', ',', 'founders', 'ofthe', 'software', 'company', 'Explosion']


In [7]:
doc = nlp(sentence)
print(doc.ents)


(Python, Cython, Matthew Honnibal, Ines Montani)


By using a file


In [8]:
with open("nlp.txt", "r") as f:
    sentences = [line for line in f.read().split(".") if line != ""]

In [9]:
print(sentences)

['spaCy is an open-source software library for advanced natural language processing,written in the programming languages Python and Cython', ' The library is published under the\nMIT license and its main developers are Matthew Honnibal and Ines Montani, the founders ofthe software company Explosion']


In [10]:
for i in sentences:
    doc = nlp(i)
    print(doc.ents)

(Python, Cython)
(MIT, Matthew Honnibal, Ines Montani)


# By using NLTK

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hariprasath\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hariprasath\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hariprasath\AppData\Roaming\nltk_data...


True

In [3]:
def extract_keywords(text):
    
    tokens = word_tokenize(text)
    
    
    stop_words = set(stopwords.words('english'))
    tokens = [t.lower() for t in tokens if t.lower() not in stop_words and t.isalpha()]
    
    
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    
    
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(tokens)
    scores = zip(vectorizer.get_feature_names(), tfidf.sum(axis=0).tolist()[0])
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    
    
    return [s[0] for s in sorted_scores][:10]

In [4]:
text = input("Enter a sentence.")

In [5]:
keywords = extract_keywords(text)
print(keywords)

['language', 'library', 'software', 'advanced', 'company', 'cython', 'developer', 'explosion', 'founder', 'honnibal']




# Nltk_RAKE

In [6]:
from rake_nltk import Rake

In [7]:
rake_nltk_var = Rake()

In [8]:
sentence = input("Enter a sentence.")

In [10]:
rake_nltk_var.extract_keywords_from_text(text)
keyword_extracted = rake_nltk_var.get_ranked_phrases()
for ele in keyword_extracted:
    print(ele)

founders ofthe software company explosion
advanced natural language processing
source software library
programming languages python
mit license
matthew honnibal
main developers
ines montani
library
written
spacy
published
open
cython
