In [1]:
import nltk

In [2]:
# Sample text used throughout the notebook, written one sentence per literal.
# Adjacent literals are concatenated by Python, so the resulting string is
# unchanged (including the double space after "profitable.").
paragraph = (
    "Long-term, sustainable agricultural development will depend on conservation-oriented natural resource management. "
    "These resources include land and soil, water, plants, and animals. "
    "Although the details of natural resource management (NRM) and their application to agricultural challenges may depend on science, it is also the case that economic and social policies have a large role in providing incentives as well as disincentives for managing natural resources in ways that are both sustainable and profitable.  "
    "Successful management also needs to be considered at multiple levels: from the human scale of the household and small farm to larger arenas such as watersheds. "
    "As evidenced in new approaches to climate change, there are aspects of natural resource management that need to be considered from a global perspective."
)

In [3]:
#cleaning the texts
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [5]:
# Clean each sentence of `paragraph` and reduce every remaining word to its
# Porter stem. The result is `corpus`, one cleaned string per sentence.
ps = PorterStemmer()
wordnet = WordNetLemmatizer()  # not used here; the lemmatization cell further down uses it
sentence = nltk.sent_tokenize(paragraph)

# Build the stop-word set once. It is loop-invariant; the previous version
# rebuilt it for every single word.
stop_words = set(stopwords.words('english'))

corpus = []
for raw_sentence in sentence:
    # Replace every character that is not an ASCII letter with a space,
    # then lowercase and split on whitespace.
    review = re.sub('[^A-Za-z]', ' ', raw_sentence)
    review = review.lower()
    review = review.split()
    # Drop stop words and stem what is left.
    review = [ps.stem(word) for word in review if word not in stop_words]
    # `review` is deliberately kept as a top-level name: a later cell displays it.
    review = ' '.join(review)
    corpus.append(review)

In [6]:
# `review` is the loop variable left over from the cleaning loop, so this
# shows only the last processed sentence.
review

'evidenc new approach climat chang aspect natur resourc manag need consid global perspect'

In [8]:
# The original sentences as produced by nltk.sent_tokenize(paragraph).
sentence

['Long-term, sustainable agricultural development will depend on conservation-oriented natural resource management.',
 'These resources include land and soil, water, plants, and animals.',
 'Although the details of natural resource management (NRM) and their application to agricultural challenges may depend on science, it is also the case that economic and social policies have a large role in providing incentives as well as disincentives for managing natural resources in ways that are both sustainable and profitable.',
 'Successful management also needs to be considered at multiple levels: from the human scale of the household and small farm to larger arenas such as watersheds.',
 'As evidenced in new approaches to climate change, there are aspects of natural resource management that need to be considered from a global perspective.']

# Stemming: corpus after Porter stemming

In [7]:
# One cleaned string per sentence, as built by the cleaning loop.
corpus

['long term sustain agricultur develop depend conserv orient natur resourc manag',
 'resourc includ land soil water plant anim',
 'although detail natur resourc manag nrm applic agricultur challeng may depend scienc also case econom social polici larg role provid incent well disincent manag natur resourc way sustain profit',
 'success manag also need consid multipl level human scale household small farm larger arena watersh',
 'evidenc new approach climat chang aspect natur resourc manag need consid global perspect']

In [10]:
# Clean each sentence of `paragraph` and lemmatize every remaining word with
# the WordNet lemmatizer. This rebuilds `corpus` from scratch, replacing any
# earlier contents.
sentence = nltk.sent_tokenize(paragraph)

# Build the stop-word set once. It is loop-invariant; the previous version
# rebuilt it for every single word.
stop_words = set(stopwords.words('english'))

corpus = []
for raw_sentence in sentence:
    # Replace every character that is not an ASCII letter with a space,
    # then lowercase and split on whitespace.
    review = re.sub('[^A-Za-z]', ' ', raw_sentence)
    review = review.lower()
    review = review.split()
    # Drop stop words and lemmatize what is left. `wordnet` is the
    # WordNetLemmatizer instance created in an earlier cell.
    review = [wordnet.lemmatize(word) for word in review if word not in stop_words]
    review = ' '.join(review)
    corpus.append(review)

In [11]:
# The sentence list is re-tokenized from the same paragraph, so it is unchanged.
sentence

['Long-term, sustainable agricultural development will depend on conservation-oriented natural resource management.',
 'These resources include land and soil, water, plants, and animals.',
 'Although the details of natural resource management (NRM) and their application to agricultural challenges may depend on science, it is also the case that economic and social policies have a large role in providing incentives as well as disincentives for managing natural resources in ways that are both sustainable and profitable.',
 'Successful management also needs to be considered at multiple levels: from the human scale of the household and small farm to larger arenas such as watersheds.',
 'As evidenced in new approaches to climate change, there are aspects of natural resource management that need to be considered from a global perspective.']

In [13]:
# Number of sentences found by the tokenizer.
len(sentence)

5

# Lemmatization: corpus after WordNet lemmatization

In [12]:
# One cleaned string per sentence, as built by the most recent cleaning loop.
corpus

['long term sustainable agricultural development depend conservation oriented natural resource management',
 'resource include land soil water plant animal',
 'although detail natural resource management nrm application agricultural challenge may depend science also case economic social policy large role providing incentive well disincentive managing natural resource way sustainable profitable',
 'successful management also need considered multiple level human scale household small farm larger arena watershed',
 'evidenced new approach climate change aspect natural resource management need considered global perspective']

In [14]:
# Bag-of-words model: count how often each vocabulary term occurs in each
# entry of `corpus`.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
# Learn the vocabulary and count terms in a single call, then convert the
# result to a dense array for inspection.
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

In [15]:
# Dense count matrix from the vectorizer: one row per entry in `corpus`.
X

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0],
       [1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0,
        0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 2, 0, 0, 1, 0, 0, 0, 1, 1,
        1, 2, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1],
       [0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
        1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 