In [2]:
import nltk

In [3]:
# Sample text used throughout the notebook: a passage about the Monetary Authority of Singapore (MAS).
paragraph = """The Monetary Authority of Singapore(MAS) came into being 50 years ago. It was conceived as a dedicated organisation to carry out the specialised functions of central banking and financial regulation. MAS’ responsibilities have expanded considerably since its inception, but its policy objectives remain the same. Low inflation, in support of sustained economic growth. Healthy official foreign reserves, a sound financial sector, resilient against shocks and a vibrant international financial centre, adding value and creating jobs. MAS has played a major role in Singapore’s economic and financial development. Its monetary policies and management of official reserves have sustained macroeconomic stability and confidence in Singapore. Its regulation and supervision of financial institutions have created a safe and internationally trusted financial system and its development strategies have made the financial sector an engine of growth, attracting investments and creating many good jobs. Key to this was MAS’ combination of caution and creativity: It adhered to sound economic principles, while creatively adapting policy frameworks to suit Singapore’s context. It set high regulatory and supervisory standards, while taking a facilitative and risk-proportionate approach. It ensured financial stability, while promoting innovation and seizing opportunities. MAS did not achieve this alone. It shares credit for the success with the broader Singapore system within which MAS operates. It enjoys the political stability, healthy public finances, good public administration, and the rule of law. These all enabled MAS to pursue its objectives, single-mindedly and professionally. MAS also collaborated closely with the private sector, tapping on industry expertise, listening carefully to feedback, co-creating infrastructure and solutions, and jointly promoting the development of financial markets."""

## Bag of Words

In [16]:
# Set up the text-processing tools and split the paragraph into sentences.
# (No cleaning happens in this cell; it only prepares the inputs for it.)
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Two alternative token normalisers; only one is needed for a given corpus build.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# NOTE(review): sent_tokenize, stopwords and WordNetLemmatizer presumably need their
# NLTK data packages installed locally (via nltk.download) — confirm on a fresh environment.
sentences = nltk.sent_tokenize(paragraph)
# Preview the first five sentences.
sentences[:5]

['The Monetary Authority of Singapore(MAS) came into being 50 years ago.',
 'It was conceived as a dedicated organisation to carry out the specialised functions of central banking and financial regulation.',
 'MAS’ responsibilities have expanded considerably since its inception, but its policy objectives remain the same.',
 'Low inflation, in support of sustained economic growth.',
 'Healthy official foreign reserves, a sound financial sector, resilient against shocks and a vibrant international financial centre, adding value and creating jobs.']

In [20]:
# Build the cleaned corpus: one normalised, space-joined string per sentence.
# Tokens are lemmatized here; use stemmer.stem(t) instead to stem them.
# NOTE: lowercasing followed by lemmatization turns the acronym "MAS" into "ma".
corpus = []

# Build the stopword set once; it does not change between tokens or sentences.
english_stopwords = set(stopwords.words('english'))

for sentence in sentences:
    # Replace every non-letter character (digits, punctuation, curly quotes) with a space.
    letters_only = re.sub('[^a-zA-Z]', ' ', sentence)
    tokens = letters_only.lower().split()
    # Drop stopwords first, then reduce each remaining token to its lemma.
    lemmas = [lemmatizer.lemmatize(t) for t in tokens if t not in english_stopwords]
    corpus.append(' '.join(lemmas))

# Preview the first five cleaned sentences.
corpus[:5]

['monetary authority singapore ma came year ago',
 'conceived dedicated organisation carry specialised function central banking financial regulation',
 'ma responsibility expanded considerably since inception policy objective remain',
 'low inflation support sustained economic growth',
 'healthy official foreign reserve sound financial sector resilient shock vibrant international financial centre adding value creating job']

In [30]:
# Bag-of-words features: learn the vocabulary from the cleaned corpus and
# count how often each vocabulary term occurs in each sentence.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
term_counts = cv.fit_transform(corpus)
# Convert the vectorizer's result to a dense array for inspection.
X = term_counts.toarray()
# Preview the count vectors of the first two sentences.
X[:2]

array([[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])