In [1]:
import nltk
from nltk.tokenize import sent_tokenize , word_tokenize 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim.models import Word2Vec

import spacy
import pandas as pd

import string
import numpy as np
import re
import textwrap

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer

In [2]:
stopwords =  stopwords.words('english')
lem  = WordNetLemmatizer()
NUM_CLUSTERS = 10
iterations = 10

In [3]:
text = '''
Millions go missing at China bank
Two senior officials at one of China's top commercial banks have reportedly disappeared after funds
worth up to $120m (£64m) went missing.
The pair both worked at Bank of China in the northern city of Harbin, the South China Morning Post
reported. The latest scandal at Bank of China will do nothing to reassure foreign investors that China's
big four banks are ready for international listings. Government policy sees the bank listings as vital
economic reforms. Bank of China is one of two frontrunners in the race to list overseas. The other is
China Construction Bank. Both are expected to list abroad during 2005.
They shared a $45bn state bailout in 2003, to help clean up their balance sheets in preparation for a
foreign stock market debut.
However, a report in the China-published Economic Observer said on Monday that the two banks may
have scrapped plans to list in New York because of the cost of meeting regulatory requirements
imposed since the Enron scandal. Bank of China is the country's biggest foreign exchange dealer, while
China Construction Bank is the largest deposit holder. China's banking sector is burdened with at least
$190bn of bad debt according to official data, though most observers believe the true figure is far
higher. Officially, one in five loans is not being repaid. Attempts to strengthen internal controls and
tighten lending policies have uncovered a succession of scandals involving embezzlement by bank
officials and loans-for-favours. The most high-profile case involved the ex-president of Bank of China,
Wang Xuebing, jailed for 12 years in 2003. Although, he committed the offences whilst running Bank
of China in New York, Mr.Wang was head of China Construction Bank when the scandal broke. Earlier
this month, a China Construction Bank branch manager was jailed for life in a separate case.
China's banks used to act as cash offices for state enterprises and did not require checks on credit
worthiness. The introduction of market reforms has been accompanied by attempts to modernize the
banking sector, but links between banks and local government remain strong. Last year, China's
premier, Wen Jiabao, targeted bank lending practices in a series of speeches, and regulators ordered
all big loans to be scrutinized, in an attempt to cool down irresponsible lending. China's leaders see
reforming the top four banks as vital to distribute capital to profitable companies and protect the health
of China's economic boom. But two problems persist. First, inefficient state enterprises continue to
receive protection from bankruptcy because they employ large numbers of people. Second, many
questionable loans come not from the big four, but from smaller banks. Another high-profile financial
firm, China Life, is facing shareholder lawsuits and a probe by the US Securities and Exchange
Commission following its 2004 New York listing over its failure to disclose accounting irregularities
at its parent company.
'''

In [4]:
def average_word_vectors(tokens, model, vector_size):
    vector_sum = np.zeros(vector_size)
    count = 0
    for word in tokens:
        if word in model.wv:
            vector_sum += model.wv[word]
            count += 1
    if count != 0:
        return vector_sum / count
    else:
            return np.zeros(vector_size)


In [5]:
def preprocess(text):
    words = word_tokenize(text)
    words_lower = [word.lower() for word in words ]
    words_strip = [word.strip() for word in words_lower] 
    words_remove_stopwords = [word for word in words_strip if word not in stopwords]
    words_remove_punc = [word for word in words_remove_stopwords if
    word not in string.punctuation]
    words_lemmatised = [lem.lemmatize(word) for word in words_remove_punc]
    words_joined = " ".join(words_lemmatised)
    return words_joined

In [6]:
sentences = sent_tokenize(text)
sentence_df = pd.DataFrame(sentences)
sentence_df.columns = ['Sentence']
sentence_df['Clean_Sentence'] = sentence_df['Sentence'].apply(preprocess)
sentence_df

Unnamed: 0,Sentence,Clean_Sentence
0,\nMillions go missing at China bank\nTwo senio...,million go missing china bank two senior offic...
1,The pair both worked at Bank of China in the n...,pair worked bank china northern city harbin so...
2,The latest scandal at Bank of China will do no...,latest scandal bank china nothing reassure for...
3,Government policy sees the bank listings as vi...,government policy see bank listing vital econo...
4,Bank of China is one of two frontrunners in th...,bank china one two frontrunners race list over...
5,The other is\nChina Construction Bank.,china construction bank
6,Both are expected to list abroad during 2005.,expected list abroad 2005
7,"They shared a $45bn state bailout in 2003, to ...",shared 45bn state bailout 2003 help clean bala...
8,"However, a report in the China-published Econo...",however report china-published economic observ...
9,Bank of China is the country's biggest foreign...,bank china country 's biggest foreign exchange...


In [7]:
vectorizer_bow = CountVectorizer(max_features=1000)
X_bow = vectorizer_bow.fit_transform(sentence_df['Clean_Sentence']).toarray()
vectorizer_tfidf = TfidfVectorizer()
X_tfidf = vectorizer_tfidf.fit_transform(sentence_df['Clean_Sentence']).toarray()

In [None]:
cbow_model = Word2Vec(sentences=sentence_df['Clean_Sentence'].apply(lambda x: x.split()), vector_size=100, window=5, sg=0, min_count=1) 
X_cbow = np.array(sentence_df['Clean_Sentence'].apply(lambda x: average_word_vectors(x, cbow_model, 100)).tolist())
skipgram_model = Word2Vec(sentences=sentence_df['Clean_Sentence'].apply(lambda x: x.split()), vector_size=100, window=5, sg=1, min_count=1) 
X_skipgram = np.array(sentence_df['Clean_Sentence'].apply(lambda x: average_word_vectors(x, skipgram_model, 100)).tolist())
from gensim.models import KeyedVectors
word2vec_glove_path = 'glove.6B.50d.txt'
glove_model = KeyedVectors.load_word2vec_format(word2vec_glove_path, binary=False)
def average_word(tokens, model, vector_size):
    vector_sum = np.zeros(vector_size)
    count = 0
    for word in tokens:
        if word in model:
            vector_sum += model[word]
            count += 1
    if count != 0:
        return vector_sum / count
    else:
        return np.zeros(vector_size)
X_glove = np.array(sentence_df['Clean_Sentence'].apply(lambda x: average_word(x.split(), glove_model, 100)).tolist())


In [None]:
from gensim.models import FastText
model_path_bin = 'cc.en.300.bin'
gensim_model = FastText.load_fasttext_format(model_path_bin)
X_fasttext = np.array(sentence_df['Preprocessed'].apply(lambda x: np.mean([gensim_model.wv[word] for word in x.split() if word in gensim_model.wv], axis=0)).tolist())
gensim_model = FastText.load_fasttext_format(model_path_bin)

In [11]:
sentence_bow = X_bow.sum(axis=1)
sentence_df['Sentence_bow'] = sentence_bow
top_sentences_bow = sentence_df.sort_values('Sentence_bow',
ascending=False).head(5)['Sentence'].tolist()
summary_bow = ' '.join(top_sentences_bow)
print(textwrap.fill(summary_bow))

Another high-profile financial firm, China Life, is facing shareholder
lawsuits and a probe by the US Securities and Exchange Commission
following its 2004 New York listing over its failure to disclose
accounting irregularities at its parent company. However, a report in
the China-published Economic Observer said on Monday that the two
banks may have scrapped plans to list in New York because of the cost
of meeting regulatory requirements imposed since the Enron scandal.
Last year, China's premier, Wen Jiabao, targeted bank lending
practices in a series of speeches, and regulators ordered all big
loans to be scrutinized, in an attempt to cool down irresponsible
lending.  Millions go missing at China bank Two senior officials at
one of China's top commercial banks have reportedly disappeared after
funds worth up to $120m (£64m) went missing. China's banking sector is
burdened with at least $190bn of bad debt according to official data,
though most observers believe the true figure is fa

In [12]:
sentence_tfidf = X_tfidf.sum(axis=1)
sentence_df['Sentence_tfidf'] = sentence_tfidf
top_sentences_tfidf = sentence_df.sort_values('Sentence_tfidf',
ascending=False).head(5)['Sentence'].tolist()
          
summary_tfidf = ' '.join(top_sentences_tfidf)
print(textwrap.fill(summary_tfidf))


Another high-profile financial firm, China Life, is facing shareholder
lawsuits and a probe by the US Securities and Exchange Commission
following its 2004 New York listing over its failure to disclose
accounting irregularities at its parent company. However, a report in
the China-published Economic Observer said on Monday that the two
banks may have scrapped plans to list in New York because of the cost
of meeting regulatory requirements imposed since the Enron scandal.
Last year, China's premier, Wen Jiabao, targeted bank lending
practices in a series of speeches, and regulators ordered all big
loans to be scrutinized, in an attempt to cool down irresponsible
lending. China's banking sector is burdened with at least $190bn of
bad debt according to official data, though most observers believe the
true figure is far higher.  Millions go missing at China bank Two
senior officials at one of China's top commercial banks have
reportedly disappeared after funds worth up to $120m (£64m) went

In [14]:
sentence_cbow = X_cbow.sum(axis=1)
sentence_df['Sentence_cbow'] = sentence_cbow
top_sentences_cbow = sentence_df.sort_values('Sentence_cbow',
ascending=False).head(5)['Sentence'].tolist()
summary_cbow = ' '.join(top_sentences_cbow)
print(textwrap.fill(summary_cbow))


 Millions go missing at China bank Two senior officials at one of
China's top commercial banks have reportedly disappeared after funds
worth up to $120m (£64m) went missing. The pair both worked at Bank of
China in the northern city of Harbin, the South China Morning Post
reported. Second, many questionable loans come not from the big four,
but from smaller banks. First, inefficient state enterprises continue
to receive protection from bankruptcy because they employ large
numbers of people. China's leaders see reforming the top four banks as
vital to distribute capital to profitable companies and protect the
health of China's economic boom.


In [13]:
sentence_skipgram = X_skipgram.sum(axis=1)
sentence_df['Sentence_skipgram'] = sentence_skipgram
      
top_sentences_skipgram = sentence_df.sort_values('Sentence_skipgram', ascending=False).head(5)['Sentence'].tolist()
summary_skipgram = ' '.join(top_sentences_skipgram)
print(textwrap.fill(summary_skipgram))


 Millions go missing at China bank Two senior officials at one of
China's top commercial banks have reportedly disappeared after funds
worth up to $120m (£64m) went missing. The pair both worked at Bank of
China in the northern city of Harbin, the South China Morning Post
reported. Second, many questionable loans come not from the big four,
but from smaller banks. First, inefficient state enterprises continue
to receive protection from bankruptcy because they employ large
numbers of people. China's leaders see reforming the top four banks as
vital to distribute capital to profitable companies and protect the
health of China's economic boom.


In [None]:
def average_word(tokens, model, vector_size):
    vector_sum = np.zeros(vector_size)
    count = 0
    for word in tokens:
        if word in model.wv:
            vector_sum += model.wv[word]
            count += 1
    if count != 0:
        return vector_sum / count
    else:
        return np.zeros(vector_size)

X_glove = np.array(sentence_df['Clean_Sentence'].apply(lambda x: average_word(x.split(), glove_model, 100)).tolist())
sentence_glove = X_glove.sum(axis=1)
sentence_df['Sentence_glove'] = sentence_glove
top_sentences_glove = sentence_df.sort_values('Sentence_glove',
ascending=False).head(5)['Sentence'].tolist()
summary_glove = ' '.join(top_sentences_glove)
print(textwrap.fill(summary_glove))



In [None]:
sentence_fasttext = X_fasttext.sum(axis=1)
sentence_df['Sentence_fasttext'] = sentence_fasttext
top_sentences_fasttext = sentence_df.sort_values('Sentence_fasttext',
ascending=False).head(5)['Sentence'].tolist()
          
summary_fasttext = ' '.join(top_sentences_fasttext)
print(textwrap.fill(summary_fasttext))