# Crawling Data

import scrapy


class QuotesSpider(scrapy.Spider):
    """Spider that collects the latest-news entries from the sindonews "ekbis" front page."""

    name = "quotes"

    def start_requests(self):
        """Yield one request per start URL, using ``parse`` as the callback."""
        for target in ['https://ekbis.sindonews.com/']:
            yield scrapy.Request(url=target, callback=self.parse)

    def parse(self, response):
        """Yield one item per (slot, news list) pair found in ``response``.

        Slots are the ``latest-track-0`` .. ``latest-track-29`` list entries.
        Each item is a dict with the keys ``judul``, ``waktu``, ``category``
        and ``isi``; every value is whatever ``.extract()`` returns for the
        matching text selector.
        """
        list_css = 'body > div:nth-child(5) > section > div.grid_24 > div.homelist-new > ul'
        for slot in range(30):
            # All four fields sit under the same per-slot box, so the shared
            # selector prefix is built once per slot.
            box_css = 'li.latest-event.latest-track-{} > div.homelist-box'.format(slot)
            for news_list in response.css(list_css):
                yield {
                    'judul': news_list.css(box_css + ' > div.homelist-title > a::text').extract(),
                    'waktu': news_list.css(box_css + ' > div.homelist-top > div.homelist-date::text').extract(),
                    'category': news_list.css(box_css + ' > div.homelist-top > div.homelist-channel::text').extract(),
                    'isi': news_list.css(box_css + ' > div.homelist-desc::text').extract(),
                }
                


# Import Module

In [1]:
# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
#configure
# sets matplotlib to inline and displays graphs below the corresponding cell.
%matplotlib inline  
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

#import nltk
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize

#preprocessing
from nltk.corpus import stopwords  #stopwords
from nltk import word_tokenize,sent_tokenize # tokenizing
from nltk.stem import PorterStemmer,LancasterStemmer  # using the Porter Stemmer and Lancaster Stemmer and others
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet

# for named entity recognition (NER)
from nltk import ne_chunk

# vectorizers for creating the document-term-matrix (DTM)
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

#stop-words
stop_words=set(nltk.corpus.stopwords.words('indonesian'))

ModuleNotFoundError: No module named 'pandas'

# Load Dataset

In [135]:
df=pd.read_csv('wrapping-text.csv')

In [136]:
df.head()

Unnamed: 0,judul,waktu,category,isi
0,Wapres Ungkap Data Amerika Soal Ketahanan Pang...,"Selasa, 22 Maret 2022 - 17:14 WIB",Sektor Riil,"Wakil Presiden Maruf Amin mengungkapkan, berda..."
1,"Lebih Murah, YLKI Khawatir Konsumen Bakal Migr...","Selasa, 22 Maret 2022 - 16:48 WIB",Sektor Riil,Ketua Yayasan Lembaga Konsumen Indonesia (YLKI...
2,Kemenkeu Siapkan 186 Pasal yang Mengatur Penda...,"Selasa, 22 Maret 2022 - 16:24 WIB",Makro,Kementerian Keuangan tengah menyiapkan 186 pas...
3,"Menghijau, IHSG Hari Ini Ditutup Tembus Level ...","Selasa, 22 Maret 2022 - 15:37 WIB",Kurs & Saham,Indeks Harga Saham Gabungan (IHSG) hari ini me...
4,"Tumbuh Berlipat dari PDB, Ekonomi Digital Jadi...","Selasa, 22 Maret 2022 - 15:29 WIB",Makro,Laju pertumbuhan ekonomi digital Indonesia dip...


# We will drop the 'judul', 'waktu' and 'category' columns, as only the article text in 'isi' is used in the rest of this discussion.

In [137]:
# drop the headline column ('judul'); it is not used after this point.
df.drop(['judul'],axis=1,inplace=True)

In [138]:
df.drop(['waktu'],axis=1,inplace=True)

In [139]:
df.drop(['category'],axis=1,inplace=True)

In [140]:
df.head(30)

Unnamed: 0,isi
0,"Wakil Presiden Maruf Amin mengungkapkan, berda..."
1,Ketua Yayasan Lembaga Konsumen Indonesia (YLKI...
2,Kementerian Keuangan tengah menyiapkan 186 pas...
3,Indeks Harga Saham Gabungan (IHSG) hari ini me...
4,Laju pertumbuhan ekonomi digital Indonesia dip...
5,Menyusul dicabutnya aturan Harga Eceran Tertin...
6,Menteri Keuangan (Menkeu) Sri Mulyani menyatak...
7,Menteri Keuangan (Menkeu) Sri Mulyani merespon...
8,Harga minyak mentah atau crude oil mengalami k...
9,Menko Airlangga mengakui pelaku UMKM menjadi c...


# Clean Data & Preprocessing Data

#### Here I have done the data pre-processing. I have used the lemmatizer; a stemmer could be used instead. Stop words are removed, along with words of 3 characters or fewer, to reduce some stray words.

In [141]:
import string 
import re #regex library

# import word_tokenize & FreqDist from NLTK
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist

#remove number
def remove_number(text):
    """Return ``text`` with every run of digits deleted.

    Surrounding whitespace is left untouched, so a number that stood between
    two spaces leaves both spaces behind.
    """
    digit_runs = re.compile(r"\d+")
    return digit_runs.sub("", text)

df['hapus angka'] = df['isi'].apply(remove_number)
df.head(10)

Unnamed: 0,isi,hapus angka
0,"Wakil Presiden Maruf Amin mengungkapkan, berda...","Wakil Presiden Maruf Amin mengungkapkan, berda..."
1,Ketua Yayasan Lembaga Konsumen Indonesia (YLKI...,Ketua Yayasan Lembaga Konsumen Indonesia (YLKI...
2,Kementerian Keuangan tengah menyiapkan 186 pas...,Kementerian Keuangan tengah menyiapkan pasal ...
3,Indeks Harga Saham Gabungan (IHSG) hari ini me...,Indeks Harga Saham Gabungan (IHSG) hari ini me...
4,Laju pertumbuhan ekonomi digital Indonesia dip...,Laju pertumbuhan ekonomi digital Indonesia dip...
5,Menyusul dicabutnya aturan Harga Eceran Tertin...,Menyusul dicabutnya aturan Harga Eceran Tertin...
6,Menteri Keuangan (Menkeu) Sri Mulyani menyatak...,Menteri Keuangan (Menkeu) Sri Mulyani menyatak...
7,Menteri Keuangan (Menkeu) Sri Mulyani merespon...,Menteri Keuangan (Menkeu) Sri Mulyani merespon...
8,Harga minyak mentah atau crude oil mengalami k...,Harga minyak mentah atau crude oil mengalami k...
9,Menko Airlangga mengakui pelaku UMKM menjadi c...,Menko Airlangga mengakui pelaku UMKM menjadi c...


In [142]:
def clean_text(headline):
    """Tokenize ``headline``, drop stop words and short tokens, then lemmatize.

    Args:
        headline: Text to clean.

    Returns:
        The surviving tokens joined by single spaces. Tokens keep their
        original casing.
    """
    le = WordNetLemmatizer()
    word_tokens = word_tokenize(headline)
    # The NLTK stop-word list is lowercase, so compare the lowercased token;
    # otherwise capitalised stop words (e.g. at the start of a sentence)
    # slip through the filter.
    # NOTE(review): WordNetLemmatizer is backed by the English WordNet, so it
    # presumably leaves most Indonesian tokens unchanged - confirm whether an
    # Indonesian stemmer is wanted here.
    tokens = [
        le.lemmatize(w)
        for w in word_tokens
        if len(w) > 3 and w.lower() not in stop_words
    ]
    return " ".join(tokens)
  

In [143]:
# time taking
#nltk.download('wordnet')
df['clean_text_isi']=df['hapus angka'].apply(clean_text)

In [144]:
df.head()

Unnamed: 0,isi,hapus angka,clean_text_isi
0,"Wakil Presiden Maruf Amin mengungkapkan, berda...","Wakil Presiden Maruf Amin mengungkapkan, berda...",Wakil Presiden Maruf Amin berdasarkan data Ame...
1,Ketua Yayasan Lembaga Konsumen Indonesia (YLKI...,Ketua Yayasan Lembaga Konsumen Indonesia (YLKI...,Ketua Yayasan Lembaga Konsumen Indonesia YLKI ...
2,Kementerian Keuangan tengah menyiapkan 186 pas...,Kementerian Keuangan tengah menyiapkan pasal ...,Kementerian Keuangan pasal aturan turunan Ranc...
3,Indeks Harga Saham Gabungan (IHSG) hari ini me...,Indeks Harga Saham Gabungan (IHSG) hari ini me...,Indeks Harga Saham Gabungan IHSG mendarat zona...
4,Laju pertumbuhan ekonomi digital Indonesia dip...,Laju pertumbuhan ekonomi digital Indonesia dip...,Laju pertumbuhan ekonomi digital Indonesia dip...


#### We can see the difference after removal of stop words and some shorter words. Also, the words have been lemmatized, as in 'calls' ---> 'call'.

#### Now drop the unpre-processed column.

In [145]:
df.drop(['isi'],axis=1,inplace=True)

In [146]:
df.drop(['hapus angka'],axis=1,inplace=True)

In [147]:
df.head()

Unnamed: 0,clean_text_isi
0,Wakil Presiden Maruf Amin berdasarkan data Ame...
1,Ketua Yayasan Lembaga Konsumen Indonesia YLKI ...
2,Kementerian Keuangan pasal aturan turunan Ranc...
3,Indeks Harga Saham Gabungan IHSG mendarat zona...
4,Laju pertumbuhan ekonomi digital Indonesia dip...


#### We can also see any particular news headline.

In [148]:
df['clean_text_isi'][0]

'Wakil Presiden Maruf Amin berdasarkan data Amerika Serikat kemampuan bertahan cadangan pangan Indonesia'

### EXTRACTING THE FEATURES AND CREATING THE DOCUMENT-TERM-MATRIX ( DTM )
In DTM the values are the TFidf values.

Also I have specified some parameters of the Tfidf vectorizer.

Some important points:-

1) LSA is generally implemented with Tfidf values everywhere and not with the Count Vectorizer.

2) max_features depends on your computing power and also on the evaluation metric (coherence score is a metric for topic models). Try the value that gives the best evaluation metric without exceeding your processing power.

3) Default values for min_df & max_df worked well.

4) Can try different values for ngram_range.

In [149]:
# scikit-learn (1.2+) validates `stop_words` as 'english', a list or None, so
# the stop-word set is converted to a (sorted, hence deterministic) list.
# Parameters to play with: min_df, max_df, max_features, etc.
vect = TfidfVectorizer(stop_words=sorted(stop_words), max_features=1000)

In [150]:
vect_text=vect.fit_transform(df['clean_text_isi'])



#### We can now see the most frequent and rare words in the news headlines based on their idf score. The lower the value, the more common the word is in the news headlines.

In [151]:
print(vect_text.shape)
print(vect_text)

(30, 274)
  (0, 93)	0.2165271149567325
  (0, 192)	0.2867690431796365
  (0, 39)	0.2867690431796365
  (0, 30)	0.2867690431796365
  (0, 113)	0.2867690431796365
  (0, 241)	0.2867690431796365
  (0, 6)	0.2867690431796365
  (0, 47)	0.2867690431796365
  (0, 25)	0.25568649478108163
  (0, 7)	0.2867690431796365
  (0, 140)	0.2867690431796365
  (0, 226)	0.2867690431796365
  (0, 270)	0.25568649478108163
  (1, 130)	0.26858557741510825
  (1, 115)	0.21881885803291912
  (1, 46)	0.23947391279261487
  (1, 84)	0.17863945218504915
  (1, 176)	0.15303083349122784
  (1, 86)	0.13994047402823656
  (1, 239)	0.26858557741510825
  (1, 59)	0.26858557741510825
  (1, 160)	0.26858557741510825
  (1, 0)	0.26858557741510825
  (1, 260)	0.26858557741510825
  (1, 272)	0.26858557741510825
  :	:
  (27, 235)	0.2005516098159421
  (28, 27)	0.37180473370818173
  (28, 91)	0.37180473370818173
  (28, 109)	0.37180473370818173
  (28, 190)	0.37180473370818173
  (28, 154)	0.37180473370818173
  (28, 202)	0.37180473370818173
  (28, 12)	0.3

In [152]:
idf=vect.idf_

# Topik Modelling

# Latent Semantic Analysis (LSA)
The first approach that I have used is the LSA. LSA is basically singular value decomposition.

SVD decomposes the original DTM $A$ into three matrices: $A = U \Sigma V^{T}$. Here the matrix $U$ is the document-topic matrix, while $V$ is the term-topic matrix (so $V^{T}$ is the topic-term matrix).

Each row of the matrix U (the document-topic matrix) is the vector representation of the corresponding document. The length of these vectors is the number of desired topics. Vector representation for the terms in our data can be found in the matrix V (term-topic matrix).

So, SVD gives us vectors for every document and term in our data. The length of each vector would be k. We can then use these vectors to find similar words and similar documents using the cosine similarity method.

We can use the truncatedSVD function to implement LSA. The n_components parameter is the number of topics we wish to extract. The model is then fit and transformed on the result given by vectorizer.

Lastly, note that LSA and LSI (I for indexing) are the same; the latter name is just sometimes used in information retrieval contexts.

In [153]:
from sklearn.decomposition import TruncatedSVD

# LSA via truncated SVD on the tf-idf matrix; 10 components, i.e. 10 topics.
lsa_model = TruncatedSVD(
    n_components=10,
    algorithm='randomized',
    n_iter=10,
    random_state=42,
)

# Fit on the document-term matrix and get one topic vector per document.
lsa_top = lsa_model.fit_transform(vect_text)

In [154]:
print(lsa_top)
print(lsa_top.shape)  # (no_of_doc*no_of_topics)

[[ 3.83260714e-02 -5.39978479e-02  2.40360697e-02  1.00249014e-01
   2.68847273e-02  2.81430783e-01 -2.08211071e-01 -2.65070950e-01
  -3.94853124e-01 -2.01902284e-02]
 [ 3.13807437e-01 -2.88088913e-01 -1.27088864e-01  6.96156174e-02
  -5.60164484e-02  1.54771868e-01 -1.23260783e-01 -8.79640144e-03
  -1.65713695e-01  2.06284672e-01]
 [ 4.56680032e-02 -4.83473829e-02  1.64198937e-01 -6.65621485e-02
   2.22524491e-01 -2.08648032e-01  4.27046769e-01 -3.02256368e-01
  -2.40713789e-01 -3.80016768e-02]
 [ 4.90902975e-01  5.08784858e-01 -4.41473538e-02  6.63825380e-02
   4.77093086e-02 -1.91166002e-01 -1.52399938e-01 -1.82127740e-01
   4.88693001e-02  6.36297219e-02]
 [ 5.11915043e-02 -1.54979353e-02  4.65640229e-02  5.46168223e-02
   7.12376162e-02  5.87386608e-01  6.20259322e-02 -3.76490239e-01
   1.22688958e-01  1.33280716e-01]
 [ 5.05662012e-01 -4.12831390e-01 -2.25626775e-01  9.72291718e-03
   1.84976939e-01 -1.65559591e-01  1.56010618e-01  2.88293680e-03
   1.83474259e-01 -1.19641133e-01

In [155]:
# Print the weight of every LSA topic for the first document, scaled by 100.
l = lsa_top[0]
print("Document 0 :")
for topic_idx, weight in enumerate(l):
    print("Topic ", topic_idx, " : ", weight * 100)
  

Document 0 :
Topic  0  :  3.832607135522223
Topic  1  :  -5.399784792009233
Topic  2  :  2.4036069728375615
Topic  3  :  10.02490137123063
Topic  4  :  2.6884727308327987
Topic  5  :  28.143078331853577
Topic  6  :  -20.821107122012727
Topic  7  :  -26.507095037045815
Topic  8  :  -39.48531238617555
Topic  9  :  -2.0190228364342837


#### Similarly, we can do this for the other documents. However, note that the values don't add up to 1, because in LSA they are not the probability of a topic in a document.

In [156]:
print(lsa_model.components_.shape) # (no_of_topics*no_of_words)
print(lsa_model.components_)

(10, 274)
[[ 0.03427003  0.00107465  0.0115964  ...  0.03427003  0.03427003
   0.05930706]
 [-0.03738414 -0.00014554  0.01246267 ... -0.03738414 -0.03738414
   0.07304079]
 [-0.02081739  0.0091207  -0.00492329 ... -0.02081739 -0.02081739
  -0.00798644]
 ...
 [-0.00176988 -0.03145656 -0.05417906 ... -0.00176988 -0.00176988
  -0.0499675 ]
 [-0.04153069  0.05615773  0.1105923  ... -0.04153069 -0.04153069
   0.0129145 ]
 [ 0.05383337  0.04312126 -0.04147381 ...  0.05383337  0.05383337
   0.01737356]]


#### Now we can get a list of the important words for each of the 10 topics as shown. For simplicity, I have shown 10 words for each topic here.

In [157]:
# most important words for each topic
# `get_feature_names()` was removed in scikit-learn 1.2; use
# `get_feature_names_out()` when available and fall back on older releases.
# The result is converted to a list so `vocab` keeps its original type.
if hasattr(vect, "get_feature_names_out"):
    vocab = list(vect.get_feature_names_out())
else:
    vocab = vect.get_feature_names()

for i, comp in enumerate(lsa_model.components_):
    # Pair each vocabulary term with its weight in this topic and keep the
    # 10 terms with the largest weights.
    vocab_comp = zip(vocab, comp)
    sorted_words = sorted(vocab_comp, key=lambda x: x[1], reverse=True)[:10]
    print("Topic " + str(i) + ": ")
    for t in sorted_words:
        print(t[0], end=" ")
    print("\n")

Topic 0: 
harga minyak goreng perdagangan indeks ihsg selasa gabungan saham sesi 

Topic 1: 
perdagangan indeks ihsg gabungan saham menguat selasa sesi poin level 

Topic 2: 
perusahaan keuangan bumn kenaikan april menteri dunia menkeu mulyani jasa 

Topic 3: 
perusahaan bumn jasa modal otoritas pasar usaha berdasarkan goreng mencapai 

Topic 4: 
kota pembangunan aturan negara dimana dikebut dirampungkan ditargetkan nusantara perpres 

Topic 5: 
pertumbuhan mencapai ekonomi indonesia laju digital nasional pekan terpantau bruto 

Topic 6: 
undang mudik terkait kementerian pekan terpantau aturan pasal pelaksanaan peraturan 

Topic 7: 
pekan terpantau menguat batangan emas harganya intip melemah rincian tambang 

Topic 8: 
goto saham menyusul nasional airlines average boeing china dipicu eastern 

Topic 9: 
masyarakat curah bahan bakar buying diimbau hemat kebutuhan khawatir panic 



