In [1]:
# Find cluster labels using a model already trained.

import os
import re
import time

import pickle
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans

import clustools as ct
import scraptools as sct

## Scraping content from new sites

In [2]:
# Scrape new sites for content.
#
# Reads one site per line from `new_sites_file`, hands the list to
# `sct.scrape_full`, and prints a summary built from the returned report.

new_sites_file = './site_lists/02_websites.csv'
new_content_dir = './new_contents/'

# Skip blank / whitespace-only lines (e.g. a trailing newline at the end of the
# file) so that an empty string is never passed to the scraper as a "site".
with open(new_sites_file, 'r', newline = '') as f:
    SITES_LIST = [line.strip() for line in f if line.strip()]

print(f'The list contains {len(SITES_LIST)} sites.')

# `report` is indexed below with the keys "sites", "time_s", "succesful",
# "failed", "contents", "logs" and "report_name". The spelling "succesful" is
# the key this cell reads from the scraper's report, so it is kept as is.
report = sct.scrape_full(SITES_LIST, contents_dir = new_content_dir)

print('='*20 + '\n Scrapping Summary \n' +'='*20 +'\n' )
print(f'{len(report["sites"])} sites requested. \n' 
      + f'Scrapping took {report["time_s"]/60:.2f} min ({report["time_s"]:.2f} s) \n' 
      + f'{len(report["succesful"]) } SUCCESFUL. \n'
      + f'{len(report["failed"]) } FAILURES. \n\n'
      + f'Contents in: {report["contents"]} \n'
      + f'Logs in: {report["logs"]} \n'
      + f'Full report: {report["report_name"]}' 
     )



The list contains 8 sites.
Text from main page www.techint.com written to ./new_contents/www.techint.com.txt
Scrapping links from www.techint.com
20 links scrapped in 7.39 seconds. 
20 links failed.
Text from main page nexco-sa.com.ar written to ./new_contents/nexco-sa.com.ar.txt
Scrapping links from nexco-sa.com.ar
13 links scrapped in 2.96 seconds. 
0 links failed.
Text from main page murphyfinnedtubes.com written to ./new_contents/murphyfinnedtubes.com.txt
Scrapping links from murphyfinnedtubes.com
20 links scrapped in 5.47 seconds. 
0 links failed.
Error in site: https://www.venuspipes.com
Text from main page www.twmetals.com written to ./new_contents/www.twmetals.com.txt
Scrapping links from www.twmetals.com
20 links scrapped in 3.66 seconds. 
0 links failed.
Text from main page www.tpsconstructora.com.ar written to ./new_contents/www.tpsconstructora.com.ar.txt
Scrapping links from www.tpsconstructora.com.ar
20 links scrapped in 13.19 seconds. 
0 links failed.
Text from main page 

## Clustering new sites with the trained model

In [19]:
# Get new data: load the scraped text of every site into a DataFrame.

# Directory holding the scraped text files ("<site>.txt").
CONTENTS_DIR = './new_contents/'

# Texts with fewer whitespace-separated words than this are discarded.
min_words = 100

# Keep only regular ".txt" files: os.listdir also returns sub-directories and
# any other file that happens to live in the directory. Sorting makes the row
# order reproducible, since os.listdir returns entries in arbitrary order.
file_names = sorted(
    name for name in os.listdir(CONTENTS_DIR)
    if name.endswith('.txt') and os.path.isfile(os.path.join(CONTENTS_DIR, name))
)

# Read the text of each file.
# NOTE(review): files are opened with the platform default encoding, as before.
# If the scraper writes UTF-8, pass encoding='utf-8' here so non-ASCII text is
# decoded identically on every platform -- confirm against scraptools.
file_contents = []
for name in file_names:
    with open(os.path.join(CONTENTS_DIR, name), 'r') as content:
        file_contents.append(content.read())

# Store contents in a dataframe. The site name is the file name without its
# ".txt" extension (only the suffix is removed, not every '.txt' occurrence).
websites_df = pd.DataFrame({
    'site': [os.path.splitext(name)[0] for name in file_names],
    'raw_text': file_contents,
})

# Add a column with the number of whitespace-separated words of each text.
websites_df['wordcount'] = websites_df['raw_text'].apply(lambda mytext: len(mytext.split()))

# Drop short or empty texts. A filtered copy (rather than an in-place drop)
# keeps this cell idempotent; the original index labels are preserved, and
# .copy() lets later cells add columns without a SettingWithCopy warning.
websites_df = websites_df[websites_df['wordcount'] >= min_words].copy()

In [20]:
# Inspect the sites that remain after dropping texts shorter than `min_words`.
websites_df

Unnamed: 0,site,raw_text,wordcount
0,murphyfinnedtubes.com,Murphy Finned Tubes | Fin Tubes | Finned Coils...,18871
1,nexco-sa.com.ar,NEXCO | Exportación de Miel y Cera de Abejas\n...,4152
2,www.lomanegra.com.ar,"Loma Negra, Líder en la producción y comercial...",15112
3,www.techint.com,Home\nes\nen\npt\nMenú\nCerrar\nQuiénes somos\...,827
4,www.tpsconstructora.com.ar,TPS – Constructora – Empresa Constructora\nEdu...,9380
5,www.twmetals.com,Specialty Metals Suppliers - Industrial Metal ...,34724
6,www.ypf.com,"Naftas, Gas y Petróleo | YPF\nLa ubicación de ...",34484


In [21]:
# Load models: the fitted vectorizer, dimensionality reducer and KMeans model
# that were pickled by a previous training run.

model_directories = './cluster_results/'

# Build the paths with os.path.join so no duplicated separator is produced
# (the KMeans path used to be './cluster_results//2021...').
vectorizer_file = os.path.join(model_directories, '20210126_1557_fitted_tfidf.pickle')
reducer_file = os.path.join(model_directories, '20210126_1557_fitted_TSVD.pickle')
KMeans_file = os.path.join(model_directories, '20210126_1557_fitted_kmeans_model.pickle')


def _load_pickle(path):
    """Return the object unpickled from `path`, closing the file afterwards.

    SECURITY: pickle.load can execute arbitrary code while deserialising, so
    only load pickle files that come from a trusted source.
    """
    with open(path, 'rb') as pickled:
        return pickle.load(pickled)


KMeans_model = _load_pickle(KMeans_file)
vectorizer = _load_pickle(vectorizer_file)
reducer = _load_pickle(reducer_file)

In [22]:
# vectorization
# Transform the raw site texts into a document-term matrix with the vectorizer
# loaded from `vectorizer_file`. Only `transform` is called, so the vectorizer
# is not re-fitted on the new texts.

dtm = vectorizer.transform(websites_df['raw_text'])

  'stop_words.' % sorted(inconsistent))


In [23]:
# dimensional reduction
# Map the document-term matrix through the reducer loaded from `reducer_file`.
# Only `transform` is called, so the reducer is not re-fitted on the new data.

reduced_dtm = reducer.transform(dtm)

In [24]:
# Sanity check: shape of the matrix returned by reducer.transform.
reduced_dtm.shape

(7, 7)

In [25]:
# Predict a cluster for every row of the reduced matrix with the loaded KMeans
# model and store it in a new 'cluster_label' column (mutates websites_df).
websites_df['cluster_label'] = KMeans_model.predict(reduced_dtm)

In [27]:
# Display the sites together with their 'cluster_label' column.
websites_df

Unnamed: 0,site,raw_text,wordcount,cluster_label
0,murphyfinnedtubes.com,Murphy Finned Tubes | Fin Tubes | Finned Coils...,18871,0
1,nexco-sa.com.ar,NEXCO | Exportación de Miel y Cera de Abejas\n...,4152,5
2,www.lomanegra.com.ar,"Loma Negra, Líder en la producción y comercial...",15112,5
3,www.techint.com,Home\nes\nen\npt\nMenú\nCerrar\nQuiénes somos\...,827,5
4,www.tpsconstructora.com.ar,TPS – Constructora – Empresa Constructora\nEdu...,9380,5
5,www.twmetals.com,Specialty Metals Suppliers - Industrial Metal ...,34724,0
6,www.ypf.com,"Naftas, Gas y Petróleo | YPF\nLa ubicación de ...",34484,5


In [26]:
# Write the site -> cluster assignment to a timestamped CSV, ordered by cluster.

results_dir = './cluster_results/'

# File-name prefix: current local time, minute resolution (YYYYmmdd_HHMM).
timestr = time.strftime("%Y%m%d_%H%M")

clustered_sites_file = f'{results_dir}{timestr}_clustered_sites.csv'

# Only the 'site' and 'cluster_label' columns are written; the DataFrame index
# is written as well (to_csv default).
(
    websites_df
    .sort_values('cluster_label')
    .to_csv(clustered_sites_file, columns=['site', 'cluster_label'])
)