In [14]:
from kafka import KafkaConsumer
from pyspark import SparkConf, SparkContext, SQLContext, HiveContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from os.path import abspath
import numpy as np
import re

In [2]:
#pip install findspark
import findspark
findspark.init()

In [3]:
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

schema_news = StructType([
    StructField("iso_code", StringType(), True),
    StructField("title", StringType(), True),
    StructField("description", StringType(), True),
    StructField("lang_tranlation", StringType(), True),
    StructField("title_translated", StringType(), False),
    StructField("description_translated", StringType(), True),
    StructField("pubDate", TimestampType(), False)
])

In [None]:
# Import SparkSession from pyspark.sql
from pyspark.sql import SparkSession

# Create my_spark
my_spark = SparkSession.builder.getOrCreate()

# Print my_spark
print(my_spark)

In [None]:
conf = SparkConf()
conf.setMaster("local[1]")
conf.setAppName("Consumer_Hive")
sc = SparkContext.getOrCreate(conf=conf) 
print(sc.version)

In [4]:
!hdfs dfs -ls /

Found 5 items
drwxr-xr-x   - roser supergroup          0 2025-04-10 08:51 /TFM
drwxr-xr-x   - roser supergroup          0 2025-04-09 14:46 /home
drwxr-xr-x   - roser supergroup          0 2025-04-02 12:54 /temp
drwxr-xr-x   - roser supergroup          0 2025-04-05 12:56 /tmp
drwxr-xr-x   - roser supergroup          0 2025-04-04 11:36 /user


In [4]:
!hdfs dfs -ls /TFM

Found 4 items
drwxr-xr-x   - roser supergroup          0 2025-04-09 14:46 /TFM/cleaned
drwxr-xr-x   - roser supergroup          0 2025-04-04 10:03 /TFM/news
drwxr-xr-x   - roser supergroup          0 2025-04-07 08:52 /TFM/old_news
drwxr-xr-x   - roser supergroup          0 2025-04-02 18:20 /TFM/president


In [7]:
!hdfs dfs -rm /TFM/news/news_at.csv

Deleted /TFM/news/news_at.csv


In [58]:
# Copy file from local to hdfs.
!hdfs dfs -put /home/$USER/TFM/data/pais-us.csv  /TFM
!hdfs dfs -put /home/$USER/TFM/data/paisos-ue.csv  /TFM

In [None]:
list_countries = !hdfs dfs -ls /TFM/news/ | awk '{print $NF}'

items
/TFM/news/news_at
/TFM/news/news_be
/TFM/news/news_bg
/TFM/news/news_cy
/TFM/news/news_cz
/TFM/news/news_de
/TFM/news/news_dk
/TFM/news/news_ee
/TFM/news/news_es
/TFM/news/news_fi
/TFM/news/news_fr
/TFM/news/news_gb
/TFM/news/news_gr
/TFM/news/news_hr
/TFM/news/news_hu
/TFM/news/news_ie
/TFM/news/news_it
/TFM/news/news_lt
/TFM/news/news_lu
/TFM/news/news_lv
/TFM/news/news_mt
/TFM/news/news_nl
/TFM/news/news_pl
/TFM/news/news_pt
/TFM/news/news_ro
/TFM/news/news_se
/TFM/news/news_si
/TFM/news/news_sk
/TFM/news/news_us


In [59]:
# Check if there ara corruptet block.
print("\n## Report to check for inconsistencies:\n")
!hdfs fsck /TFM/pais-us.csv


## Report to check for inconsistencies:

Connecting to namenode via http://0.0.0.0:9870/fsck?ugi=roser&path=%2FTFM%2Fpais-us.csv
FSCK started by roser (auth:SIMPLE) from /127.0.0.1 for path /TFM/pais-us.csv at Wed Apr 02 18:13:51 CEST 2025


Status: HEALTHY
 Number of data-nodes:	1
 Number of racks:		1
 Total dirs:			0
 Total symlinks:		0

Replicated Blocks:
 Total size:	108 B
 Total files:	1
 Total blocks (validated):	1 (avg. block size 108 B)
 Minimally replicated blocks:	1 (100.0 %)
 Over-replicated blocks:	0 (0.0 %)
 Under-replicated blocks:	0 (0.0 %)
 Mis-replicated blocks:		0 (0.0 %)
 Default replication factor:	1
 Average block replication:	1.0
 Missing blocks:		0
 Corrupt blocks:		0
 Missing replicas:		0 (0.0 %)
 Blocks queued for replication:	0

Erasure Coded Block Groups:
 Total size:	0 B
 Total files:	0
 Total block groups (validated):	0
 Minimally erasure-coded block groups:	0
 Over-erasure-coded block groups:	0
 Under-erasure-coded block groups:	0
 Unsatisfactory placemen

In [60]:
# Show content head.
print("\n## Sample contenet file:\n")
!hdfs dfs -cat /TFM/pais-us.csv | head -10


## Sample contenet file:

"Nom","Name","Codi ISO","Codi ISO Llengua"
"Estats Units d'Amèrica","United States of America","US","en"


In [9]:
# Iniciem SQLContext
sqlContext = SQLContext(sc)

# Carreguem el CSV en un dataframe
news_csv = sqlContext.read.csv("/TFM/pais-us.csv", header=True, inferSchema=True, sep=',')

# fem un print del count de tweets
print("\n## Loaded dataset contains %d news:\n" % news_csv.count())

# mostrem el esquema del dataframe
news_csv.printSchema()

                                                                                


## Loaded dataset contains 1 news:

root
 |-- Nom: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Codi ISO: string (nullable = true)
 |-- Codi ISO Llengua: string (nullable = true)



In [63]:
news_csv.show()

+--------------------+--------------------+--------+----------------+
|                 Nom|                Name|Codi ISO|Codi ISO Llengua|
+--------------------+--------------------+--------+----------------+
|Estats Units d'Am...|United States of ...|      US|              en|
+--------------------+--------------------+--------+----------------+



In [19]:
!hdfs dfs -put /home/$USER/TFM/data/old_news  /TFM/old_news

In [7]:
!hdfs dfs -ls /TFM/old_news

Found 651 items
-rw-r--r--   1 roser supergroup    2544177 2025-04-02 14:10 /TFM/old_news/1970_10_news_us_old.csv
-rw-r--r--   1 roser supergroup    2352088 2025-04-02 14:10 /TFM/old_news/1970_11_news_us_old.csv
-rw-r--r--   1 roser supergroup    2323784 2025-04-02 14:10 /TFM/old_news/1970_12_news_us_old.csv
-rw-r--r--   1 roser supergroup    2539819 2025-04-02 14:10 /TFM/old_news/1970_1_news_us_old.csv
-rw-r--r--   1 roser supergroup    2269022 2025-04-02 14:11 /TFM/old_news/1970_2_news_us_old.csv
-rw-r--r--   1 roser supergroup    2533818 2025-04-02 14:10 /TFM/old_news/1970_3_news_us_old.csv
-rw-r--r--   1 roser supergroup    2280036 2025-04-02 14:10 /TFM/old_news/1970_4_news_us_old.csv
-rw-r--r--   1 roser supergroup    2469840 2025-04-02 14:10 /TFM/old_news/1970_5_news_us_old.csv
-rw-r--r--   1 roser supergroup    2395103 2025-04-02 14:11 /TFM/old_news/1970_6_news_us_old.csv
-rw-r--r--   1 roser supergroup    2192998 2025-04-02 14:11 /TFM/old_news/1970_7_news_us_old.csv
-rw-r--r-- 

In [5]:
!hdfs dfs -cat /TFM/old_news/2025_3_news_us_old.csv | head -10

cat: `/TFM/old_news/2025_3_news_us_old.csv': No such file or directory


In [22]:
print(sc.version)

3.5.5


<pyspark.sql.session.SparkSession object at 0x7efdce5f22a0>


In [17]:
df=my_spark.sql("show databases")
df.show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [11]:
my_spark.sql("CREATE DATABASE IF NOT EXISTS tfm ")
print(my_spark.catalog.listDatabases())
my_spark.sql("USE tfm")

[Database(name='default', catalog='spark_catalog', description='default database', locationUri='file:/home/roser/TFM/spark-warehouse'), Database(name='tfm', catalog='spark_catalog', description='', locationUri='file:/home/roser/TFM/spark-warehouse/tfm.db')]


DataFrame[]

In [67]:
my_spark.sql("DROP TABLE IF EXISTS paisos_iso")
my_spark.sql("DROP TABLE IF EXISTS pais_iso")

# Print the tables in the catalog
print(my_spark.catalog.listTables())

[]


In [68]:
iso_us = sqlContext.read.csv("/TFM/pais-us.csv", header=True, inferSchema=True, sep=',')
iso_ue = sqlContext.read.csv("/TFM/paisos-ue.csv", header=True, inferSchema=True, sep=',')
iso_ue.write.saveAsTable("paisos_iso")
iso_us.write.mode('append').saveAsTable("paisos_iso")

In [69]:
table_ue=my_spark.sql("select * from paisos_iso")
table_ue.show(n=1000, truncate=False)

+----------------------+------------------------+--------+----------------+
|Nom                   |Name                    |Codi ISO|Codi ISO Llengua|
+----------------------+------------------------+--------+----------------+
|Alemanya              |Germany                 |DE      |de              |
|Àustria               |Austria                 |AT      |de              |
|Bèlgica               |Belgium                 |BE      |nl,fr,de        |
|Bulgària              |Bulgaria                |BG      |bg              |
|Xipre                 |Cyprus                  |CY      |el,tr           |
|Croàcia               |Croatia                 |HR      |hr              |
|Dinamarca             |Denmark                 |DK      |da              |
|Eslovàquia            |Slovakia                |SK      |sk              |
|Eslovènia             |Slovenia                |SI      |sl              |
|Espanya               |Spain                   |ES      |es              |
|Estònia    

In [12]:
print(table_ue.count())

NameError: name 'table_ue' is not defined

In [53]:
iso_us.show()

+--------------------+--------------------+--------+
|                 Nom|                Name|Codi ISO|
+--------------------+--------------------+--------+
|Estats Units d'Am...|United States of ...|      US|
+--------------------+--------------------+--------+



In [71]:
# copy presidents fom locat to hdfs
!hdfs dfs -put /home/$USER/TFM/data/president  /TFM/president

In [7]:
!hdfs dfs -ls /TFM/president | awk '{print $NF}'

items
/TFM/president/presidents-at.csv
/TFM/president/presidents-be.csv
/TFM/president/presidents-bg.csv
/TFM/president/presidents-cy.csv
/TFM/president/presidents-cz.csv
/TFM/president/presidents-de.csv
/TFM/president/presidents-dk.csv
/TFM/president/presidents-ee.csv
/TFM/president/presidents-es.csv
/TFM/president/presidents-fi.csv
/TFM/president/presidents-fr.csv
/TFM/president/presidents-gb.csv
/TFM/president/presidents-gr.csv
/TFM/president/presidents-hr.csv
/TFM/president/presidents-hu.csv
/TFM/president/presidents-ie.csv
/TFM/president/presidents-it.csv
/TFM/president/presidents-lt.csv
/TFM/president/presidents-lu.csv
/TFM/president/presidents-lv.csv
/TFM/president/presidents-mt.csv
/TFM/president/presidents-nl.csv
/TFM/president/presidents-pl.csv
/TFM/president/presidents-pt.csv
/TFM/president/presidents-ro.csv
/TFM/president/presidents-se.csv
/TFM/president/presidents-si.csv
/TFM/president/presidents-sk.csv
/TFM/president/presidents-turquia.csv
/TFM/president/presidents-us.csv

In [None]:
list_file_presi = !hdfs dfs -ls /TFM/president | awk '{print $NF}'
list_file_presi = list_file_presi[1:]
print(list_file_presi)


['/TFM/president/presidents-at.csv', '/TFM/president/presidents-be.csv', '/TFM/president/presidents-bg.csv', '/TFM/president/presidents-cy.csv', '/TFM/president/presidents-cz.csv', '/TFM/president/presidents-de.csv', '/TFM/president/presidents-dk.csv', '/TFM/president/presidents-ee.csv', '/TFM/president/presidents-es.csv', '/TFM/president/presidents-fi.csv', '/TFM/president/presidents-fr.csv', '/TFM/president/presidents-gb.csv', '/TFM/president/presidents-gr.csv', '/TFM/president/presidents-hr.csv', '/TFM/president/presidents-hu.csv', '/TFM/president/presidents-ie.csv', '/TFM/president/presidents-it.csv', '/TFM/president/presidents-lt.csv', '/TFM/president/presidents-lu.csv', '/TFM/president/presidents-lv.csv', '/TFM/president/presidents-mt.csv', '/TFM/president/presidents-nl.csv', '/TFM/president/presidents-pl.csv', '/TFM/president/presidents-pt.csv', '/TFM/president/presidents-ro.csv', '/TFM/president/presidents-se.csv', '/TFM/president/presidents-si.csv', '/TFM/president/presidents-

In [None]:
# Load csv de presidents paisos com taules
for file in list_file_presi:
    name_file = np.array(file.split("/"))[-1][:-4]
    name_file = re.sub(r'-', '_', name_file)
    file_presis = sqlContext.read.csv(file, header=True, inferSchema=True, sep=',')
    file_presis.write.saveAsTable(name_file)
 

In [14]:
print(my_spark.catalog.listTables())   

[]


In [13]:
table_ue=my_spark.sql("select * from presidents_turquia")
table_ue.show(n=1000, truncate=False)

AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `presidents_turquia` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [presidents_turquia], [], false


In [15]:
# Passar noticies descarregades diariament per començar, despres s'aniran afe
# afegint desde el py del consumer.
!hdfs dfs -put /home/$USER/TFM/data/news  /TFM/news

In [27]:
!hdfs dfs -ls  /TFM/news/news_r*.csv

-rw-r--r--   1 roser supergroup     294748 2025-04-03 17:24 /TFM/news/news_ro.csv


In [58]:
!hdfs dfs -put /home/$USER/TFM/data/news/news_us.csv /TFM/news/news_us/part-00010.csv

In [34]:
!hdfs dfs -rm /TFM/news/news_ro.csv

Deleted /TFM/news/news_ro.csv


In [15]:
df_ro = my_spark.read.csv("/TFM/news/news_us/", header=True, inferSchema=True)
df_ro.show()  # Mostra les primeres files

NameError: name 'my_spark' is not defined

In [60]:
df_ro.count()

430

In [8]:
df_ro = my_spark.read.csv("/TFM/news/news_es/", header=True, inferSchema=True)
df_ro.count()  # Mostra les primeres files

2159

In [None]:
file = "/TFM/news/news_be.csv"
name_dir = "/TFM/news/news_be"
!hdfs dfs -cp {file} {name_dir}

In [47]:
!hdfs dfs -rm {file}

Deleted /TFM/news/news_be.csv


In [52]:
list_file_news = !hdfs dfs -ls /TFM/news/news*.csv | awk '{print $NF}'
list_file_news = list_file_news[1:]
print(list_file_news)

# Load csv de news as part
for file in list_file_news:
    name_dir = file[:-4]+'/part-00010.csv'
    !hdfs dfs -cp {file} {name_dir}
    !hdfs dfs -rm {file}


['/TFM/news/news_bg.csv', '/TFM/news/news_cy.csv', '/TFM/news/news_cz.csv', '/TFM/news/news_de.csv', '/TFM/news/news_dk.csv', '/TFM/news/news_ee.csv', '/TFM/news/news_en.csv', '/TFM/news/news_es.csv', '/TFM/news/news_fi.csv', '/TFM/news/news_fr.csv', '/TFM/news/news_gb.csv', '/TFM/news/news_gr.csv', '/TFM/news/news_hr.csv', '/TFM/news/news_hu.csv', '/TFM/news/news_ie.csv', '/TFM/news/news_it.csv', '/TFM/news/news_lt.csv', '/TFM/news/news_lu.csv', '/TFM/news/news_lv.csv', '/TFM/news/news_mt.csv', '/TFM/news/news_nl.csv', '/TFM/news/news_pl.csv', '/TFM/news/news_pt.csv', '/TFM/news/news_se.csv', '/TFM/news/news_si.csv', '/TFM/news/news_sk.csv', '/TFM/news/news_us.csv']
Deleted /TFM/news/news_bg.csv
Deleted /TFM/news/news_cy.csv
Deleted /TFM/news/news_cz.csv
Deleted /TFM/news/news_de.csv
Deleted /TFM/news/news_dk.csv
Deleted /TFM/news/news_ee.csv
cp: `/TFM/news/news_en/part-00010.csv': No such file or directory: `hdfs://0.0.0.0:9000/TFM/news/news_en/part-00010.csv'
Deleted /TFM/news/news_

In [61]:
!hdfs dfs -ls  /TFM/

Found 5 items
drwxr-xr-x   - roser supergroup          0 2025-04-04 10:03 /TFM/news
drwxr-xr-x   - roser supergroup          0 2025-04-02 14:11 /TFM/old_news
-rw-r--r--   1 roser supergroup        108 2025-04-02 18:13 /TFM/pais-us.csv
-rw-r--r--   1 roser supergroup        992 2025-04-02 13:26 /TFM/paisos-ue.csv
drwxr-xr-x   - roser supergroup          0 2025-04-02 18:20 /TFM/president


In [62]:
!hdfs dfs -ls  /TFM/news

Found 30 items
drwxr-xr-x   - roser supergroup          0 2025-04-04 09:06 /TFM/news/news_at
-rw-r--r--   1 roser supergroup     182210 2025-04-03 17:25 /TFM/news/news_at.csv
drwxr-xr-x   - roser supergroup          0 2025-04-04 09:42 /TFM/news/news_be
drwxr-xr-x   - roser supergroup          0 2025-04-04 09:47 /TFM/news/news_bg
drwxr-xr-x   - roser supergroup          0 2025-04-04 09:47 /TFM/news/news_cy
drwxr-xr-x   - roser supergroup          0 2025-04-04 09:47 /TFM/news/news_cz
drwxr-xr-x   - roser supergroup          0 2025-04-04 09:47 /TFM/news/news_de
drwxr-xr-x   - roser supergroup          0 2025-04-04 09:47 /TFM/news/news_dk
drwxr-xr-x   - roser supergroup          0 2025-04-04 09:48 /TFM/news/news_ee
drwxr-xr-x   - roser supergroup          0 2025-04-04 09:48 /TFM/news/news_es
drwxr-xr-x   - roser supergroup          0 2025-04-04 09:48 /TFM/news/news_fi
drwxr-xr-x   - roser supergroup          0 2025-04-04 09:48 /TFM/news/news_fr
drwxr-xr-x   - roser supergroup          0 20

In [None]:
# Check data old news (timestamp)
f_old_news = my_spark.read.parquet("/TFM/old_news", header=True, inferSchema=True)
f_old_news.show()  # Mostra les primeres files



+--------+--------------------+--------------------+--------------------+
|iso_code|               title|         description|             pubDate|
+--------+--------------------+--------------------+--------------------+
|      us|Likes Taking Risk...|EVAN SHEFTEL of N...|2008-10-01T00:01:...|
|      us|For Oktoberfest, ...|I MUST confess th...|2008-10-01T00:03:...|
|      us|Tighter Credit On...|DETROIT  After en...|2008-10-01T00:04:...|
|      us|And Then There Wa...|SINCE his death l...|2008-10-01T00:06:...|
|      us|At the Parker Mer...|The name of the n...|2008-10-01T00:09:...|
|      us|Awaiting Palins P...|Those waiting for...|2008-10-01T00:11:...|
|      us|Tunisian Couscous...|Hand-rolled cousc...|2008-10-01T00:11:...|
|      us|David Jones, Film...|David Jones, a ce...|2008-10-01T00:12:...|
|      us|Serious Tool Budd...|These colorful si...|2008-10-01T00:12:...|
|      us|   Names of the Dead|The Department of...|2008-10-01T00:12:...|
|      us|Swiss Said to Be ...|The vei

                                                                                

In [74]:
f_old_news.schema

StructType([StructField('iso_code', StringType(), True), StructField('title', StringType(), True), StructField('description', StringType(), True), StructField('lang_tranlation', StringType(), True), StructField('title_translated', StringType(), True), StructField('description_translated', StringType(), True), StructField('pubDate', StringType(), True)])

In [69]:
from pyspark.sql.functions import to_timestamp
f_old_news = f_old_news.withColumn("pubDate", to_timestamp(f_old_news["pubDate"], "yyyy-MM-dd'T'HH:mm:ss'Z'"))

In [70]:
f_old_news.schema

StructType([StructField('iso_code', StringType(), True), StructField('title', StringType(), True), StructField('description', StringType(), True), StructField('pubDate', TimestampType(), True)])

In [2]:
import sparknlp
from sparknlp.pretrained import PretrainedPipeline

In [None]:
# Inicia Spark NLP
spark_nlp = sparknlp.start()

pipeline = PretrainedPipeline('explain_document_ml', lang='en')
 

your 131072x1 screen size is bogus. expect trouble


:: loading settings :: url = jar:file:/home/roser/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/roser/.ivy2/cache
The jars for the packages stored in: /home/roser/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-14b2985f-745a-410e-a702-c94bc72ace37;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;5.5.3 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-s3;1.12.500 in central
	found com.amazonaws#aws-java-sdk-kms;1.12.500 in central
	found com.amazonaws#aws-java-sdk-core;1.12.500 in central
	found commons-logging#commons-logging;1.1.3 in central
	found commons-codec#commons-codec;1.15 in central
	found org.apache.httpcomponents#httpclient;4.5.13 in central
	found org.apache.httpcomponents#httpcore;4.4.13 in central
	found software.amazon.ion#ion-java;1.0.2 in central
	found joda-time#joda-time;2.8.1 in central
	found com.amazonaws#jmespath-java;1.12.500 in central


explain_document_ml download started this may take some time.
Approx size to download 9 MB
[ | ]

25/04/04 11:36:42 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.
25/04/04 11:36:42 WARN S3AbortableInputStream: Not all bytes were read from the S3ObjectInputStream, aborting HTTP connection. This is likely an error and may result in sub-optimal behavior. Request only the bytes you need via a ranged GET or drain the input stream after use.


explain_document_ml download started this may take some time.
Approximate size to download 9 MB
[ / ]Download done! Loading the resource.
[ \ ]

[Stage 12:>                                                         (0 + 7) / 8]

[ | ]

                                                                                

[ — ]

                                                                                

[OK!]


In [122]:

news_text = "The Supreme Court heard arguments on\xa0whether South Carolina can't remove Planned Parenthood clinics from its state Medicaid program, even though those funds cannot generally be used to fund abortions."
pip_dict = pipeline.annotate(news_text)
for key, value in pip_dict.items():
    print(f"{key}: {value}")

document: ["The Supreme Court heard arguments on\xa0whether South Carolina can't remove Planned Parenthood clinics from its state Medicaid program, even though those funds cannot generally be used to fund abortions."]
spell: ['The', 'Supreme', 'Court', 'heard', 'arguments', 'on\xa0whether', 'South', 'Carolina', "can't", 'remove', 'Planned', 'Parenthood', 'clinics', 'from', 'its', 'state', 'Medicaid', 'program', ',', 'even', 'though', 'those', 'funds', 'cannot', 'generally', 'be', 'used', 'to', 'fund', 'abortions', '.']
pos: ['DT', 'NNP', 'NNP', 'VBD', 'NNS', 'RB', 'NNP', 'NNP', 'VBD', 'VB', 'VBN', 'NNP', 'NNS', 'IN', 'PRP$', 'NN', 'NNP', 'NN', ',', 'RB', 'IN', 'DT', 'NNS', 'NN', 'RB', 'VB', 'VBN', 'TO', 'VB', 'NNS', '.']
lemmas: ['The', 'Supreme', 'Court', 'hear', 'argument', 'on\xa0whether', 'South', 'Carolina', "can't", 'remove', 'Planned', 'Parenthood', 'clinic', 'from', 'it', 'state', 'Medicaid', 'program', ',', 'even', 'though', 'those', 'fund', 'cannot', 'generally', 'be', 'use',

In [None]:

# Check data old news (timestamp)
f_old_news = my_spark.read.csv("/TFM/news/news_es", header=False, schema=schema_news)
f_old_news.show(5)  # Mostra les primeres files
f_old_news.printSchema()

+--------+--------------------+--------------------+--------------------+--------------------+----------------------+-------------------+
|iso_code|               title|         description|     lang_tranlation|    title_translated|description_translated|            pubDate|
+--------+--------------------+--------------------+--------------------+--------------------+----------------------+-------------------+
|iso_code|               title|         description|     lang_tranlation|    title_translated|  description_trans...|               NULL|
|      es|El Sala Ourense s...|El Sala Ourense c...|                  en|The Ourense room ...|  The Ourense room ...|2025-03-18 05:04:00|
|      es|"El juguete que t...|"El Furby fue uno...| que recordaba a ...| lo hizo irresist...|   se vendieron 40 ...|               NULL|
|      es|El dineral que ga...|Más concretament...|                  en|The money that Il...|  More specifically...|2025-03-18 05:30:00|
|      es|Aitana y la perve...|Ant

In [None]:

schema_old_news = StructType([
    StructField("iso_code", StringType(), True),
    StructField("title", StringType(), True),
    StructField("description", StringType(), True),
    StructField("pubDate", TimestampType(), True)
])

# Check data old news (timestamp)
f_old_news = my_spark.read.csv("/TFM/old_news", header=False, schema=schema_old_news)
f_old_news.show()  # Mostra les primeres files

+--------+--------------------+--------------------+-------------------+
|iso_code|               title|         description|            pubDate|
+--------+--------------------+--------------------+-------------------+
|iso_code|               title|         description|               NULL|
|      us|Likes Taking Risk...|EVAN SHEFTEL of N...|2008-10-01 02:01:06|
|      us|For Oktoberfest, ...|I MUST confess th...|2008-10-01 02:03:26|
|      us|Tighter Credit On...|DETROIT  After en...|2008-10-01 02:04:40|
|      us|And Then There Wa...|SINCE his death l...|2008-10-01 02:06:24|
|      us|At the Parker Mer...|The name of the n...|2008-10-01 02:09:15|
|      us|Awaiting Palins P...|Those waiting for...|2008-10-01 02:11:01|
|      us|Tunisian Couscous...|Hand-rolled cousc...|2008-10-01 02:11:39|
|      us|David Jones, Film...|David Jones, a ce...|2008-10-01 02:12:25|
|      us|Serious Tool Budd...|These colorful si...|2008-10-01 02:12:25|
|      us|   Names of the Dead|The Department of...

In [63]:
!ls /home/$USER/TFM/data/old_news

1970_10_news_us_old.csv  1988_11_news_us_old.csv  2006_12_news_us_old.csv
1970_11_news_us_old.csv  1988_12_news_us_old.csv  2006_1_news_us_old.csv
1970_12_news_us_old.csv  1988_1_news_us_old.csv   2006_2_news_us_old.csv
1970_1_news_us_old.csv	 1988_2_news_us_old.csv   2006_3_news_us_old.csv
1970_2_news_us_old.csv	 1988_3_news_us_old.csv   2006_4_news_us_old.csv
1970_3_news_us_old.csv	 1988_4_news_us_old.csv   2006_5_news_us_old.csv
1970_4_news_us_old.csv	 1988_5_news_us_old.csv   2006_6_news_us_old.csv
1970_5_news_us_old.csv	 1988_6_news_us_old.csv   2006_7_news_us_old.csv
1970_6_news_us_old.csv	 1988_7_news_us_old.csv   2006_8_news_us_old.csv
1970_7_news_us_old.csv	 1988_8_news_us_old.csv   2006_9_news_us_old.csv
1970_8_news_us_old.csv	 1988_9_news_us_old.csv   2007_10_news_us_old.csv
1970_9_news_us_old.csv	 1989_10_news_us_old.csv  2007_11_news_us_old.csv
1971_10_news_us_old.csv  1989_11_news_us_old.csv  2007_12_news_us_old.csv
1971_11_news_us_old.csv  1989_12_news_us_old.csv  2007_1

In [7]:
!hdfs dfs -rm -r /TFM/news/news_*/*

Deleted /TFM/news/news_be/_SUCCESS
Deleted /TFM/news/news_be/part-00000-a39e9def-e205-4d3a-9b15-a237f4a53e70-c000.snappy.parquet
Deleted /TFM/news/news_bg/_SUCCESS
Deleted /TFM/news/news_bg/part-00000-eccb6bde-e374-494f-9011-66d01b3f74b2-c000.snappy.parquet
Deleted /TFM/news/news_cy/_SUCCESS
Deleted /TFM/news/news_cy/part-00000-e929b9eb-ef47-4cd7-942b-0b9a1a99eae3-c000.snappy.parquet
Deleted /TFM/news/news_cz/_SUCCESS
Deleted /TFM/news/news_cz/part-00000-4958b2fa-54e4-402e-9603-61f9ff743446-c000.snappy.parquet
Deleted /TFM/news/news_de/_SUCCESS
Deleted /TFM/news/news_de/part-00000-debbb1d5-70ee-425a-9507-bc3067f33560-c000.snappy.parquet
Deleted /TFM/news/news_dk/_SUCCESS
Deleted /TFM/news/news_dk/part-00000-3d2cbefd-c952-469e-8fbf-e020fd9f9aa7-c000.snappy.parquet
Deleted /TFM/news/news_ee/_SUCCESS
Deleted /TFM/news/news_ee/part-00000-18f540ea-3cb3-43a1-b30b-e23cb8c322c4-c000.snappy.parquet
Deleted /TFM/news/news_es/_SUCCESS
Deleted /TFM/news/news_es/part-00000-0d6878d3-7119-4844-98cb-8

In [10]:
!hdfs dfs -ls /TFM/news/news_at/

Found 2 items
-rw-r--r--   1 roser supergroup          0 2025-04-10 09:33 /TFM/news/news_at/_SUCCESS
-rw-r--r--   1 roser supergroup     170909 2025-04-10 09:33 /TFM/news/news_at/part-00000-d104724c-ae54-4544-9f83-a59bda084a1f-c000.snappy.parquet


In [9]:
from pyspark.sql.functions import col

# Carregar noticies europees diaries a hdfs com a parquet a cada directori de pais
list_file_news = !ls /home/$USER/TFM/data/news | awk '{print $NF}'
print(list_file_news)

# Load csv de news as parquet
for file in list_file_news:
    name_dir = file[:-4]
    df = my_spark.read.option("header", "true").schema(schema_news).csv("file:///home/roser/TFM/data/news/"+file)
    df.show(n=3)
    #df_clean = df.filter(col("title_translated").isNotNull() & col("pubDate").isNotNull())
    df.write.mode("append").parquet("/TFM/news/"+name_dir)
    
    

['news_at.csv', 'news_be.csv', 'news_bg.csv', 'news_cy.csv', 'news_cz.csv', 'news_de.csv', 'news_dk.csv', 'news_ee.csv', 'news_es.csv', 'news_fi.csv', 'news_fr.csv', 'news_gb.csv', 'news_gr.csv', 'news_hr.csv', 'news_hu.csv', 'news_ie.csv', 'news_it.csv', 'news_lt.csv', 'news_lu.csv', 'news_lv.csv', 'news_mt.csv', 'news_nl.csv', 'news_pl.csv', 'news_pt.csv', 'news_ro.csv', 'news_se.csv', 'news_si.csv', 'news_sk.csv', 'news_us.csv']
+--------+--------------------+--------------------+--------------------+--------------------+----------------------+-------------------+
|iso_code|               title|         description|     lang_tranlation|    title_translated|description_translated|            pubDate|
+--------+--------------------+--------------------+--------------------+--------------------+----------------------+-------------------+
|      at|"Salehpour inszen...|"Ein junger ameri...| den weltweiten R...|                  en|  "Salehpour stages...|               NULL|
|      at|Se

                                                                                

+--------+--------------------+--------------------+---------------+--------------------+----------------------+-------------------+
|iso_code|               title|         description|lang_tranlation|    title_translated|description_translated|            pubDate|
+--------+--------------------+--------------------+---------------+--------------------+----------------------+-------------------+
|      be|"Wesley Sonck na ...|                NULL|             en|"Wesley Sonck Na ...|                  NULL|2025-03-18 05:03:00|
|      be|Begroting met ove...|De gemeenteraad v...|             en|Budget with surpl...|  The Sint-Lambrech...|2025-03-18 05:24:42|
|      be|220 doden in Gaza...|Bij de nachtelijk...|             en|220 dead in Gaza ...|  At least 220 peop...|2025-03-18 05:20:13|
+--------+--------------------+--------------------+---------------+--------------------+----------------------+-------------------+
only showing top 3 rows

+--------+--------------------+-------------

In [61]:
# Check data old news (timestamp)
f_old_news = my_spark.read.parquet("/TFM/news/news_es")
print(f_old_news.count())  # Mostra les primeres files
f_old_news.printSchema()

322
root
 |-- iso_code: string (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- lang_tranlation: string (nullable = true)
 |-- title_translated: string (nullable = true)
 |-- description_translated: string (nullable = true)
 |-- pubDate: timestamp (nullable = true)



In [1]:
# Eliminar fitxerts parquet
!hdfs dfs -rm -r /TFM/old_news/*

Deleted /TFM/old_news/_SUCCESS
Deleted /TFM/old_news/part-00000-008c1409-2f3d-4423-8476-521436ba1fd2-c000.snappy.parquet
Deleted /TFM/old_news/part-00000-00b6891d-4566-419a-a68c-41db5e001862-c000.snappy.parquet
Deleted /TFM/old_news/part-00000-00b90e1f-d92d-46a5-ad4e-90efea91c5a6-c000.snappy.parquet
Deleted /TFM/old_news/part-00000-01160f2b-fe80-47aa-b591-32d94f28e0d7-c000.snappy.parquet
Deleted /TFM/old_news/part-00000-013395d5-35f9-4057-b538-af98dfb8a4fb-c000.snappy.parquet
Deleted /TFM/old_news/part-00000-01a8a271-8e53-478f-9b17-5ee43adbdccb-c000.snappy.parquet
Deleted /TFM/old_news/part-00000-01f2d7ef-67b8-41ce-b6d3-14bf29ab4795-c000.snappy.parquet
Deleted /TFM/old_news/part-00000-01fa1efa-9edf-400c-8c1d-978859575d9d-c000.snappy.parquet
Deleted /TFM/old_news/part-00000-02584c64-6b1a-4e48-8ad5-fac8c9d0badd-c000.snappy.parquet
Deleted /TFM/old_news/part-00000-028251f4-e889-4048-bc9b-bd91198084e7-c000.snappy.parquet
Deleted /TFM/old_news/part-00000-02a040e7-fc97-4957-87f7-4e28c9350d44

In [7]:
# Validar dir empty
!hdfs dfs -ls /TFM/old_news/

In [None]:
from pyspark.sql.functions import col, to_date, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType

# Carregar noticies antigues New York Times  a hdfs com a parquet al directori de noticies antigues
list_file_news = !ls /home/$USER/TFM/data/old_news | awk '{print $NF}'
list_file_news = list_file_news[1:]

schema_old_news = StructType([
    StructField("iso_code", StringType(), True),
    StructField("title", StringType(), True),
    StructField("description", StringType(), True),
    StructField("pubDate", StringType(), True)
])

# Load csv de old news as parquet
for file in list_file_news:
    print(file)
   
    df = my_spark.read.option("header", "true").schema(schema_old_news).csv("file:///home/roser/TFM/data/old_news/"+file)

    # to check format data to avoid errors data old 1900-01-01 convertim a timestamp i despres ens quedem sols amb la data
    df = df.withColumn("pubDate", to_timestamp(col("pubDate"), "yyyy-MM-dd'T'HH:mm:ssZ"))
    df = df.withColumn("pubDate", to_date(col("pubDate")))  # només la part de la data
    # Filtrar les dates mal formades
    df_filtered = df.filter(col("pubDate").isNotNull())
    df_filtered.write.mode("append").parquet("/TFM/old_news/")

1970_11_news_us_old.csv


                                                                                

1970_12_news_us_old.csv
1970_1_news_us_old.csv


                                                                                

1970_2_news_us_old.csv
1970_3_news_us_old.csv


                                                                                

1970_4_news_us_old.csv


                                                                                

1970_5_news_us_old.csv
1970_6_news_us_old.csv
1970_7_news_us_old.csv
1970_8_news_us_old.csv
1970_9_news_us_old.csv
1971_10_news_us_old.csv
1971_11_news_us_old.csv
1971_12_news_us_old.csv
1971_1_news_us_old.csv
1971_2_news_us_old.csv
1971_3_news_us_old.csv
1971_4_news_us_old.csv
1971_5_news_us_old.csv
1971_6_news_us_old.csv
1971_7_news_us_old.csv
1971_8_news_us_old.csv
1971_9_news_us_old.csv


                                                                                

1972_10_news_us_old.csv
1972_11_news_us_old.csv
1972_12_news_us_old.csv


                                                                                

1972_1_news_us_old.csv
1972_2_news_us_old.csv
1972_3_news_us_old.csv


                                                                                

1972_4_news_us_old.csv
1972_5_news_us_old.csv
1972_6_news_us_old.csv
1972_7_news_us_old.csv


                                                                                

1972_8_news_us_old.csv
1972_9_news_us_old.csv
1973_10_news_us_old.csv
1973_11_news_us_old.csv


                                                                                

1973_12_news_us_old.csv
1973_1_news_us_old.csv
1973_2_news_us_old.csv
1973_3_news_us_old.csv
1973_4_news_us_old.csv


                                                                                

1973_5_news_us_old.csv
1973_6_news_us_old.csv
1973_7_news_us_old.csv
1973_8_news_us_old.csv
1973_9_news_us_old.csv
1974_10_news_us_old.csv
1974_11_news_us_old.csv
1974_12_news_us_old.csv
1974_1_news_us_old.csv
1974_2_news_us_old.csv
1974_3_news_us_old.csv
1974_4_news_us_old.csv
1974_5_news_us_old.csv


                                                                                

1974_6_news_us_old.csv
1974_7_news_us_old.csv
1974_8_news_us_old.csv
1974_9_news_us_old.csv
1975_10_news_us_old.csv
1975_11_news_us_old.csv
1975_12_news_us_old.csv
1975_1_news_us_old.csv
1975_2_news_us_old.csv
1975_3_news_us_old.csv
1975_4_news_us_old.csv
1975_5_news_us_old.csv
1975_6_news_us_old.csv


                                                                                

1975_7_news_us_old.csv
1975_8_news_us_old.csv
1975_9_news_us_old.csv
1976_10_news_us_old.csv
1976_11_news_us_old.csv
1976_12_news_us_old.csv
1976_1_news_us_old.csv
1976_2_news_us_old.csv
1976_3_news_us_old.csv
1976_4_news_us_old.csv
1976_5_news_us_old.csv
1976_6_news_us_old.csv
1976_7_news_us_old.csv
1976_8_news_us_old.csv
1976_9_news_us_old.csv
1977_10_news_us_old.csv
1977_11_news_us_old.csv
1977_12_news_us_old.csv
1977_1_news_us_old.csv
1977_2_news_us_old.csv
1977_3_news_us_old.csv
1977_4_news_us_old.csv
1977_5_news_us_old.csv
1977_6_news_us_old.csv
1977_7_news_us_old.csv
1977_8_news_us_old.csv
1977_9_news_us_old.csv
1978_10_news_us_old.csv
1978_11_news_us_old.csv
1978_12_news_us_old.csv
1978_1_news_us_old.csv
1978_2_news_us_old.csv
1978_3_news_us_old.csv
1978_4_news_us_old.csv
1978_5_news_us_old.csv
1978_6_news_us_old.csv
1978_7_news_us_old.csv
1978_8_news_us_old.csv
1978_9_news_us_old.csv
1979_10_news_us_old.csv
1979_11_news_us_old.csv
1979_12_news_us_old.csv
1979_1_news_us_old.csv

                                                                                

1979_7_news_us_old.csv
1979_8_news_us_old.csv
1979_9_news_us_old.csv
1980_10_news_us_old.csv
1980_11_news_us_old.csv
1980_12_news_us_old.csv
1980_1_news_us_old.csv
1980_2_news_us_old.csv
1980_3_news_us_old.csv
1980_4_news_us_old.csv
1980_5_news_us_old.csv
1980_6_news_us_old.csv
1980_7_news_us_old.csv
1980_8_news_us_old.csv
1980_9_news_us_old.csv
1981_10_news_us_old.csv
1981_11_news_us_old.csv
1981_12_news_us_old.csv
1981_1_news_us_old.csv
1981_2_news_us_old.csv
1981_3_news_us_old.csv
1981_4_news_us_old.csv
1981_5_news_us_old.csv
1981_6_news_us_old.csv
1981_7_news_us_old.csv
1981_8_news_us_old.csv
1981_9_news_us_old.csv
1982_10_news_us_old.csv
1982_11_news_us_old.csv
1982_12_news_us_old.csv
1982_1_news_us_old.csv
1982_2_news_us_old.csv
1982_3_news_us_old.csv
1982_4_news_us_old.csv
1982_5_news_us_old.csv
1982_6_news_us_old.csv
1982_7_news_us_old.csv
1982_8_news_us_old.csv
1982_9_news_us_old.csv
1983_10_news_us_old.csv
1983_11_news_us_old.csv
1983_12_news_us_old.csv
1983_1_news_us_old.csv

                                                                                

1986_10_news_us_old.csv


                                                                                

1986_11_news_us_old.csv
1986_12_news_us_old.csv
1986_1_news_us_old.csv
1986_2_news_us_old.csv
1986_3_news_us_old.csv
1986_4_news_us_old.csv
1986_5_news_us_old.csv
1986_6_news_us_old.csv
1986_7_news_us_old.csv
1986_8_news_us_old.csv
1986_9_news_us_old.csv
1987_10_news_us_old.csv
1987_11_news_us_old.csv
1987_12_news_us_old.csv
1987_1_news_us_old.csv
1987_2_news_us_old.csv
1987_3_news_us_old.csv
1987_4_news_us_old.csv
1987_5_news_us_old.csv
1987_6_news_us_old.csv
1987_7_news_us_old.csv
1987_8_news_us_old.csv
1987_9_news_us_old.csv
1988_10_news_us_old.csv
1988_11_news_us_old.csv
1988_12_news_us_old.csv
1988_1_news_us_old.csv
1988_2_news_us_old.csv
1988_3_news_us_old.csv
1988_4_news_us_old.csv
1988_5_news_us_old.csv
1988_6_news_us_old.csv
1988_7_news_us_old.csv
1988_8_news_us_old.csv
1988_9_news_us_old.csv
1989_10_news_us_old.csv
1989_11_news_us_old.csv
1989_12_news_us_old.csv
1989_1_news_us_old.csv
1989_2_news_us_old.csv
1989_3_news_us_old.csv


                                                                                

1989_4_news_us_old.csv
1989_5_news_us_old.csv
1989_6_news_us_old.csv
1989_7_news_us_old.csv
1989_8_news_us_old.csv
1989_9_news_us_old.csv
1990_10_news_us_old.csv
1990_11_news_us_old.csv
1990_12_news_us_old.csv
1990_1_news_us_old.csv
1990_2_news_us_old.csv
1990_3_news_us_old.csv
1990_4_news_us_old.csv
1990_5_news_us_old.csv
1990_6_news_us_old.csv
1990_7_news_us_old.csv
1990_8_news_us_old.csv
1990_9_news_us_old.csv
1991_10_news_us_old.csv
1991_11_news_us_old.csv
1991_12_news_us_old.csv
1991_1_news_us_old.csv
1991_2_news_us_old.csv
1991_3_news_us_old.csv
1991_4_news_us_old.csv
1991_5_news_us_old.csv
1991_6_news_us_old.csv
1991_7_news_us_old.csv
1991_8_news_us_old.csv
1991_9_news_us_old.csv
1992_10_news_us_old.csv


                                                                                

1992_11_news_us_old.csv
1992_12_news_us_old.csv
1992_1_news_us_old.csv
1992_2_news_us_old.csv
1992_3_news_us_old.csv
1992_4_news_us_old.csv
1992_5_news_us_old.csv
1992_6_news_us_old.csv
1992_7_news_us_old.csv


                                                                                

1992_8_news_us_old.csv
1992_9_news_us_old.csv
1993_10_news_us_old.csv
1993_11_news_us_old.csv
1993_12_news_us_old.csv
1993_1_news_us_old.csv
1993_2_news_us_old.csv
1993_3_news_us_old.csv
1993_4_news_us_old.csv
1993_5_news_us_old.csv
1993_6_news_us_old.csv
1993_7_news_us_old.csv
1993_8_news_us_old.csv
1993_9_news_us_old.csv
1994_10_news_us_old.csv
1994_11_news_us_old.csv
1994_12_news_us_old.csv
1994_1_news_us_old.csv
1994_2_news_us_old.csv
1994_3_news_us_old.csv
1994_4_news_us_old.csv
1994_5_news_us_old.csv
1994_6_news_us_old.csv
1994_7_news_us_old.csv
1994_8_news_us_old.csv
1994_9_news_us_old.csv
1995_10_news_us_old.csv


                                                                                

1995_11_news_us_old.csv
1995_12_news_us_old.csv
1995_1_news_us_old.csv
1995_2_news_us_old.csv
1995_3_news_us_old.csv
1995_4_news_us_old.csv


                                                                                

1995_5_news_us_old.csv
1995_6_news_us_old.csv
1995_7_news_us_old.csv
1995_8_news_us_old.csv
1995_9_news_us_old.csv
1996_10_news_us_old.csv
1996_11_news_us_old.csv
1996_12_news_us_old.csv
1996_1_news_us_old.csv


                                                                                

1996_2_news_us_old.csv
1996_3_news_us_old.csv
1996_4_news_us_old.csv
1996_5_news_us_old.csv


                                                                                

1996_6_news_us_old.csv
1996_7_news_us_old.csv


                                                                                

1996_8_news_us_old.csv


                                                                                

1996_9_news_us_old.csv


                                                                                

1997_10_news_us_old.csv
1997_11_news_us_old.csv
1997_12_news_us_old.csv


                                                                                

1997_1_news_us_old.csv


                                                                                

1997_2_news_us_old.csv
1997_3_news_us_old.csv


                                                                                

1997_4_news_us_old.csv


                                                                                

1997_5_news_us_old.csv
1997_6_news_us_old.csv


                                                                                

1997_7_news_us_old.csv
1997_8_news_us_old.csv


                                                                                

1997_9_news_us_old.csv
1998_10_news_us_old.csv
1998_11_news_us_old.csv


                                                                                

1998_12_news_us_old.csv
1998_1_news_us_old.csv


                                                                                

1998_2_news_us_old.csv


                                                                                

1998_3_news_us_old.csv


                                                                                

1998_4_news_us_old.csv


                                                                                

1998_5_news_us_old.csv
1998_6_news_us_old.csv
1998_7_news_us_old.csv


                                                                                

1998_8_news_us_old.csv


                                                                                

1998_9_news_us_old.csv
1999_10_news_us_old.csv
1999_11_news_us_old.csv
1999_12_news_us_old.csv
1999_1_news_us_old.csv
1999_2_news_us_old.csv
1999_3_news_us_old.csv
1999_4_news_us_old.csv
1999_5_news_us_old.csv
1999_6_news_us_old.csv
1999_7_news_us_old.csv
1999_8_news_us_old.csv
1999_9_news_us_old.csv
2000_10_news_us_old.csv
2000_11_news_us_old.csv
2000_12_news_us_old.csv
2000_1_news_us_old.csv
2000_2_news_us_old.csv
2000_3_news_us_old.csv
2000_4_news_us_old.csv
2000_5_news_us_old.csv
2000_6_news_us_old.csv


                                                                                

2000_7_news_us_old.csv
2000_8_news_us_old.csv
2000_9_news_us_old.csv
2001_10_news_us_old.csv
2001_11_news_us_old.csv
2001_12_news_us_old.csv
2001_1_news_us_old.csv
2001_2_news_us_old.csv
2001_3_news_us_old.csv
2001_4_news_us_old.csv
2001_5_news_us_old.csv
2001_6_news_us_old.csv
2001_7_news_us_old.csv
2001_8_news_us_old.csv
2001_9_news_us_old.csv
2002_10_news_us_old.csv
2002_11_news_us_old.csv
2002_12_news_us_old.csv
2002_1_news_us_old.csv
2002_2_news_us_old.csv
2002_3_news_us_old.csv
2002_4_news_us_old.csv
2002_5_news_us_old.csv
2002_6_news_us_old.csv
2002_7_news_us_old.csv
2002_8_news_us_old.csv
2002_9_news_us_old.csv
2003_10_news_us_old.csv
2003_11_news_us_old.csv
2003_12_news_us_old.csv
2003_1_news_us_old.csv
2003_2_news_us_old.csv
2003_3_news_us_old.csv
2003_4_news_us_old.csv
2003_5_news_us_old.csv
2003_6_news_us_old.csv


                                                                                

2003_7_news_us_old.csv
2003_8_news_us_old.csv
2003_9_news_us_old.csv
2004_10_news_us_old.csv
2004_11_news_us_old.csv


                                                                                

2004_12_news_us_old.csv


                                                                                

2004_1_news_us_old.csv
2004_2_news_us_old.csv
2004_3_news_us_old.csv
2004_4_news_us_old.csv


                                                                                

2004_5_news_us_old.csv
2004_6_news_us_old.csv
2004_7_news_us_old.csv
2004_8_news_us_old.csv
2004_9_news_us_old.csv
2005_10_news_us_old.csv
2005_11_news_us_old.csv
2005_12_news_us_old.csv


                                                                                

2005_1_news_us_old.csv
2005_2_news_us_old.csv
2005_3_news_us_old.csv
2005_4_news_us_old.csv
2005_5_news_us_old.csv
2005_6_news_us_old.csv
2005_7_news_us_old.csv
2005_8_news_us_old.csv
2005_9_news_us_old.csv
2006_10_news_us_old.csv
2006_11_news_us_old.csv


                                                                                

2006_12_news_us_old.csv
2006_1_news_us_old.csv
2006_2_news_us_old.csv
2006_3_news_us_old.csv
2006_4_news_us_old.csv
2006_5_news_us_old.csv
2006_6_news_us_old.csv


                                                                                

2006_7_news_us_old.csv
2006_8_news_us_old.csv
2006_9_news_us_old.csv
2007_10_news_us_old.csv
2007_11_news_us_old.csv
2007_12_news_us_old.csv
2007_1_news_us_old.csv
2007_2_news_us_old.csv
2007_3_news_us_old.csv
2007_4_news_us_old.csv
2007_5_news_us_old.csv
2007_6_news_us_old.csv
2007_7_news_us_old.csv
2007_8_news_us_old.csv
2007_9_news_us_old.csv
2008_10_news_us_old.csv
2008_11_news_us_old.csv
2008_12_news_us_old.csv
2008_1_news_us_old.csv
2008_2_news_us_old.csv
2008_3_news_us_old.csv
2008_4_news_us_old.csv
2008_5_news_us_old.csv
2008_6_news_us_old.csv
2008_7_news_us_old.csv
2008_8_news_us_old.csv
2008_9_news_us_old.csv
2009_10_news_us_old.csv
2009_11_news_us_old.csv
2009_12_news_us_old.csv
2009_1_news_us_old.csv
2009_2_news_us_old.csv
2009_3_news_us_old.csv
2009_4_news_us_old.csv
2009_5_news_us_old.csv


                                                                                

2009_6_news_us_old.csv
2009_7_news_us_old.csv
2009_8_news_us_old.csv
2009_9_news_us_old.csv
2010_10_news_us_old.csv
2010_11_news_us_old.csv
2010_12_news_us_old.csv
2010_1_news_us_old.csv
2010_2_news_us_old.csv
2010_3_news_us_old.csv
2010_4_news_us_old.csv
2010_5_news_us_old.csv
2010_6_news_us_old.csv
2010_7_news_us_old.csv
2010_8_news_us_old.csv
2010_9_news_us_old.csv
2011_10_news_us_old.csv
2011_11_news_us_old.csv
2011_12_news_us_old.csv
2011_1_news_us_old.csv
2011_2_news_us_old.csv
2011_3_news_us_old.csv
2011_4_news_us_old.csv
2011_5_news_us_old.csv
2011_6_news_us_old.csv
2011_7_news_us_old.csv
2011_8_news_us_old.csv
2011_9_news_us_old.csv
2012_10_news_us_old.csv
2012_11_news_us_old.csv
2012_12_news_us_old.csv
2012_1_news_us_old.csv
2012_2_news_us_old.csv
2012_3_news_us_old.csv
2012_4_news_us_old.csv
2012_5_news_us_old.csv


                                                                                

2012_6_news_us_old.csv
2012_7_news_us_old.csv
2012_8_news_us_old.csv
2012_9_news_us_old.csv
2013_10_news_us_old.csv
2013_11_news_us_old.csv
2013_12_news_us_old.csv
2013_1_news_us_old.csv
2013_2_news_us_old.csv
2013_3_news_us_old.csv


                                                                                

2013_4_news_us_old.csv
2013_5_news_us_old.csv
2013_6_news_us_old.csv


                                                                                

2013_7_news_us_old.csv
2013_8_news_us_old.csv
2013_9_news_us_old.csv
2014_10_news_us_old.csv
2014_11_news_us_old.csv
2014_12_news_us_old.csv
2014_1_news_us_old.csv
2014_2_news_us_old.csv
2014_3_news_us_old.csv
2014_4_news_us_old.csv
2014_5_news_us_old.csv
2014_6_news_us_old.csv
2014_7_news_us_old.csv
2014_8_news_us_old.csv
2014_9_news_us_old.csv
2015_10_news_us_old.csv
2015_11_news_us_old.csv
2015_12_news_us_old.csv
2015_1_news_us_old.csv
2015_2_news_us_old.csv
2015_3_news_us_old.csv
2015_4_news_us_old.csv
2015_5_news_us_old.csv
2015_6_news_us_old.csv


                                                                                

2015_7_news_us_old.csv
2015_8_news_us_old.csv
2015_9_news_us_old.csv
2016_10_news_us_old.csv
2016_11_news_us_old.csv
2016_12_news_us_old.csv
2016_1_news_us_old.csv
2016_2_news_us_old.csv
2016_3_news_us_old.csv
2016_4_news_us_old.csv
2016_5_news_us_old.csv
2016_6_news_us_old.csv
2016_7_news_us_old.csv
2016_8_news_us_old.csv
2016_9_news_us_old.csv
2017_10_news_us_old.csv
2017_11_news_us_old.csv
2017_12_news_us_old.csv
2017_1_news_us_old.csv


                                                                                

2017_2_news_us_old.csv
2017_3_news_us_old.csv


                                                                                

2017_4_news_us_old.csv


                                                                                

2017_5_news_us_old.csv


                                                                                

2017_6_news_us_old.csv
2017_7_news_us_old.csv


                                                                                

2017_8_news_us_old.csv
2017_9_news_us_old.csv
2018_10_news_us_old.csv
2018_11_news_us_old.csv
2018_12_news_us_old.csv
2018_1_news_us_old.csv
2018_2_news_us_old.csv
2018_3_news_us_old.csv
2018_4_news_us_old.csv
2018_5_news_us_old.csv
2018_6_news_us_old.csv
2018_7_news_us_old.csv
2018_8_news_us_old.csv
2018_9_news_us_old.csv
2019_10_news_us_old.csv
2019_11_news_us_old.csv
2019_12_news_us_old.csv
2019_1_news_us_old.csv
2019_2_news_us_old.csv
2019_3_news_us_old.csv
2019_4_news_us_old.csv
2019_5_news_us_old.csv
2019_6_news_us_old.csv
2019_7_news_us_old.csv
2019_8_news_us_old.csv
2019_9_news_us_old.csv
2020_10_news_us_old.csv
2020_11_news_us_old.csv


                                                                                

2020_12_news_us_old.csv
2020_1_news_us_old.csv
2020_2_news_us_old.csv
2020_3_news_us_old.csv
2020_4_news_us_old.csv
2020_5_news_us_old.csv
2020_6_news_us_old.csv
2020_7_news_us_old.csv
2020_8_news_us_old.csv
2020_9_news_us_old.csv
2021_10_news_us_old.csv
2021_11_news_us_old.csv


                                                                                

2021_12_news_us_old.csv
2021_1_news_us_old.csv
2021_2_news_us_old.csv
2021_3_news_us_old.csv
2021_4_news_us_old.csv
2021_5_news_us_old.csv
2021_6_news_us_old.csv
2021_7_news_us_old.csv
2021_8_news_us_old.csv
2021_9_news_us_old.csv
2022_10_news_us_old.csv
2022_11_news_us_old.csv
2022_12_news_us_old.csv
2022_1_news_us_old.csv
2022_2_news_us_old.csv
2022_3_news_us_old.csv
2022_4_news_us_old.csv
2022_5_news_us_old.csv
2022_6_news_us_old.csv
2022_7_news_us_old.csv
2022_8_news_us_old.csv
2022_9_news_us_old.csv
2023_10_news_us_old.csv
2023_11_news_us_old.csv
2023_12_news_us_old.csv
2023_1_news_us_old.csv
2023_2_news_us_old.csv
2023_3_news_us_old.csv
2023_4_news_us_old.csv
2023_5_news_us_old.csv
2023_6_news_us_old.csv
2023_7_news_us_old.csv
2023_8_news_us_old.csv
2023_9_news_us_old.csv
2025_1_news_us_old.csv
2025_2_news_us_old.csv
2025_3_news_us_old.csv


In [None]:
# Check data old news recompte i schema (timestamptype)
f_old_news = my_spark.read.parquet("/TFM/old_news")
print(f_old_news.count())  # Mostra les primeres files
f_old_news.printSchema()

In [113]:
from pyspark.sql.functions import to_date, col
# revisar dates antigues
df = my_spark.read.option("header", "true").schema(schema_old_news).csv("file:///home/roser/TFM/data/old_news/1992_11_news_us_old.csv")
df = df.withColumn("pubDate", to_date(col("pubDate"), "yyyy-MM-dd"))
print(df.count())
# Filtrar les dates mal formades
df_filtered = df.filter(col("pubDate").isNotNull())
df_filtered.count()

7454


6369

In [None]:
from pyspark.sql import functions as F
df_filtered.show()

df = df.withColumn(
    "pubDate",
    F.when(F.col("pubDate") == "1952-01-01", F.lit("1992-01-01")).otherwise(F.col("pubDate"))
)
# Calcular la data màxima i mínima
min_date = df.agg(F.min("pubDate").alias("min_date")).collect()[0]["min_date"]
max_date = df.agg(F.max("pubDate").alias("max_date")).collect()[0]["max_date"]

# Mostrar els resultats
print(f"Min Date: {min_date}")
print(f"Max Date: {max_date}")

+--------+--------------------+--------------------+----------+
|iso_code|               title|         description|   pubDate|
+--------+--------------------+--------------------+----------+
|      us|Quiet Draw for Fi...|Bobby Fischer tri...|1992-11-01|
|      us| Valvano Back on Air|JIM VALVANO, the ...|1992-11-01|
|      us|Fall Cleanup Day ...|DESPITE the threa...|1992-11-01|
|      us|If You're Thinkin...|PARK SLOPE is fam...|1992-11-01|
|      us|WHITE MEN CAN'T DRUM|What a pleasure t...|1992-11-01|
|      us|IN SHORT: NONFICTION|THE REPORTER WHO ...|1992-11-01|
|      us|Pull Up a Chair, ...|TO: Mayor David N...|1992-11-01|
|      us|Broken Hearts and...|WHERE IS HERE? St...|1992-11-01|
|      us|Who Wants Only 'T...|Recent hand-wring...|1992-11-01|
|      us|   IN SHORT: FICTION|MOSTLY HARMLESS B...|1992-11-01|
|      us|Christy Falcon, B...|Howard J. Falcon ...|1992-11-01|
|      us|   WESTCHESTER GUIDE|      INTO THE WOODS|1992-11-01|
|      us|              INSIDE|     A Fe

In [121]:
df.write.mode("append").parquet("/TFM/old_news/")

In [21]:
df_last_news = my_spark.read.parquet("/TFM/cleaned/news")
df_last_news.show(n=3)
df_last_news.printSchema()
df_dates_range = df_last_news.groupBy("iso_code") \
                   .agg(
                       F.min(F.col("pubDate")).alias("min_date"),
                       F.max(F.col("pubDate")).alias("max_date")
                   )

df_dates_range.show(n=30, truncate=False)

+--------+----------+--------------------+
|iso_code|   pubDate|          words_text|
+--------+----------+--------------------+
|      cz|2025-03-29|early end stunnin...|
|      cz|2025-04-03|prime minister od...|
|      cz|2025-03-23|police floor twen...|
+--------+----------+--------------------+
only showing top 3 rows

root
 |-- iso_code: string (nullable = true)
 |-- pubDate: date (nullable = true)
 |-- words_text: string (nullable = true)

+--------+----------+----------+
|iso_code|min_date  |max_date  |
+--------+----------+----------+
|us      |2025-03-17|2025-04-08|
|ro      |2025-03-18|2025-04-09|
|lv      |2025-03-18|2025-04-09|
|pl      |2025-03-18|2025-04-09|
|sk      |2025-03-18|2025-04-09|
|pt      |2025-03-18|2025-04-09|
|be      |2025-03-18|2025-04-09|
|gb      |2025-03-18|2025-04-09|
|de      |2025-03-18|2025-04-09|
|lu      |2025-03-17|2025-04-09|
|es      |2025-03-18|2025-04-09|
|hr      |2025-03-18|2025-04-09|
|it      |2025-03-18|2025-04-09|
|nl      |2025-03-18|

In [None]:
df_last_news = my_spark.read.parquet("/TFM/news/news_us/*")
df_last_news.show(n=1000)

+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+-------------------+
|            iso_code|               title|         description|     lang_tranlation|    title_translated|description_translated|            pubDate|
+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+-------------------+
|                  us|Rory McIlroy take...|PONTE VEDRA BEACH...|                  en|Rory McIlroy take...|  PONTE VEDRA BEACH...|2025-03-17 15:22:32|
|                  us|At least 40 kille...|                NULL|                  en|At least 40 kille...|                  NULL|2025-03-17 19:28:19|
|                  us|Fast-fashion stap...|F21 OpCo, operato...|                  en|Fast-fashion stap...|  F21 OpCo, operato...|2025-03-17 19:08:45|
|                  us|Rory McIlroy turn...|PONTE VEDRA BEACH...|                  en|Rory McIlroy tu