In [None]:
#Dirección en Kaggle: https://www.kaggle.com/alexvargasvalderrama/proyecto-utec-jm-grupo2-final
!pip install pyspark
!pip install langdetect
!pip install nltk

In [None]:
import pandas as pd
import numpy as np
import os
import plotly.express as px
import warnings 
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
# Silence library warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Local Spark session named "covid" that uses every available core.
_spark_builder = SparkSession.builder.appName("covid").master("local[*]")
for _clave, _valor in (
    ("spark.driver.memory", "16g"),
    ("spark.executor.memory", "16g"),
    ("spark.driver.maxResultSize", "4g"),
):
    _spark_builder = _spark_builder.config(_clave, _valor)
spark = _spark_builder.getOrCreate()

# Parte 1. Carga y Limpieza de datos
## Por el gran volumen de la data esta parte se realizó en el Khipu

In [None]:
# Load the metadata file of the CORD-19-research-challenge dataset; the first row holds the column names.
df_csv = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("../input/CORD-19-research-challenge/metadata.csv")
)

In [None]:
# Move the metadata into pandas and keep only the columns used downstream.
# 'doi' is listed once: selecting it twice produces two columns with the same name,
# so df['doi'] would return a DataFrame (which has no `.str`) instead of a Series.
COLUMNAS_METADATA = ['cord_uid', 'title', 'doi', 'abstract', 'publish_time', 'authors',
                     'journal', 'pmcid', 'pubmed_id', 'pdf_json_files']
df = df_csv.toPandas()
df = df[COLUMNAS_METADATA]

In [None]:
# Drop the articles that have no abstract.
tiene_resumen = df['abstract'].notnull()
df = df[tiene_resumen]

In [None]:
# Remove the section labels that structured abstracts carry.
# The labels are removed one after another, in exactly this order.
ETIQUETAS_SECCION = (
    'BACKGROUND:', 'BACKGROUNDS:', 'OBJECTIVES:', 'OBJECTIVE:',
    'METHODS:', 'METHOD:', 'RESULTS:', 'RESULT:',
    'CONCLUSION:', 'CONCLUSIONS:',
)


def _quitar_etiquetas(texto):
    """Return `texto` with every section label deleted."""
    for etiqueta in ETIQUETAS_SECCION:
        texto = texto.replace(etiqueta, '')
    return texto


df['abstract'] = df['abstract'].apply(_quitar_etiquetas)

In [None]:
def _normalizar_resumen(texto):
    """Lowercase the abstract, then delete the publisher copyright sentence."""
    en_minusculas = texto.lower()
    return en_minusculas.replace('this article is protected by copyright. all rights reserved', '')


# Lowercase every abstract and strip the copyright notice in a single pass.
df['abstract'] = df['abstract'].apply(_normalizar_resumen)

In [None]:
# Parse the publication date strings (YYYY-MM-DD) into datetimes;
# values that cannot be parsed with that format become NaT because of errors='coerce'.
fechas_publicacion = pd.to_datetime(
    df['publish_time'],
    format='%Y-%m-%d',
    errors='coerce',
)
df['publish_time_new'] = fechas_publicacion

In [None]:
# Remove the articles published before 2020-01-01.
# The comparison is inclusive (>=) so that articles dated exactly 2020-01-01 are kept;
# rows whose date could not be parsed (NaT) compare as False and are dropped.
import datetime
FECHA_MINIMA = pd.Timestamp('2020-01-01')
df = df[df['publish_time_new'] >= FECHA_MINIMA]

In [None]:
# Detect the language of each abstract and keep only the English ones.
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0  # fixed seed so that repeated runs use the same detector seed


def langdet (x):
    """Return the language code detected for `x`, or "NA" when detection fails.

    Only `Exception` subclasses are treated as a failed detection, so
    KeyboardInterrupt / SystemExit still stop this long-running step.
    """
    try:
        return detect(x)
    except Exception:
        return "NA"


df['lang'] = df['abstract'].apply(langdet)
# Exact comparison: a substring test would also accept any other code that merely contains "en".
df = df[df['lang'] == 'en']

In [None]:
#Convertimos el abstract limpio a tokens y encontramos los unigramas y bigramas
import re
import nltk
import string
from textblob import TextBlob
# Stop-word set: NLTK's English list plus the project's extra list (one word per line).
stopword = nltk.corpus.stopwords.words('english')
# The context manager closes the file handle once the extra list has been read.
with open("/kaggle/input/stopword/stopwords.txt", "r") as my_file:
    content = my_file.read().split('\n')
stopword.extend(content)
# Strip surrounding whitespace and de-duplicate in one step; a set gives fast membership tests.
stopword = {w.strip() for w in stopword}
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()
from nltk import bigrams

def tokenization(text):
    """Return the distinct whitespace-separated tokens of `text`, joined by commas.

    Tokens keep the order of their first appearance, so the result is the same on
    every run (iterating a `set` of strings can change order between interpreter runs).
    An empty or whitespace-only `text` yields an empty string.
    """
    return ','.join(dict.fromkeys(text.split()))
def clean_text(text):
    """Return `text` lowercased, without digits, punctuation or stop words.

    Steps: drop whitespace-separated words that are found inside
    `string.punctuation`, lowercase the rest, delete digit runs, split on
    non-word characters and remove the tokens present in the module-level
    `stopword` collection. The surviving tokens are joined with single spaces.
    """
    text_lc = " ".join(word.lower() for word in text.split() if word not in string.punctuation)
    text_rc = re.sub('[0-9]+', '', text_lc)
    # Raw string: '\W' inside a normal string literal is an invalid escape sequence.
    tokens = re.split(r'\W+', text_rc)
    # re.split yields '' at the edges when the text starts or ends with a separator; skip those.
    return ' '.join(word for word in tokens if word and word not in stopword)
def _bigramas_de(texto):
    """Return the distinct adjacent word pairs of `texto`, comma-joined, in first-seen order."""
    pares = (izq.strip() + " " + der.strip() for izq, der in bigrams(texto.split()))
    return ','.join(dict.fromkeys(pares))


# Titles: force to str (missing values become the text 'nan') and lowercase.
df['title'] = df['title'].apply(str)
df['title'] = df['title'].apply(lambda x: x.lower())
df['clean_text'] = df['abstract'].apply(clean_text)
df['unigram'] = df['clean_text'].apply(tokenization)
# Bigrams come from the ordered clean text. The 'unigram' column is a de-duplicated
# token list, so pairing its neighbours does not give words that are adjacent in the abstract.
df['bigram'] = df['clean_text'].apply(_bigramas_de)

In [None]:
# Keep the articles whose abstract or title mentions the coronavirus.
PATRON_COVID = 'coronavirus|covid|2019-ncov|sars-cov'
en_resumen = df['abstract'].str.contains(PATRON_COVID)
en_titulo = df['title'].str.contains(PATRON_COVID)
df = df[en_resumen | en_titulo]

In [None]:
# Keep the articles about our topic: after-effects (sequelae) of COVID.
# Abstracts and titles were lowercased earlier, so the whole pattern must be lowercase;
# an uppercase 'COVID-19' alternative could never match.
PATRON_SECUELAS = 'post-acute covid-19 syndrome|complications|sequelae|consequence|hauler|long-term |chronic'
df = df[df['abstract'].str.contains(PATRON_SECUELAS)
        | df['title'].str.contains(PATRON_SECUELAS)]


In [None]:
# Separate the articles that share a title with another article.
# `dup` holds every copy of a repeated title, with missing values filled with '-999';
# `df` keeps only the titles that appear once.
has_dup = df.duplicated(subset="title", keep=False)
dup = df[has_dup].fillna('-999')
df = df[~has_dup]

In [None]:
# Among the repeated titles keep only the rows where journal, pmcid, pubmed_id and doi
# are all present ('-999' marks a missing value), then keep the first row per title.
# NOTE(review): a repeated title whose every copy lacks one of these fields is dropped
# entirely here — confirm that this is intended.
SIN_DATO = '-999'
le_falta_algo = (
    dup['journal'].str.contains(SIN_DATO)
    | dup['pmcid'].str.contains(SIN_DATO)
    | dup['pubmed_id'].astype(str).str.contains(SIN_DATO)
    | dup['doi'].str.contains(SIN_DATO)
)
dup1 = dup[~le_falta_algo].drop_duplicates(subset="title", keep='first')

In [None]:
# Stack the never-duplicated articles and the single rows kept from the duplicated titles.
partes = [df, dup1]
df1 = pd.concat(partes)

In [None]:
# Save the cleaned data to its own file.
df1 = df1.reset_index()
# Drop any repeated column name (keeping the first occurrence) so the CSV has a single 'doi'.
df1 = df1.loc[:, ~df1.columns.duplicated()]
# 'doi.1' only exists when the frame was read back from a CSV that had two 'doi'
# columns; errors='ignore' avoids a KeyError when it is absent.
df1 = df1.drop(columns=['doi.1'], errors='ignore')
df1.to_csv('Selected_articles_clean_text_eng_duplicate_removed_01.csv', index=False)

# Parte 2: Aplicación de Machine Learning para agruparlos por clusters

In [None]:
# Load the cleaned data; missing values are replaced by empty strings.
RUTA_DATA_LIMPIA = '../input/data-procesada/Selected_articles_clean_text_eng_duplicate_removed_01.csv'
df = pd.read_csv(RUTA_DATA_LIMPIA)
df = df.fillna('')

In [None]:
# Para la ingeniería de variables usamos TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import re
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")


def tokenize_and_stem(text):
    """Split `text` into sentences and words, keep the tokens that contain
    at least one ASCII letter, and return their Snowball stems as a list."""
    palabras = (
        palabra
        for oracion in nltk.sent_tokenize(text)
        for palabra in nltk.word_tokenize(oracion)
    )
    return [stemmer.stem(palabra) for palabra in palabras if re.search('[a-zA-Z]', palabra)]

# TF-IDF vectorizer configuration.
# max_df=0.90 and min_df=10 bound the document frequency of the terms that are kept;
# unigrams and bigrams are generated (ngram_range=(1,2)) from the stems that
# tokenize_and_stem returns.
# NOTE(review): stop_words='english' is combined with a stemming tokenizer — confirm the
# built-in stop-word list still matches the stemmed tokens.
tfidf_vectorizer = TfidfVectorizer(max_df=0.90, min_df=10, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,2))


# Fit on the cleaned abstracts and time the step with the %time magic.
%time tfidf_matrix = tfidf_vectorizer.fit_transform(df['clean_text'].tolist()) 

# Shape of the resulting matrix.
print(tfidf_matrix.shape)

In [None]:
# Buscamos el posible número de clusters usando el método Elbow
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from yellowbrick.cluster import KElbowVisualizer
from sklearn.decomposition import TruncatedSVD

# Reduce the TF-IDF matrix to 200 components before the elbow search.
# random_state is fixed here and on KMeans so the elbow plot is reproducible.
# NOTE(review): the elbow is evaluated on this reduced matrix, while the final KMeans
# cell fits on the full `tfidf_matrix` — confirm the chosen k carries over.
pca = TruncatedSVD(n_components=200, random_state=0)
X = pca.fit_transform(tfidf_matrix)


# Set up the clustering model and the elbow visualizer for k = 2..49.
model = KMeans(random_state=0)
visualizer = KElbowVisualizer(model, k=(2,50))

visualizer.fit(X)        # fit the model for every k inside the visualizer
visualizer.show()        # draw the figure; pass outpath="kelbow_kmeans.png" to save it

In [None]:
# Cluster with k = 21, the value suggested by the elbow method.
from sklearn.cluster import KMeans
X = tfidf_matrix
km = KMeans(
    n_clusters=21,
    init='k-means++',
    random_state=0,
)
# fit() returns the estimator itself, so the labels can be predicted in the same expression.
predict = km.fit(X).predict(X)

In [None]:
# Attach the predicted cluster number to each abstract (aligned on the frame's index).
etiquetas_cluster = pd.Series(predict, index=df.index)
df['cluster'] = etiquetas_cluster

In [None]:
# Write the articles with their cluster number to a pipe-separated file (no index column).
df.to_csv(
    'kmeans_resultados.csv',
    sep="|",
    index=None,
)

In [None]:
# Collect the most frequent unigrams and bigrams of each of the 21 clusters
# so that the experts can review them and validate a label for every cluster.
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import math
clusters = []
for i in range(21):
    print(f'Topic {i} :')
    en_cluster = df[df['cluster'] == i]
    # Share of the corpus that falls in this cluster.
    n_cluster, n_total = len(en_cluster['unigram']), len(df)
    stats = f'Topic {i} stats: {n_cluster/n_total*100}% ({n_cluster}/{n_total})'
    print(stats)
    print(i)
    fila = [stats]
    # Same counting for both n-gram columns: terms are comma-separated per article.
    for columna, etiqueta in (('unigram', 'unigrams'), ('bigram', 'bigrams')):
        cnt = Counter()
        for st in en_cluster[columna].tolist():
            cnt.update(x.strip() for x in st.split(','))
        del cnt['']  # empty strings come from articles without terms
        top = ', '.join(termino for termino, _ in cnt.most_common(100))
        print(f'Topic{i} top {etiqueta} : {top}')
        print('\n')
        fila.append(top)

    clusters.append(fila)

    print('\n\n\n\n\n')

In [None]:
# Save the top-100 unigrams and bigrams per cluster for the expert review.
cluster_results = pd.DataFrame(clusters, columns=['Stats','Top100Unigrams','Top100Bigrams'])
# The cluster number is the row position; insert it as the first column.
cluster_results.insert(0, 'ClusterNumber', pd.RangeIndex(len(cluster_results)))
cluster_results.to_csv('top_terms_in_clusters_new.csv', index=None)

In [None]:
# After the expert review some clusters are merged.
# Start by loading the file with the current cluster assignment.
RUTA_KMEANS = '../input/dataresultado/kmeans_resultados.csv'
df = pd.read_csv(RUTA_KMEANS, sep="|")
df = df.fillna('')

In [None]:
# After the expert review some clusters are merged.
# Each pair is (original cluster, merged cluster); pairs are checked in this order.
_FUSIONES_CLUSTER = (
    (19, 4),
    (15, 5),
    (13, 11),
    (18, 12),
    (14, 13),
    (17, 13),
    (16, 14),
    (20, 15),
)


def reclasificar(x):
    """Return the merged cluster id for `x`, or `x` itself when it is not remapped."""
    for origen, destino in _FUSIONES_CLUSTER:
        if x == origen:
            return destino
    return x
# Apply the merge to every article.
df['cluster'] = df['cluster'].apply(reclasificar)

In [None]:
# Write the articles with the cluster numbers updated after the expert review
# (pipe-separated, no index column).
df.to_csv(
    'kmeans_resultados_final.csv',
    sep="|",
    index=None,
)

In [None]:
# Final version of the top unigrams and bigrams per cluster,
# computed over the 16 clusters that remain after the expert review.
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import math
clusters = []
for i in range(16):
    print(f'Topic {i} :')
    en_cluster = df[df['cluster'] == i]
    # Share of the corpus that falls in this cluster.
    n_cluster, n_total = len(en_cluster['unigram']), len(df)
    stats = f'Topic {i} stats: {n_cluster/n_total*100}% ({n_cluster}/{n_total})'
    print(stats)
    print(i)
    fila = [stats]
    # Same counting for both n-gram columns: terms are comma-separated per article.
    for columna, etiqueta in (('unigram', 'unigrams'), ('bigram', 'bigrams')):
        cnt = Counter()
        for st in en_cluster[columna].tolist():
            cnt.update(x.strip() for x in st.split(','))
        del cnt['']  # empty strings come from articles without terms
        top = ', '.join(termino for termino, _ in cnt.most_common(100))
        print(f'Topic{i} top {etiqueta} : {top}')
        print('\n')
        fila.append(top)

    clusters.append(fila)

    print('\n\n\n\n\n')

In [None]:
# Save the top-100 unigrams and bigrams per cluster after the expert review.
cluster_results = pd.DataFrame(clusters, columns=['Stats','Top100Unigrams','Top100Bigrams'])
# The cluster number is the row position; insert it as the first column.
cluster_results.insert(0, 'ClusterNumber', pd.RangeIndex(len(cluster_results)))
cluster_results.to_csv('top_terms_in_clusters_new_final.csv', index=None)

# Parte 3: Análisis descriptivo con SPARK SQL

In [None]:
from pyspark.sql.functions import isnan, when, count, col, year, month, to_date

In [None]:
# Load the result of the previous part (pipe-separated, with a header row) into Spark.
df_consultas = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("sep", "|")
    .load("../input/data-resultado-final/kmeans_resultados_final.csv")
)

In [None]:
# Add an empty "country" column that will hold the country of the article's author,
# then bring the data into pandas.
columna_pais = lit("")
df_consultas = df_consultas.withColumn("country", columna_pais)
df_consultas_j = df_consultas.toPandas()

In [None]:
# The author's country is not part of the metadata, so it is looked up in the article's
# parsed JSON file when one exists. With several authors, the first one that declares a
# non-empty country is used.
import json

RAIZ_CORD19 = '../input/CORD-19-research-challenge/'
# Anything that can go wrong while reading or navigating one JSON file; such
# articles simply get an empty country.
_ERRORES_JSON = (OSError, ValueError, KeyError, TypeError, AttributeError)


def _primer_pais(rutas_json):
    """Return the first non-empty author country found in the article's JSON file.

    `rutas_json` is the value of the `pdf_json_files` column. Returns '' when the
    value is missing, the file cannot be read or parsed, or no author has a country.
    """
    if not isinstance(rutas_json, str) or not rutas_json.strip():
        return ''
    # NOTE(review): assumes that when several files are listed they are separated by ';'
    # and that the first one is representative — confirm against metadata.csv.
    primera_ruta = rutas_json.split(';')[0].strip()
    try:
        with open(RAIZ_CORD19 + primera_ruta, 'r') as miarchivo:
            autores = json.load(miarchivo)['metadata']['authors']
        for autor_json in autores:
            try:
                pais = autor_json['affiliation']['location']['country'].strip()
            except (KeyError, TypeError, AttributeError):
                continue  # this author has no usable affiliation; try the next one
            if pais != '':
                return pais  # stop at the FIRST author that declares a country
    except _ERRORES_JSON:
        pass
    return ''


# One assignment for the whole column; avoids chained indexing (df[col][row] = value).
df_consultas_j['country'] = [_primer_pais(rutas) for rutas in df_consultas_j['pdf_json_files']]

In [None]:
# Back to Spark, now with the country column filled in.
df_consultas = spark.createDataFrame(data=df_consultas_j)

In [None]:
# Add publish_Year, publish_Month and publish_Week (year, month and week of publication)
# plus a constant column "ones" used for counting in the SQL reports.
fecha_publicacion = to_date("publish_time")
df_consultas = (
    df_consultas
    .withColumn("publish_Year", year(fecha_publicacion))
    .withColumn("publish_Month", month(fecha_publicacion))
    .withColumn("publish_Week", weekofyear(fecha_publicacion))
    .withColumn("ones", lit(1))
)
# Register the view in a separate statement: createOrReplaceTempView returns None, so
# chaining it into the assignment would leave `df_consultas` bound to None.
df_consultas.createOrReplaceTempView("consultas")

In [None]:
# Report by publication type (systematic review, scoping review, ...).
# Each type is counted when any of its phrases appears in the abstract or in the title.
TIPOS_PUBLICACION = {
    'Systematic review': ['systematic review', 'systematic literature review'],
    'Meta-analysis': ['meta-analysis', 'metaanalysis'],
    'Scoping review': ['scoping review', 'scoping literature review'],
    'Randomised control trial': [
        'randomised control trial', 'randomized control trial',
        'randomised controlled trial', 'randomized controlled trial',
        'randomized clinical trial', 'randomised clinical trial',
    ],
    'Survey': ['survey'],
    'Case-control study': ['case-control study', 'case control study'],
    'Cohort study': ['cohort study'],
    'Case study': ['case study'],
}


def _sql_tipo_publicacion(etiqueta, frases):
    """Build the SELECT that counts the articles matching any phrase of one publication type."""
    condiciones = " or ".join(
        f"{columna} like '%{frase}%'"
        for columna in ("abstract", "title")
        for frase in frases
    )
    # Grouping by the constant label keeps types with no matching article out of the report.
    return (
        f"select '{etiqueta}' Tipo_de_Publicaciones,count(*) Cantidad from consultas "
        f"where {condiciones} group by '{etiqueta}'"
    )


# Build the query once so the displayed table and the CSV cannot drift apart.
sql_tipos = (
    " union all ".join(_sql_tipo_publicacion(etiqueta, frases)
                       for etiqueta, frases in TIPOS_PUBLICACION.items())
    + " order by Cantidad desc"
)
reporte_tipos = spark.sql(sql_tipos)
reporte_tipos.show()

# Save the report as CSV.
reporte_tipos.toPandas().to_csv('reporte_tipo_publicaciones.csv', sep="|", index=None)

In [None]:
# Statistical report grouped by publication day.
sql_dia = (
    "Select publish_time_new,count(ones) cantidad,mean(ones) media,"
    "stddev(ones) std,min(ones) min,percentile(ones,0.25) Q1_25,"
    "percentile(ones,0.5) Q2_50,percentile(ones,0.75) Q3_75,max(ones) max,"
    "sum(ones) suma from consultas group by publish_time_new order by publish_time_new"
)
reporte_dia = spark.sql(sql_dia)
reporte_dia.show()
# Save the report as CSV.
reporte_dia.toPandas().to_csv('reporte_dia_publicacion.csv', sep="|", index=None)

In [None]:
# Statistical report grouped by publication week.
# The inner query counts articles per day; the outer one summarizes those daily counts per week.
sql_semana = (
    "Select publish_Year,publish_Week,sum(ones) cantidad,round(mean(ones),2) media,"
    "round(stddev(ones),2) std,min(ones) min,round(percentile(ones,0.25),2) Q1_25,"
    "round(percentile(ones,0.5),2) Q2_50,round(percentile(ones,0.75),2) Q3_75,max(ones) max,"
    "sum(ones) suma from (Select publish_time_new,publish_Year,publish_Week,count(ones) "
    "ones from consultas group by publish_time_new,publish_Year,publish_Week) "
    "group by publish_Year,publish_Week order by publish_Year,int(publish_Week)"
)
reporte_semana = spark.sql(sql_semana)
reporte_semana.show()
# Save the report as CSV.
reporte_semana.toPandas().to_csv('reporte_semana_publicacion.csv', sep="|", index=None)

In [None]:
# Statistical report grouped by publication month.
# The inner query counts articles per day; the outer one summarizes those daily counts per month.
sql_mes = (
    "Select publish_Year,publish_Month,sum(ones) cantidad,round(mean(ones),2) media,"
    "round(stddev(ones),2) std,min(ones) min,round(percentile(ones,0.25),2) Q1_25,"
    "round(percentile(ones,0.5),2) Q2_50,round(percentile(ones,0.75),2) Q3_75,max(ones) max,"
    "sum(ones) suma from (Select publish_time_new,publish_Year,publish_Month,count(ones) "
    "ones from consultas group by publish_time_new,publish_Year,publish_Month) "
    "group by publish_Year,publish_Month order by publish_Year,int(publish_Month)"
)
reporte_mes = spark.sql(sql_mes)
reporte_mes.show()
# Save the report as CSV.
reporte_mes.toPandas().to_csv('reporte_mes_publicacion.csv', sep="|", index=None)

In [None]:
# Statistical report grouped by publication year.
# The inner query counts articles per day; the outer one summarizes those daily counts per year.
sql_anio = (
    "Select publish_Year,sum(ones) cantidad,round(mean(ones),2) media,"
    "round(stddev(ones),2) std,min(ones) min,round(percentile(ones,0.25),2) Q1_25,"
    "round(percentile(ones,0.5),2) Q2_50,round(percentile(ones,0.75),2) Q3_75,max(ones) max,"
    "sum(ones) suma from (Select publish_time_new,publish_Year,count(ones) "
    "ones from consultas group by publish_time_new,publish_Year) "
    "group by publish_Year order by publish_Year,int(publish_Year)"
)
reporte_anio = spark.sql(sql_anio)
reporte_anio.show()
# Save the report as CSV.
reporte_anio.toPandas().to_csv('reporte_anio_publicacion.csv', sep="|", index=None)

In [None]:
# Highest cluster number present in the data (the cluster count is this value + 1).
sql_max_cluster = "Select max(int(cluster)) cantidad from consultas"
cantidad_cluster = spark.sql(sql_max_cluster).toPandas()

In [None]:
# Monthly statistical report, shown separately for every cluster.
n_clusters = cantidad_cluster['cantidad'][0] + 1
for i in range(n_clusters):
    print(f"Cluster: {i}\n")
    sql_mes_cluster = (
        "Select publish_Year,publish_Month,sum(ones) cantidad,round(mean(ones),2) media,"
        "round(stddev(ones),2) std,min(ones) min,round(percentile(ones,0.25),2) Q1_25,"
        "round(percentile(ones,0.5),2) Q2_50,round(percentile(ones,0.75),2) Q3_75,max(ones) max,"
        "sum(ones) suma from (Select publish_time_new,publish_Year,publish_Month,count(ones) "
        f"ones from consultas where cluster={i} group by publish_time_new,publish_Year,publish_Month) "
        "group by publish_Year,publish_Month order by publish_Year,int(publish_Month)"
    )
    spark.sql(sql_mes_cluster).show()
# Save the report as CSV: one query for all clusters, with the cluster as a grouping column.
sql_cluster_csv = (
    "Select cluster,publish_Year,publish_Month,sum(ones) cantidad,round(mean(ones),2) media,"
    "round(stddev(ones),2) std,min(ones) min,round(percentile(ones,0.25),2) Q1_25,"
    "round(percentile(ones,0.5),2) Q2_50,round(percentile(ones,0.75),2) Q3_75,max(ones) max,"
    "sum(ones) suma from (Select publish_time_new,cluster,publish_Year,publish_Month,count(ones) "
    "ones from consultas group by publish_time_new,cluster,publish_Year,publish_Month) "
    "group by cluster,publish_Year,publish_Month order by cluster,publish_Year,int(publish_Month)"
)
spark.sql(sql_cluster_csv).toPandas().to_csv('reporte_cluster_publicacion.csv', sep="|", index=None)

In [None]:
# Author strings of every article that has one.
sql_autores = "Select authors from consultas where authors is not null"
autor = spark.sql(sql_autores).toPandas()

In [None]:
# An article can have several authors separated by ';': produce one row per author name
# and expose the result to Spark SQL as the view "consulta_autor".
lista_autor = [
    nombre.strip()
    for autores_articulo in autor['authors'].tolist()
    for nombre in autores_articulo.split(sep=';')
]
df_autor = pd.DataFrame(lista_autor, columns=["autor"])
sp_autor = spark.createDataFrame(df_autor)
sp_autor.createOrReplaceTempView("consulta_autor")

In [None]:
# Reports by journal and by author: totals and top 10 on screen, full rankings in CSV.
SQL_RANKING_JOURNAL = "Select journal,count(*) cantidad from consultas where journal is not null group by journal order by count(*) desc"
SQL_RANKING_AUTOR = "Select autor,count(*) cantidad from consulta_autor group by autor order by count(*) desc"

spark.sql("Select count(distinct journal) total_journal from consultas").show()
spark.sql(SQL_RANKING_JOURNAL + " limit 10").show()
spark.sql("Select count(distinct autor) total_autores from consulta_autor").show()
spark.sql(SQL_RANKING_AUTOR + " limit 10").show()
# Save the reports as CSV.
spark.sql(SQL_RANKING_JOURNAL).toPandas().to_csv('reporte_journal.csv', sep="|", index=None)
spark.sql(SQL_RANKING_AUTOR).toPandas().to_csv('reporte_autor.csv', sep="|", index=None)

In [None]:
# Top-10 journals and authors for every cluster; the author ranking is saved to one CSV per cluster.
for i in range(cantidad_cluster['cantidad'][0]+1):
    print("Cluster: "+str(i)+"\n")
    autor = spark.sql("Select authors from consultas where cluster="+str(i)+" and authors is not null").toPandas()
    # One row per author name (authors of an article are separated by ';').
    lista_autor = [j.strip() for a in autor['authors'].tolist() for j in a.split(sep=';')]
    df_autor = pd.DataFrame(lista_autor, columns=["autor"])
    # Explicit schema: Spark cannot infer column types from an empty frame, which is
    # what a cluster without any author produces.
    sp_autor = spark.createDataFrame(df_autor, schema="autor string")
    # A view of its own, so the notebook-wide "consulta_autor" view (all authors) is not
    # replaced by the authors of the last cluster.
    sp_autor.createOrReplaceTempView("consulta_autor_cluster")
    spark.sql("Select journal,count(*) cantidad from consultas where cluster="+str(i)+" and journal is not null group by journal order by count(*) desc limit 10").show()
    top_autores = spark.sql("Select autor,count(*) cantidad from consulta_autor_cluster group by autor order by count(*) desc limit 10")
    top_autores.show()
    top_autores.toPandas().to_csv('reporte_autor_cluster_'+str(i)+'.csv',sep="|",index=None)
# Journal ranking for all clusters in a single CSV.
spark.sql("Select cluster,journal,count(*) cantidad from consultas where journal is not null group by cluster,journal order by count(*) desc").toPandas().to_csv('reporte_journal_por_cluster.csv',sep="|",index=None)

In [None]:
# Report by country of the article's author: top 10 on screen, skipping unknown countries.
sql_top_paises = (
    "Select country,count(*) cantidad from consultas where country <>'No se encontro' "
    "and country <>'' group by country order by count(*) desc limit 10"
)
spark.sql(sql_top_paises).show()
# Save the report as CSV.
# NOTE(review): unlike the table above, this query does not filter out empty countries — confirm intended.
sql_paises_csv = (
    "Select country,count(*) cantidad from consultas "
    "group by country order by count(*) desc"
)
spark.sql(sql_paises_csv).toPandas().to_csv('reporte_paises.csv', sep="|", index=None)