# Bài tập Dữ liệu lớn

### Giảng viên hướng dẫn
PGS.TS Thoại Nam


### Thành viên nhóm:
+ Trần Thế Huy (1770021)
+ Trần Minh Quốc (1870322)
+ Lê Văn Duẫn (1870387)

## Tổng quan

Khi tình hình dịch bệnh Covid-19 diễn biến phức tạp, số lượng tài liệu nghiên cứu về dịch bệnh cũng ngày càng nhiều, gây khó khăn cho việc tìm kiếm thông tin. Vì vậy, việc gom nhóm các tài liệu có liên quan nhằm giúp các nhà nghiên cứu dễ dàng truy xuất thông tin, tránh lãng phí thời gian và công sức, trở nên vô cùng cần thiết.

Phương pháp giải quyết bài toán của nhóm sử dụng các thuật toán chính:
+ Tiền xử lý dữ liệu bằng cách loại bỏ stopwords
+ Vector hóa tài liệu bằng TF-IDF
+ Giảm số chiều feature với PCA
+ Gom cụm tài liệu với K-Means

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import json
import glob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib.pyplot as plt

# Read Meta

In [None]:
# Load the dataset's metadata table. The identifier-like columns are read as
# strings instead of letting pandas infer a dtype for them.
meta_df = pd.read_csv(
    '../input/CORD-19-research-challenge/metadata.csv',
    dtype=dict.fromkeys(['pubmed_id', 'Microsoft Academic Paper ID', 'doi'], str),
)
meta_df.info()

## Read Json
+ Concatenate text segments in each json into a long string
+ Extract features


In [None]:
# Collect the path of every parsed-PDF JSON file in the dataset directory.
# NOTE: the pattern contains no '**' component, so recursive=True has no
# effect here; only files directly inside pdf_json/ are matched.
all_json = glob.glob('../input/CORD-19-research-challenge/document_parses/pdf_json/*.json', recursive=True)
len(all_json)  # number of files found (shown as the cell output)

In [None]:
class FileReader:
    """Load one paper JSON file and flatten its text sections.

    Attributes set by the constructor:
        paper_id: value of the top-level ``paper_id`` key.
        abstract: ``text`` of every ``abstract`` entry, joined with newlines.
        body_text: ``text`` of every ``body_text`` entry, joined with newlines.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if the file is not valid UTF-8 encoded JSON.
        KeyError: if one of the expected keys is missing.
    """

    def __init__(self, path):
        # JSON is UTF-8 encoded; do not rely on the platform's default
        # encoding, which would garble or reject non-ASCII text on some systems.
        with open(path, encoding='utf-8') as file:
            content = json.load(file)

        # The file is fully parsed at this point, so it can already be closed
        # while the text sections are assembled.
        self.paper_id = content['paper_id']
        self.abstract = '\n'.join(part['text'] for part in content['abstract'])
        self.body_text = '\n'.join(part['text'] for part in content['body_text'])

    def __repr__(self):
        # Show only the first 200 characters of each section to keep it short.
        return f'{self.paper_id}: {self.abstract[:200]}... {self.body_text[:200]}...'
    

# Sanity check: parse the first file and display its repr as the cell output.
FileReader(all_json[0])

In [None]:
# Column-oriented accumulator: one list per DataFrame column.
dict_ = {'paper_id': [], 'abstract': [], 'body_text': []}

# Only the first 20000 files are parsed.
all_json = all_json[:20000]

# Report progress roughly every 10% of the files. max(1, ...) prevents a
# modulo-by-zero when fewer than 10 files are available.
progress_step = max(1, len(all_json) // 10)

for idx, entry in enumerate(all_json):
    if idx % progress_step == 0:
        print(f'Processing index: {idx} of {len(all_json)}')

    try:
        content = FileReader(entry)
    except Exception:
        # Deliberately broad and best-effort: any file that cannot be read
        # or lacks the expected keys is skipped.
        continue  # invalid paper format, skip

    dict_['abstract'].append(content.abstract)
    dict_['paper_id'].append(content.paper_id)
    dict_['body_text'].append(content.body_text)

df_covid = pd.DataFrame(dict_, columns=['paper_id', 'abstract', 'body_text'])
df_covid.head()

## Remove duplicates


In [None]:
# Per-document size statistics, all based on whitespace-separated tokens.
# str.split() without arguments already ignores leading/trailing whitespace.
df_covid['abstract_word_count'] = df_covid['abstract'].map(lambda text: len(text.split()))  # words in abstract
df_covid['body_word_count'] = df_covid['body_text'].map(lambda text: len(text.split()))  # words in body
df_covid['body_unique_words'] = df_covid['body_text'].map(lambda text: len({*str(text).split()}))  # distinct words in body
df_covid.head()

In [None]:
# Print column dtypes and non-null counts of the parsed papers.
df_covid.info()

In [None]:
# Summary of the abstract column before duplicates are removed
# (compare with the same summary after drop_duplicates).
df_covid['abstract'].describe(include='all')

In [None]:
# Keep one row per distinct (abstract, body_text) pair, then summarise the
# abstract column again to see the effect.
df_covid = df_covid.drop_duplicates(subset=['abstract', 'body_text'])
df_covid['abstract'].describe(include='all')

In [None]:
# Summary of the body_text column after duplicates were removed.
df_covid['body_text'].describe(include='all')

## Drop nulls

In [None]:
# Descriptive statistics of the numeric (word-count) columns.
df_covid.describe()


In [None]:
# Continue with a reproducible random subset of at most 10000 papers.
# The size is capped at the number of available rows because
# DataFrame.sample raises ValueError when asked for more rows than exist.
sample_size = min(10000, len(df_covid))
df = df_covid.sample(sample_size, random_state=0)
df.info()
# The full frame is no longer needed; drop the reference to free memory.
del df_covid

In [None]:
# Discard every row that contains a missing value, then re-inspect the frame.
df = df.dropna()
df.info()

## Language Detection

Remove documents that are not in English


In [None]:
# Install langdetect from within the notebook. capture_output() swallows the
# pip log so it does not clutter the cell output.
# NOTE: this `io` is IPython.utils.io and shadows the standard-library `io`
# module name for the rest of the notebook.
from IPython.utils import io
with io.capture_output() as captured:
    !pip install langdetect

In [None]:
from tqdm import tqdm
from langdetect import detect
from langdetect import DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# langdetect is non-deterministic unless its factory is seeded.
DetectorFactory.seed = 0


def _detect_language(body_text, abstract):
    """Return the language code of a paper, or "unknown" if detection fails.

    Detection is attempted on progressively different inputs; each later
    input is tried only when langdetect rejects the previous one:

    1. the first 50 space-separated words of ``body_text``,
    2. the set of all distinct words of ``body_text``,
    3. the ``abstract``.
    """
    words = body_text.split(" ")

    try:
        # Slicing handles bodies shorter than 50 words as well.
        return detect(" ".join(words[:50]))
    except LangDetectException:
        pass  # beginning of the document was not in a usable format

    try:
        return detect(" ".join(set(words)))
    except LangDetectException:
        pass  # body gave nothing usable; fall back to the abstract

    try:
        # The frame has no 'abstract_summary' column; the abstract itself is
        # the text available for this last-resort attempt.
        return detect(abstract)
    except LangDetectException:
        return "unknown"


# One language label per row, in row order.
languages = [
    _detect_language(body_text, abstract)
    for body_text, abstract in tqdm(zip(df['body_text'], df['abstract']), total=len(df))
]

In [None]:
from collections import Counter
from pprint import pprint

# Tally the documents per detected language in a single pass over the labels
# (instead of one full list.count() scan per distinct language).
languages_dict = dict(Counter(languages))

print("Total: {}\n".format(len(languages)))
pprint(languages_dict)

In [None]:
# Attach the detected language label to each row.
df['language'] = languages
# Bar chart with one bar per language: bars are placed at integer positions
# and the tick labels are then replaced by the language codes.
plt.bar(range(len(languages_dict)), list(languages_dict.values()), align='center')
plt.xticks(range(len(languages_dict)), list(languages_dict.keys()))
plt.title("Distribution of Languages in Dataset")
plt.show()

In [None]:
# Keep only the papers detected as English. The explicit copy makes the result
# an independent frame, so adding columns to it later does not trigger
# pandas' SettingWithCopyWarning.
df = df[df['language'] == 'en'].copy()
df.info()

## Remove stopwords



In [None]:
# Download the spacy bio parser

# Install the en_core_sci_lg model (v0.2.4) from its release tarball.
# capture_output() swallows the pip log so it does not clutter the cell output.
from IPython.utils import io
with io.capture_output() as captured:
    !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_lg-0.2.4.tar.gz

In [None]:
#NLP 
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import en_core_sci_lg  # model downloaded in previous step

In [None]:
import string

# String of ASCII punctuation characters.
punctuations = string.punctuation
# spaCy's English stop words as a mutable list so custom entries can be appended.
stopwords = list(STOP_WORDS)
stopwords[:10]  # preview a few entries (shown as the cell output)

In [None]:
# Extra words to ignore in addition to spaCy's stop-word list.
custom_stop_words = [
    'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure', 
    'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 
    'al.', 'Elsevier', 'PMC', 'CZI', 'www'
]

# spacy_tokenizer lower-cases every token before the stop-word check, so the
# entries must be stored lower-cased too; otherwise capitalised ones such as
# 'Elsevier' or 'PMC' could never match.
for w in custom_stop_words:
    w = w.lower()
    if w not in stopwords:
        stopwords.append(w)

In [None]:
# Parser
parser = en_core_sci_lg.load(disable=["tagger", "ner"])
parser.max_length = 7000000

def spacy_tokenizer(sentence):
    mytokens = parser(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ]
    mytokens = " ".join([i for i in mytokens])
    return mytokens

In [None]:
# Register tqdm's progress_apply on pandas objects, then clean every body text
# with spacy_tokenizer while showing a progress bar.
tqdm.pandas()
df["processed_text"] = df["body_text"].progress_apply(spacy_tokenizer)

In [None]:
import seaborn as sns
# Distribution of the number of words per paper body, plus summary statistics.
# NOTE(review): seaborn deprecated distplot in v0.11 in favour of
# histplot/displot; confirm against the installed seaborn version.
sns.distplot(df['body_word_count'])
df['body_word_count'].describe()

In [None]:
# Distribution of the number of distinct words per paper body, plus summary
# statistics.
# NOTE(review): seaborn deprecated distplot in v0.11 in favour of
# histplot/displot; confirm against the installed seaborn version.
sns.distplot(df['body_unique_words'])
df['body_unique_words'].describe()

In [None]:
# df["processed_text"] = df["body_text"]

## Vectorization

Use TF-IDF to turn each document into a vector of word-importance weights.

In [None]:
def vectorize(text, maxx_features):
    """Fit a TF-IDF vectorizer on `text` and return the transformed matrix.

    `maxx_features` is passed to TfidfVectorizer as `max_features`, i.e. the
    upper bound on the vocabulary size.
    """
    return TfidfVectorizer(max_features=maxx_features).fit_transform(text)


In [None]:
# Vectorize the cleaned texts with a vocabulary capped at 2 ** 12 = 4096 terms.
text = df['processed_text'].values
X = vectorize(text, 2 ** 12)
X.shape  # shown as the cell output

## PCA and K-Means


In [None]:
from sklearn.decomposition import PCA

# A float n_components in (0, 1) asks PCA to keep as many components as are
# needed to explain that fraction (here 95%) of the variance.
pca = PCA(n_components=0.95, random_state=42)
# X is converted to a dense array before being passed to PCA.
X_reduced= pca.fit_transform(X.toarray())
X_reduced.shape  # shown as the cell output

In [None]:
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.spatial.distance import cdist

# Run k-means for a range of cluster counts and record the distortion of each
# fit for the elbow plot.
distortions = []
K = range(2, 50)
for k in K:
    # Fit exactly once per k; a second fit with the same random_state only
    # repeats the same work.
    k_means = KMeans(n_clusters=k, random_state=42).fit(X_reduced)
    # Distortion: mean Euclidean distance from each sample to its nearest
    # cluster centre, reduced with NumPy instead of a Python-level sum.
    nearest_distance = cdist(X_reduced, k_means.cluster_centers_, 'euclidean').min(axis=1)
    distortions.append(nearest_distance.sum() / X.shape[0])
    #print('Found distortion for {} clusters'.format(k))

In [None]:
# End points of a straight reference line from the first to the last
# (k, distortion) pair.
X_line = [K[0], K[-1]]
Y_line = [distortions[0], distortions[-1]]

# Plot the elbow: the distortion curve in blue, the reference line in red.
plt.plot(K, distortions, 'b-')
plt.plot(X_line, Y_line, 'r')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()