In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory currently being visited with the file name.
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
import pandas as pd
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation warnings from the libraries used below — confirm
# that hiding them is intended.
warnings.filterwarnings('ignore')
# Load the metadata CSV of the CORD-19 research challenge into a DataFrame.
df=pd.read_csv('/kaggle/input/CORD-19-research-challenge/metadata.csv')

# EXPLORE AND MANIPULATE THE DATA

In [None]:
#first five rows
df.head()

In [None]:
#dataframe info
df.info()

In [None]:
#Dataframe describe
df.describe()

In [None]:
# Keep only the columns used in the rest of the analysis.
# .copy() makes the result an independent DataFrame, so the later in-place
# row drop and new-column assignments act on a real frame rather than on a
# slice of the full metadata table (the SettingWithCopy situation).
df=df[['publish_time','authors','title','abstract']].copy()
df.head()

In [None]:
df.info()

In [None]:
#check the total null cell for the column of abstract
df['abstract'].isnull().sum()

In [None]:
# Delete rows whose abstract is null.
# The result is reassigned instead of using inplace=True: `df` was created by
# column selection, and mutating such a slice in place is what pandas warns
# about. The trailing .copy() guarantees the columns added later are written
# to an independent frame.
df = df.dropna(subset=['abstract']).copy()
df.info()

Explore the text in the `abstract` column

In [None]:
# Word count of each abstract.
# str.split() without a separator splits on any run of whitespace, so
# repeated spaces, tabs and newlines do not inflate the count (splitting on a
# single " " yields empty strings for every extra space). This also matches
# the whitespace split used for the word-frequency tables below.
df['word_count'] = df['abstract'].astype(str).str.split().str.len()
df.head()

In [None]:
#Descriptive statistics of word counts
df.describe()

In [None]:
# 20 most common whitespace-separated tokens across all raw abstracts.
# value_counts() sorts by descending frequency, so the head is the top 20.
freq = pd.Series(' '.join(df['abstract']).split()).value_counts().head(20)
freq

In [None]:
# Plot the 20 most common words.
# The words are unordered categories, so bars are used: a line plot would
# connect unrelated words and suggest a trend that does not exist.
freq.plot(kind='bar')

In [None]:
# 20 least common whitespace-separated tokens across all raw abstracts.
# value_counts() sorts by descending frequency, so the tail holds the rarest.
freq1 = pd.Series(' '.join(df['abstract']).split()).value_counts().tail(20)
freq1

# TEXT PRE-PROCESSING
## Steps:
- Text clean up
- Shrinking the vocabulary to retain only the relevant/important words
- Reduce sparsity  

**Normalize the data**: stemming and lemmatization

In [None]:
#Import the required libraries for the text processing
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet') 
from nltk.stem.wordnet import WordNetLemmatizer

In [None]:
# Stop-word setup.
# Custom words to remove from the text in addition to NLTK's English list.
new_words = ["using", "show", "result", "also", "iv", "one", 'however',"two", "new", "previously", "shown"]

# Final stop-word set: NLTK English stop words plus the custom words above.
stop_words = set(stopwords.words("english")) | set(new_words)

In [None]:
# Build the cleaned, normalised corpus: one lower-cased, lemmatised,
# stop-word-free string per abstract, in DataFrame row order.
#
# Compared with the earlier version of this cell:
# - the lemmatizer is created once instead of once per abstract, and the
#   PorterStemmer that was instantiated but never used is gone;
# - the "remove tags" and "remove digits/special characters" substitutions
#   are dropped: they ran after every non-letter had already been replaced by
#   a space, so they could never match anything except whitespace, which
#   split() collapses anyway;
# - abstracts are iterated directly instead of looked up by index label.
lem = WordNetLemmatizer()
corpus = []
for abstract in df['abstract']:
    # Replace everything that is not an ASCII letter (punctuation, digits,
    # markup characters) with a space, then lowercase.
    text = re.sub('[^a-zA-Z]', ' ', abstract).lower()
    # Whitespace-split into tokens, drop stop words, lemmatise the rest.
    tokens = [lem.lemmatize(word) for word in text.split() if word not in stop_words]
    corpus.append(" ".join(tokens))

In [None]:
#View corpus item
corpus[1000]

Explore and visualize the corpus

In [None]:
#Word cloud: Vizualize the corpus (frequency or the importance of each word)
#from os import path
#from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
%matplotlib inline
%matplotlib inline
# Generate the word cloud from the corpus joined into one space-separated
# text. Passing str(corpus) would feed the *repr* of the Python list to the
# tokenizer, so quotes, commas and brackets between documents become part of
# the text being tokenized.
wordcloud = WordCloud(
    background_color='white',
    stopwords=stop_words,
    max_words=100,
    max_font_size=70,
    random_state=42,  # fixed seed so the layout is reproducible
).generate(' '.join(corpus))

# Render the cloud as an image without axes.
fig = plt.figure(1, figsize=(20, 10))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
# 20 most common words in the cleaned corpus.
# value_counts() sorts by descending frequency, so the head is the top 20.
freq = pd.Series(' '.join(corpus).split()).value_counts().head(20)
freq

In [None]:
# Bar plot of the 20 most frequent words in the cleaned corpus.
import seaborn as sns

# Frequency table turned into a two-column (Word, Freq) frame for seaborn.
top_words = pd.Series(' '.join(corpus).split()).value_counts().head(20)
top_df = top_words.rename_axis("Word").reset_index(name="Freq")

sns.set(rc={'figure.figsize':(13,8)})
g = sns.barplot(data=top_df, x="Word", y="Freq")
g.set_title('Top 20 words in the corpus')
# Tilt the word labels so they do not overlap.
g.set_xticklabels(g.get_xticklabels(), rotation=30)


# TEXT PREPARATION

Text in the corpus needs to be converted to a format that can be interpreted by the machine learning algorithms. There are 2 parts of this conversion — Tokenisation and Vectorisation.

Tokenisation is the process of converting the continuous text into a list of words. The list of words is then converted to a matrix of integers by the process of vectorisation. Vectorisation is also called feature extraction.

For text preparation we use the bag of words model which ignores the sequence of the words and only considers word frequencies.

**Vectorization**

As the first step of the conversion, we use `CountVectorizer` to tokenise the text and build a vocabulary of known words. We first create a variable `cv` of the `CountVectorizer` class, and then invoke its `fit_transform` method to learn the vocabulary and build the word-count matrix.


In [None]:
#Creating a vector of word counts
from sklearn.feature_extraction.text import CountVectorizer
import re
# Count matrix over uni-, bi- and tri-grams.
# - max_df=0.8 ignores terms that appear in more than 80% of the abstracts.
# - max_features=10000 keeps only the 10,000 most frequent terms.
# - stop_words is passed as a list: recent scikit-learn releases validate this
#   parameter and accept only 'english', a list or None, so the `stop_words`
#   set built earlier would be rejected. sorted() also fixes the order.
cv=CountVectorizer(max_df=0.8,stop_words=sorted(stop_words), max_features=10000, ngram_range=(1,3))
X=cv.fit_transform(corpus)

In [None]:
#shape of X
X.shape

In [None]:
#print a list of 10 vocabulary from the list of vocabulary
list(cv.vocabulary_.keys())[:10]

Visualize top N uni-grams, bi-grams, tri-grams and 4-grams

In [None]:
#Uni-grams
    #Most frequently occuring words
def get_top_unigram_words(corpus, n=None):
    """Return the `n` most frequent single words in `corpus`.

    A default CountVectorizer is fitted on `corpus`, each vocabulary term's
    count is summed over all documents, and the (word, total_count) pairs are
    returned sorted by count, highest first. With n=None every term is
    returned.
    """
    vectorizer = CountVectorizer()
    counts = vectorizer.fit(corpus).transform(corpus)
    totals = counts.sum(axis=0)  # one row holding the per-term column sums
    ranked = sorted(
        ((term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]
    #Convert most freq words to dataframe for plotting bar plot
# Top 20 uni-grams as a (Word, Freq) DataFrame for plotting.
top_words = get_top_unigram_words(corpus, n=20)
top_df = pd.DataFrame(top_words, columns=["Word", "Freq"])

# Bar plot of the most frequent words.
import seaborn as sns
sns.set(rc={'figure.figsize':(13,8)})
g = sns.barplot(data=top_df, x="Word", y="Freq")
g.set_title('Top 20 Uni_grams')
# Tilt the labels so they do not overlap.
g.set_xticklabels(g.get_xticklabels(), rotation=30)

In [None]:
#Bi_grams
    #Most frequently occuring Bi-grams
def get_top_bi_grams_words(corpus, n=None):
    """Return the `n` most frequent bi-grams (two-word sequences) in `corpus`.

    The vectorizer is limited to max_features=4000 bi-grams. The result is a
    list of (bi-gram, total_count) tuples sorted by count, highest first.
    With n=None every retained bi-gram is returned.
    """
    vec1 = CountVectorizer(ngram_range=(2,2), max_features=4000)
    vec1.fit(corpus)
    # Sum each bi-gram's column over all documents.
    column_totals = vec1.transform(corpus).sum(axis=0)
    words_freq = []
    for phrase, column in vec1.vocabulary_.items():
        words_freq.append((phrase, column_totals[0, column]))
    words_freq.sort(key=lambda item: item[1], reverse=True)
    return words_freq[:n]
# Top 20 bi-grams as a (Bi-gram, Freq) DataFrame for plotting.
top2_words = get_top_bi_grams_words(corpus, n=20)
top2_df = pd.DataFrame(top2_words, columns=["Bi-gram", "Freq"])

# Bar plot of the most frequent bi-grams.
import seaborn as sns
sns.set(rc={'figure.figsize':(13,8)})
h = sns.barplot(data=top2_df, x="Bi-gram", y="Freq")
h.set_title('Top 20 Bi_grams')
# Two-word labels are long; tilt them so they stay readable.
h.set_xticklabels(h.get_xticklabels(), rotation=45)

In [None]:
#Tri_Grams
    #Most frequently occuring Tri-grams
def get_top_n3_words(corpus, n=None):
    """Return the `n` most frequent tri-grams (three-word sequences) in `corpus`.

    The vectorizer is limited to max_features=4000 tri-grams. The result is a
    list of (tri-gram, total_count) tuples sorted by count, highest first.
    With n=None every retained tri-gram is returned.
    """
    vec1 = CountVectorizer(ngram_range=(3,3), max_features=4000).fit(corpus)
    # Per-tri-gram totals over all documents.
    totals = vec1.transform(corpus).sum(axis=0)
    pairs = [(gram, totals[0, position]) for gram, position in vec1.vocabulary_.items()]
    pairs.sort(key=lambda entry: entry[1], reverse=True)
    return pairs[:n]
# Top 20 tri-grams as a (Tri-gram, Freq) DataFrame; print the table as well.
top3_words = get_top_n3_words(corpus, n=20)
top3_df = pd.DataFrame(top3_words, columns=["Tri-gram", "Freq"])
print(top3_df)

# Bar plot of the most frequent tri-grams.
import seaborn as sns
sns.set(rc={'figure.figsize':(13,8)})
j = sns.barplot(data=top3_df, x="Tri-gram", y="Freq")
j.set_title('Top 20 Tri_grams')
# Three-word labels are long; tilt them so they stay readable.
j.set_xticklabels(j.get_xticklabels(), rotation=45)

In [None]:
#4_Grams
    #Most frequently occuring 4-grams
def get_top_n4_words(corpus, n=None):
    """Return the `n` most frequent 4-grams (four-word sequences) in `corpus`.

    The vectorizer is limited to max_features=4000 4-grams. The result is a
    list of (4-gram, total_count) tuples sorted by count, highest first.
    With n=None every retained 4-gram is returned.
    """
    # ngram_range must be (4, 4): with (3, 3) this function silently returned
    # tri-grams, duplicating get_top_n3_words.
    vec1 = CountVectorizer(ngram_range=(4,4), max_features=4000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    # Sum each 4-gram's column over all documents.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec1.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
# Top 20 4-grams as a (4-gram, Freq) DataFrame; print the table as well.
# Dedicated top4_* names are used so the tri-gram results (top3_words /
# top3_df) are no longer overwritten by this cell.
top4_words = get_top_n4_words(corpus, n=20)
top4_df = pd.DataFrame(top4_words)
top4_df.columns=["4-gram", "Freq"]
print(top4_df)

# Bar plot of the most frequent 4-grams.
import seaborn as sns
sns.set(rc={'figure.figsize':(13,8)})
l=sns.barplot(x="4-gram", y="Freq", data=top4_df)
l.set_title('Top 20 4_grams')
# Rotate this plot's own tick labels. Taking them from `j` (the tri-gram
# axes) would label the 4-gram bars with tri-gram text.
l.set_xticklabels(l.get_xticklabels(), rotation=45)

In [None]:
#Converting to a matrix of integers
from sklearn.feature_extraction.text import TfidfTransformer
 
# Learn the IDF weights from the word-count matrix X.
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(X)

# Get the feature names; position i corresponds to column i of X.
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out(); fall back to the old method only when
# the new one is not available. The array is converted to a plain list of
# Python strings so the keyword dictionaries built from it display cleanly.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out().tolist()
else:
    feature_names = cv.get_feature_names()

# **KEYWORDS EXTRACTION FOR EACH ABSTRACT OF THE CORPUS**

In [None]:
# Define Function for sorting tf_idf in descending order

from scipy.sparse import coo_matrix
def sort_coo(coo_matrix):
    """Return the (column, value) pairs of a COO matrix, highest value first.

    Reads the matrix's `col` and `data` attributes, pairs them element-wise
    and sorts the pairs by value in descending order; ties on value are
    broken by the larger column index first.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first `topn` entries of `sorted_items` to {feature name: score}.

    `sorted_items` is a sequence of (feature_index, score) pairs, expected to
    be already sorted by descending score. Each index is looked up in
    `feature_names` and each score is rounded to 3 decimals. The returned
    dict preserves the order of `sorted_items`.
    """
    return {
        feature_names[idx]: round(score, 3)
        for idx, score in sorted_items[:topn]
    }

**Example**: Extract the keywords for abstract number 335 in the corpus

In [None]:
# Extract the keywords for the abstract at position 335 of the corpus.
abstract_335=corpus[335]
    # Generate the tf-idf vector for this single document: word counts with
    # the fitted vectorizer `cv`, then tf-idf weights with the fitted transformer.
tf_idf_vector_abstract_335=tfidf_transformer.transform(cv.transform([abstract_335]))
# Sort the (feature index, tf-idf score) pairs by descending score.

sorted_items=sort_coo(tf_idf_vector_abstract_335.tocoo())

# Keep only the top n entries as a {term: score} dict; n here is 5.
keywords=extract_topn_from_vector(feature_names,sorted_items,5)
    
 
# Print the cleaned abstract followed by its keywords and their scores.
print("\nAbstract 335:")
print(abstract_335)
print("\nKeywords:")
for k in keywords:
    print(k,keywords[k])

In [None]:
# tf-idf matrix for the whole corpus (one row per abstract).
tf_idf_vector_corpus=tfidf_transformer.transform(cv.transform(corpus))

# For every row: sort its (feature index, score) pairs by descending score
# and keep the 5 best as a {term: score} dict.
keywords=[
    extract_topn_from_vector(feature_names, sort_coo(row.tocoo()), 5)
    for row in tf_idf_vector_corpus
]

In [None]:
# Show full cell contents and all columns when displaying DataFrames.
pd.set_option('display.width', None)
# None means "do not truncate". The old -1 sentinel was deprecated in
# pandas 1.0 and is rejected by later versions.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

# Add the keywords of each abstract to the DataFrame. `keywords` is a list in
# corpus order, which is the row order of df.
df['keywords']=keywords
# `columns=` already names the axis, so the redundant axis=1 is dropped.
df1=df.drop(columns='word_count')
df1.head()

# **CORPUS CLUSTERING**

In [None]:
#Use the MiniBatchKMeans algorithm to cluster the abstracts
    #Import the required libraries
import sklearn
import matplotlib.pyplot as plt
from sklearn.cluster import MiniBatchKMeans

In [None]:
# Cluster the abstracts on their tf-idf vectors.
X1=tf_idf_vector_corpus

# Number of clusters.
k = 10

# random_state is fixed so the cluster assignments (and therefore the plots
# and the per-cluster sizes below) are the same on every run; without it
# MiniBatchKMeans starts from a different random initialisation each time.
kmeans = MiniBatchKMeans(n_clusters=k, random_state=42)
y_pred = kmeans.fit_predict(X1)
y=y_pred

**Visualize the clusters**

Use Principal Component Analysis (PCA) to decompose the data and project it onto a lower-dimensional space.

In [None]:
from sklearn.decomposition import PCA

# Project the tf-idf vectors onto their first 3 principal components.
# random_state is fixed because PCA may pick a randomized SVD solver for
# large inputs, which would otherwise give slightly different coordinates
# on every run.
pca = PCA(n_components=3, random_state=42)
# NOTE(review): .toarray() materialises the whole sparse matrix as a dense
# array (documents x features) — watch memory on the full dataset.
pca_result = pca.fit_transform(X1.toarray())

In [None]:
# Visualize the clusters on the first two principal components.
# sns settings
sns.set(rc={'figure.figsize':(15,15)})
# One colour per cluster label.
palette = sns.color_palette("bright", len(set(y)))
# x and y are passed by keyword: seaborn deprecated positional data arguments
# in 0.11 and rejects them from 0.12 on.
sns.scatterplot(x=pca_result[:,0], y=pca_result[:,1], hue=y, legend='full', palette=palette)
plt.title("Covid-19 Abstracts - Clustered (K-Means)")
plt.show()

In [None]:
#vizualize in 3D
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Create the 3D axes with add_subplot: passing keyword arguments to
# Figure.gca() (gca(projection='3d')) was deprecated in matplotlib 3.4 and
# removed in 3.6.
fig = plt.figure(figsize=(16,10))
ax = fig.add_subplot(111, projection='3d')
# One point per abstract, coloured by its cluster label.
ax.scatter(
    xs=pca_result[:,0],
    ys=pca_result[:,1],
    zs=pca_result[:,2],
    c=y,
    cmap='tab10'
)
ax.set_xlabel('PCA_1')
ax.set_ylabel('PCA_2')
ax.set_zlabel('PCA_3')
plt.title("Covid-19 Abstracts - Clustered (K-Means)")
plt.show()

We got pretty good results!

Add the cluster label of each abstract to the DataFrame

In [None]:
df1['cluster']=y
df1.head()

In [None]:
# Number of abstracts in each cluster.
# GroupBy.size() is the built-in for this: it avoids calling a Python function
# per group and the deprecation noise newer pandas emits for
# groupby(...).apply on frames that still contain the grouping column.
df1.groupby('cluster').size()