In [136]:
# Load Library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib.cm import rainbow
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')


In [137]:
import time
import numpy as np
import pandas as pd
import pycountry as pc
from datetime import datetime, timedelta
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
# Today's date as zero-padded strings; `day` and `month` are always str
# (previously they were int for values above 9 and str otherwise).
today = datetime.today()
day = f'{today.day:02d}'
month = f'{today.month:02d}'
# Formatted as DD/MM/YYYY
today_str = f'{day}/{month}/{today.year}'

# Fetch data: the abstracts dataset is a tab-separated file
DATA_PATH = "../input/monkeypox-scientific-literature/monkeypox_abstracts_dataset.tsv"
df = pd.read_csv(DATA_PATH, sep='\t')


# Initial Dataset Check

In [138]:
# Preview the first rows of the loaded table
df.head()

In [139]:
# Column names, non-null counts and dtypes
df.info()


In [140]:
# Visual null check: each cell of the heatmap marks whether that value is missing
import seaborn as sns
null_mask = df.isnull()
sns.heatmap(null_mask, yticklabels=False, cbar=False, cmap='viridis')


* No null value

In [141]:
# Number of missing values in each column
df.isnull().sum()

In [142]:
# Duplicate rows: absolute count and share of the whole dataset
n_duplicates = df.duplicated().sum()
print(f'Duplicates in the dataset: {n_duplicates}')
print(f'Percentage of duplicates: {n_duplicates/len(df)*100}%')

In [143]:
# Cardinality: number of distinct values in each column
df.nunique()

In [144]:
# Data type of each column
df.dtypes

# Target Distribution of the Dataset

In [145]:
# Pie chart of how the records split across publication types
plt.figure(figsize=(30, 30))
publication_counts = df['PublicationType'].value_counts()
pie_ax = publication_counts.plot.pie(autopct='%1.1f%%', textprops={'fontsize': 12})
pie_ax.set_title("Target distribution")


In [146]:
# Number of records for each publication type
df['PublicationType'].value_counts()

In [147]:
# Data description: summary statistics, transposed so each described column
# becomes a row, rendered with a colour gradient
df.describe().T.style.background_gradient()

In [148]:
# Density plot per column, each in its own subplot on a 4x3 grid with
# independent x and y axes
# NOTE(review): density plots need numeric data — confirm which columns of df qualify
df.plot(kind='density', subplots=True, layout=(4,3), sharex=False, 
                     sharey=False,fontsize=12, figsize=(20,10))


# Study The Dataset Column Contribution

In [149]:
# Study the data: one 25-bin histogram per plottable column
sns.set_context('poster', font_scale=0.5)
hist_kwargs = dict(bins=25, grid=False, figsize=(25, 18), color='#86bf91', zorder=2, rwidth=0.9)
df.hist(**hist_kwargs)
plt.show()

In [150]:
# Close the current figure before drawing the pair grid
plt.close()
sns.set_style('whitegrid')
# Pairwise plots of the columns, coloured by publication type;
# the trailing ';' suppresses the returned object's repr
sns.pairplot(df,hue='PublicationType',height=10);
plt.show()

# Use of NLP methods on Target columns to get Information 

In [151]:
# important Library for NLP
import warnings
warnings.filterwarnings('ignore')
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
from sklearn.cluster import DBSCAN
import scipy
from scipy.cluster import hierarchy
from sklearn.cluster import AgglomerativeClustering

In [152]:
print("Non-null Value")
# dropna(how='any') already removes rows with a missing Title, so a separate
# Title filter is redundant.
# The index is reset so that labels equal positions: later cells read rows as
# df['Abstract'][i] for i in range(...) and pass groupby(...).groups labels to
# .iloc, both of which break (KeyError / wrong row) once dropped rows leave
# gaps in the index.
df = df.dropna(how='any').reset_index(drop=True)
df

# Target NLP Column ['Title']

In [153]:

# Download the NLTK stop-word corpus, then build the English Snowball stemmer
# and the English stop-word set used when cleaning titles
nltk.download('stopwords')
sno = nltk.stem.SnowballStemmer('english')
stop=set(stopwords.words('english'))
def cleanpunc(sentence):
    """Strip or blank out punctuation in ``sentence`` and return the result.

    The characters ? ! ' " # and | are deleted outright; . , ) ( and / are
    each replaced by one space. Inside a character class ``|`` is a literal
    pipe rather than alternation, which is why pipes are deleted too, and
    ``\\|`` is just an escaped pipe, so a backslash is left untouched.
    """
    without_marks = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    return re.sub(r'[.|,|)|(|\|/]', r' ', without_marks)
# Sanity check: show the stop-word set and the stem produced for a sample word
print(stop)
print('************************************')
print(sno.stem('japan'))

In [154]:
# Normalise every title: strip punctuation, keep alphabetic tokens longer than
# two characters that are not stop words, stem them and re-join with spaces.
# Unused leftovers (i, str1, s, all_positive_words, all_negative_words) and the
# no-op `else: continue` branches are gone; the bitwise `&` between booleans is
# now a short-circuiting `and`.
final_string = []
for sent in df['Title'].values:
    filtered_sentence = []
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            token = cleaned_words.lower()
            if cleaned_words.isalpha() and len(cleaned_words) > 2 and token not in stop:
                filtered_sentence.append(sno.stem(token).encode('utf8'))
    # Stored as UTF-8 bytes because the following cell decodes the column
    # with .str.decode("utf-8")
    final_string.append(b" ".join(filtered_sentence))


In [155]:
# Store the cleaned titles as text: the byte strings are aligned to df's rows
# by position and decoded from UTF-8 in one step
df['Title2'] = pd.Series(final_string, index=df.index).str.decode("utf-8")
df['Title2']


# Clustering on Title column, 1)-KMeans by Bag of Words(BOW)

In [156]:
# Bag-of-words matrix of the cleaned titles (one row per title).
# The bare `bow.shape` / `bow` expressions that used to sit mid-cell were
# dropped: only a cell's last expression is displayed, so they did nothing.
count_vect = CountVectorizer()
bow = count_vect.fit_transform(df['Title2'].values)

# Vocabulary terms in column order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so use get_feature_names_out() when the
# installed version has it and fall back only on older releases.
if hasattr(count_vect, 'get_feature_names_out'):
    terms = count_vect.get_feature_names_out().tolist()
else:
    terms = count_vect.get_feature_names()
terms[1:10]

In [157]:
# Partition the bag-of-words vectors into 10 clusters with k-means++ seeding;
# random_state is fixed so the run is repeatable
model = KMeans(n_clusters = 10,init='k-means++', random_state=99)
model.fit(bow)

In [158]:
# Cluster label assigned to each title, and the fitted cluster centres
labels = model.labels_
cluster_center=model.cluster_centers_
cluster_center

In [159]:
# Silhouette score of the bag-of-words clustering, using Euclidean distance
print("silhouette_score")
silhouette_score = metrics.silhouette_score(bow, labels, metric='euclidean')
silhouette_score

In [160]:
# Attach each title's bag-of-words cluster label to the frame
df['Bow Clus Label'] = model.labels_
df.head(2)

In [161]:
print(" Nomber Of Clusters")
# Take the x positions from the cluster labels that actually occur instead of
# a fixed range(10): plt.bar needs x and height of equal length, which a
# hard-coded range only guarantees when all ten labels are present.
bow_cluster_sizes = df.groupby('Bow Clus Label')['Title2'].count()
plt.bar(bow_cluster_sizes.index, bow_cluster_sizes.values, alpha = 0.4)
# Title typo fixed so it matches the TF-IDF chart ('KMeans cluster points')
plt.title('KMeans cluster points')
plt.xlabel("Cluster number")
plt.ylabel("Number of points")
plt.show()

# Title column: top words in each cluster

In [162]:
print("Top terms per cluster:")
# For each centroid, term indices ordered from largest to smallest weight
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
if hasattr(count_vect, 'get_feature_names_out'):
    terms = count_vect.get_feature_names_out()
else:
    terms = count_vect.get_feature_names()
for i in range(len(order_centroids)):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    # End the line once per cluster. This print() used to sit inside the inner
    # loop, which put every term on a line of its own.
    print()

# Grouping Titles clusters on the ['PublicationType']

In [163]:
print("Country Base Clusters And PublicationType ")
# Combine the cleaned titles of each (cluster, publication type) group into
# one string. Joining with a space keeps the titles apart; summing strings
# concatenated them directly and fused the last word of one title with the
# first word of the next.
df_press_clus = (
    df.groupby(['Bow Clus Label', 'PublicationType'])['Title2']
      .agg(' '.join)
      .reset_index()
      .set_index('PublicationType')
)
df_press_clus

# Grouping title clusters by number of citations

In [164]:
print("Name Base Clusters And NumberCitations ")
# Combine the cleaned titles of each (cluster, citation count) group into one
# string. Joining with a space keeps the titles apart; summing strings
# concatenated them directly and fused neighbouring words.
df_key_clus = (
    df.groupby(['Bow Clus Label', 'NumberCitations'])['Title2']
      .agg(' '.join)
      .reset_index()
      .set_index('NumberCitations')
)
df_key_clus

# 2)-KMeans by TF-IDF

In [165]:
# Vectorise the cleaned titles again, this time with TF-IDF weighting,
# and show the shape of the resulting matrix
tfidf_vect = TfidfVectorizer()
tfidf = tfidf_vect.fit_transform(df['Title2'].values)
tfidf.shape

In [166]:
# 10-cluster KMeans on the TF-IDF matrix with the same fixed random_state;
# `init` is not passed here, so the library default applies
model_tf = KMeans(n_clusters = 10,random_state=99)
model_tf.fit(tfidf)

In [167]:
# Cluster label assigned to each title by the TF-IDF model, and its fitted centres
labels_tf = model_tf.labels_
cluster_center_tf=model_tf.cluster_centers_
cluster_center_tf

In [168]:
# Vocabulary terms of the TF-IDF vectorizer in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so use get_feature_names_out() when available.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    terms1 = tfidf_vect.get_feature_names_out().tolist()
else:
    terms1 = tfidf_vect.get_feature_names()
terms1[1:10]

In [169]:
# Silhouette score of the TF-IDF clustering, using Euclidean distance
silhouette_score_tf = metrics.silhouette_score(tfidf, labels_tf, metric='euclidean')
silhouette_score_tf

In [170]:
# Work on an independent copy: a plain `df1 = df` only binds a second name to
# the same object, so adding the TF-IDF label column would modify df as well.
df1 = df.copy()
df1['Tfidf Clus Label'] = model_tf.labels_
df1.head(5)

In [171]:
# Per TF-IDF cluster: number of non-null Authors entries
df1.groupby(['Tfidf Clus Label'])['Authors'].count()

In [187]:
# Per TF-IDF cluster: number of non-null JournalCitation entries
df1.groupby(['Tfidf Clus Label'])['JournalCitation'].count()

In [188]:
print("Top terms per cluster:")
# For each centroid, term indices ordered from largest to smallest weight
order_centroids = model_tf.cluster_centers_.argsort()[:, ::-1]
for i in range(len(order_centroids)):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms1[ind], end='')
    # End the line once per cluster. This print() used to sit inside the inner
    # loop, which put every term on a line of its own.
    print()

In [173]:
print(" Nomber Of Clusters")
# Take the x positions from the cluster labels that actually occur instead of
# a fixed range(10): plt.bar needs x and height of equal length, which a
# hard-coded range only guarantees when all ten labels are present.
tfidf_cluster_sizes = df1.groupby('Tfidf Clus Label')['Authors'].count()
plt.bar(tfidf_cluster_sizes.index, tfidf_cluster_sizes.values, alpha = 0.4)
plt.title('KMeans cluster points')
plt.xlabel("Cluster number")
plt.ylabel("Number of points")
plt.show()

# Title and DOI column clusters

In [192]:
print('Review Assigned To  Cluster ')
# Map each cluster label to the index labels of its rows once, instead of
# re-running the groupby for every printed value.
cluster_rows = df1.groupby('Tfidf Clus Label').groups
for i in range(10):
    print("4 review of assigned to cluster ", i)
    print("-" * 70)
    members = cluster_rows.get(i, [])
    # Sample the 6th, 11th and 21st member of the cluster
    for pos in (5, 10, 20):
        # Skip positions a small cluster does not have (previously an IndexError)
        if pos < len(members):
            # .groups holds index labels, so rows are looked up with .loc;
            # .iloc treats them as positions and is only right when the
            # index is the default 0..n-1 range.
            print(df1.loc[members[pos], 'DOI'])
            print('\n')
    print("_" * 70)

# Title and  PublicationType column clusters

In [191]:
print('Review Assigned To  Cluster ')
# Map each cluster label to the index labels of its rows once, instead of
# re-running the groupby for every printed value.
cluster_rows = df1.groupby('Tfidf Clus Label').groups
for i in range(10):
    print("4 review of assigned to cluster ", i)
    print("-" * 70)
    members = cluster_rows.get(i, [])
    # Sample the 6th, 11th and 21st member of the cluster
    for pos in (5, 10, 20):
        # Skip positions a small cluster does not have (previously an IndexError)
        if pos < len(members):
            # .groups holds index labels, so rows are looked up with .loc;
            # .iloc treats them as positions and is only right when the
            # index is the default 0..n-1 range.
            print(df1.loc[members[pos], 'PublicationType'])
            print('\n')
    print("_" * 70)

# Title and  JournalCitation column clusters

In [190]:
print('Review Assigned To  Cluster ')
# Map each cluster label to the index labels of its rows once, instead of
# re-running the groupby for every printed value.
cluster_rows = df1.groupby('Tfidf Clus Label').groups
for i in range(10):
    print("4 review of assigned to cluster ", i)
    print("-" * 70)
    members = cluster_rows.get(i, [])
    # Sample the 6th, 11th and 21st member of the cluster
    for pos in (5, 10, 20):
        # Skip positions a small cluster does not have (previously an IndexError)
        if pos < len(members):
            # .groups holds index labels, so rows are looked up with .loc;
            # .iloc treats them as positions and is only right when the
            # index is the default 0..n-1 range.
            print(df1.loc[members[pos], 'JournalCitation'])
            print('\n')
    print("_" * 70)

# Title and Author column clusters

In [174]:
print('Review Assigned To Auther Cluster ')
# Map each cluster label to the index labels of its rows once, instead of
# re-running the groupby for every printed value.
cluster_rows = df1.groupby('Tfidf Clus Label').groups
for i in range(10):
    print("4 review of assigned to cluster ", i)
    print("-" * 70)
    members = cluster_rows.get(i, [])
    # Sample the 6th, 11th and 21st member of the cluster
    for pos in (5, 10, 20):
        # Skip positions a small cluster does not have (previously an IndexError)
        if pos < len(members):
            # .groups holds index labels, so rows are looked up with .loc;
            # .iloc treats them as positions and is only right when the
            # index is the default 0..n-1 range.
            print(df1.loc[members[pos], 'Authors'])
            print('\n')
    print("_" * 70)

# NLP on ['Abstract'] Column

In [175]:
# important library
# Text Cleaning
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.classify import NaiveBayesClassifier
from textblob import Word
from textblob import TextBlob
from collections import Counter
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import re
import csv


In [176]:
# Turn every abstract into a lower-case, lemmatized, letters-only string.
# Iterating over the column replaces the hard-coded range(0, 728) with
# label lookups, which failed whenever the row count or index differed.
lemmatizer = WordNetLemmatizer()  # built once; it is the same for every document
corpus = []
for abstract in df['Abstract']:
    # Drop hyperlinks first, while they are still intact: once punctuation has
    # been blanked out a URL is split into pieces and the pattern misses it.
    text = re.sub(r"http\S+", " ", abstract)
    # Replace everything except ASCII letters and digits with spaces, then lower-case
    text = re.sub('[^a-zA-Z0-9]', ' ', text).lower()
    # Remove only the stand-alone tokens "rt" and "co". A substring replace of
    # "rt " / "co " also cut the ends off words such as "report" or "cohort"
    # and glued them to the next word. The former apostrophe replacements were
    # no-ops (apostrophes are already gone by this point) and were dropped.
    text = re.sub(r'\b(?:rt|co)\b', ' ', text)
    # Lemmatize each token and keep the purely alphabetic ones
    lemmas = (lemmatizer.lemmatize(word) for word in text.split())
    corpus.append(' '.join(word for word in lemmas if word.isalpha()))

In [177]:
# Download NLTK's 'punkt' tokenizer data
# NOTE(review): presumably required by TextBlob's sentence splitting in the next cell — confirm
import nltk
nltk.download('punkt')


In [178]:
# Score every sentence of every document in `corpus` with TextBlob and tally
# how many come out positive, neutral or negative.
pos = 0
neg = 0
nue = 0
scores1 = []  # polarity of each scored sentence, in corpus order

for comment in corpus:
    for sent in TextBlob(comment).sentences:
        score = sent.sentiment.polarity
        if score > 0:
            pos += 1
        elif score == 0:
            nue += 1
        else:
            neg += 1
        scores1.append(score)

# Shares are taken over the sentences actually scored. Dividing by len(corpus)
# is only right when every document yields exactly one sentence; otherwise the
# three percentages do not add up to 100. An empty tally gives 0 for each share
# instead of a ZeroDivisionError.
n_scored = pos + nue + neg
scale = 100.0 / n_scored if n_scored else 0.0

d = {'Positive': [pos * scale], 'Neutral': [nue * scale], 'Negative': [neg * scale]}
# NOTE: this rebinds `df` from the dataset to the one-column score table that
# the pie-chart cell below plots; the dataset itself stays available as df1.
df = pd.DataFrame(d).transpose()
df.columns = ['Score']
print(df)

In [179]:
# Pie chart of the Score column, drawn on the left half of a 16x8 figure
plt.figure(figsize=(16,8))
ax1 = plt.subplot(121, aspect='equal')
df.plot.pie(subplots=True,ax=ax1,autopct='%1.1f%%',startangle=90, shadow=False,legend = False, fontsize=14)
plt.show()

In [180]:
# Spot check: word tokens of the fourth document in corpus
from textblob import TextBlob
TextBlob (corpus[3]).words

In [181]:
# Compute the frequency of all words.
# Each document is split into words first: FreqDist over the list of document
# strings counted whole documents (and only the first 30 of them), not words.
frequency_dist = nltk.FreqDist(word for doc in corpus for word in doc.split())
# Words ordered from most to least frequent
sorted_frequency_dist = [word for word, _ in frequency_dist.most_common()]

In [182]:
 # Word cloud of the 50 most prominent words across the documents in corpus.
# The documents are joined into one text: str(corpus) would hand WordCloud the
# Python list repr (brackets, quotes and commas included) instead of the words.
word_cloud = WordCloud(
    background_color='white',
    stopwords=set(STOPWORDS),
    max_words=50,
    max_font_size=40,
    scale=5,
    random_state=1,
).generate(' '.join(corpus))
fig = plt.figure(1, figsize=(15,15))
plt.axis('off')
fig.suptitle('Word Cloud for top 50 prevelant words in Dataset', fontsize=20)
fig.subplots_adjust(top=2.3)
plt.imshow(word_cloud)
plt.show()

# NLP on ['Keywords'] column


In [183]:
# Turn every Keywords entry into a lower-case, lemmatized, letters-only string.
# Iterating over the column replaces the hard-coded range(0, 728) with
# label lookups, which failed whenever the row count or index differed.
lemmatizer = WordNetLemmatizer()  # built once; it is the same for every row
corpus = []
for keywords in df1['Keywords']:
    # Drop hyperlinks first, while they are still intact: once punctuation has
    # been blanked out a URL is split into pieces and the pattern misses it.
    text = re.sub(r"http\S+", " ", keywords)
    # Replace everything except ASCII letters and digits with spaces, then lower-case
    text = re.sub('[^a-zA-Z0-9]', ' ', text).lower()
    # Remove only the stand-alone tokens "rt" and "co". A substring replace of
    # "rt " / "co " also cut the ends off words such as "report" or "cohort"
    # and glued them to the next word. The former apostrophe replacements were
    # no-ops (apostrophes are already gone by this point) and were dropped.
    text = re.sub(r'\b(?:rt|co)\b', ' ', text)
    # Lemmatize each token and keep the purely alphabetic ones
    lemmas = (lemmatizer.lemmatize(word) for word in text.split())
    corpus.append(' '.join(word for word in lemmas if word.isalpha()))

In [184]:
# Spot check: word tokens of the fourth document in corpus
TextBlob (corpus[3]).words

In [185]:
# Compute the frequency of all words.
# Each document is split into words first: FreqDist over the list of document
# strings counted whole documents (and only the first 30 of them), not words.
frequency_dist = nltk.FreqDist(word for doc in corpus for word in doc.split())
# Words ordered from most to least frequent; show the top 30 rather than
# dumping the whole vocabulary
sorted_frequency_dist = [word for word, _ in frequency_dist.most_common()]
sorted_frequency_dist[:30]

In [186]:
 # Word cloud of the 50 most prominent words across the documents in corpus.
# The documents are joined into one text: str(corpus) would hand WordCloud the
# Python list repr (brackets, quotes and commas included) instead of the words.
word_cloud = WordCloud(
    background_color='white',
    stopwords=set(STOPWORDS),
    max_words=50,
    max_font_size=40,
    scale=5,
    random_state=1,
).generate(' '.join(corpus))
fig = plt.figure(1, figsize=(15,15))
plt.axis('off')
fig.suptitle('Word Cloud for top 50 prevelant words in Keyword column Dataset', fontsize=20)
fig.subplots_adjust(top=2.3)
plt.imshow(word_cloud)
plt.show()