In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab VM; force_remount re-mounts even when a
# mount from an earlier run is still present.
drive.mount(
    '/content/drive',
    force_remount=True,
)

In [None]:
#open data file
import pandas as pd
# Load the document table from Drive; later cells read its 'Year' and
# 'Abstract' columns.
data = pd.read_csv(
    '/content/drive/MyDrive/data2.csv',
    sep=',',
)

In [None]:
#show number of documents in each year
# Non-null value count of every column, grouped by publication year.
data.groupby(by=['Year']).count()

In [None]:
#clean text

import re
import gensim
from gensim.summarization.textcleaner import split_sentences
# Split every abstract into sentences and keep all but the last sentence of
# each document (one list of sentences per document).
dt = [split_sentences(text)[:-1] for text in data["Abstract"]]

# Glue the kept sentences of each document back into one string, using "."
# as the separator.
mydata = [".".join(sentences) for sentences in dt]


def sent_to_words(sentences):
    """Lazily tokenise an iterable of text documents.

    For each document this removes every whitespace-delimited chunk that
    contains ``@`` (e-mail-like tokens), collapses whitespace runs to a single
    space, strips apostrophes, and then tokenises with
    ``gensim.utils.simple_preprocess(..., deacc=True)``.

    Args:
        sentences: iterable of strings, one per document.

    Yields:
        list[str]: the tokens of one document, in input order.
    """
    for sent in sentences:
        # Raw strings: '\S' and '\s' are invalid escape sequences in a normal
        # string literal and trigger warnings on current Python versions.
        sent = re.sub(r'\S*@\S*\s?', '', sent)
        sent = re.sub(r'\s+', ' ', sent)
        sent = re.sub(r"'", "", sent)
        yield gensim.utils.simple_preprocess(str(sent), deacc=True)

# Tokenise every cleaned abstract; the generator is materialised so the
# result can be iterated several times by later cells.
data_abstracts = mydata
data_words = [tokens for tokens in sent_to_words(data_abstracts)]

In [None]:
#remove stopwords from the tokenised documents
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

# NLTK English stopwords plus project-specific additions.
# The tokens in data_words come from gensim's simple_preprocess, which emits
# lower-case single words, so the custom entries are lower-cased here:
# capitalised entries such as 'Http' or 'Accessibility' could otherwise never
# match. Multi-word phrases still cannot match a single token; they are kept
# so the list contents stay available to later cells.
stop_words = stopwords.words('english')
stop_words.extend(phrase.lower() for phrase in [
    'This study aimed', 'This study aimed at finding', 'This study', 'The student of', 'For this purpose',
    'The responses show', 'The responses', 'It was also found that', 'It was also', 'Although this survey', 'The paper has',
    'The paper also', 'The paper has investigated', 'The findings also indicated that', 'The findings also', 'The finding',
    'In addition the', 'In addition to', 'the study observed', 'In addition the study observed', 'Though the', 'The user friendliness of',
    'Accessibility', 'The purpose of this research is to study the', 'The purpose of this research', 'The research', 'The development of',
    'This paper aims', 'This paper aims to', 'The findings of this research', 'This paper presents', 'the Theory of', 'The method', 'The advantages of',
    'The scope of the', 'This article', 'We employed', 'we show', 'This article examines', 'This chapter describes', 'This chapter', 'we have', 'Http', 'www',
])

# Membership is tested once per token, so use a set instead of scanning the
# list every time.
_stop_word_lookup = set(stop_words)
data_words = [
    [word for word in doc if word not in _stop_word_lookup and len(word) >= 2]
    for doc in data_words
]

In [None]:
#save words of each document
import csv
# One CSV row per document, one fully quoted cell per token.
with open('/content/drive/My Drive/x.csv', 'w', newline='', encoding='utf-8') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerows(data_words)

In [None]:
#make a trigram from words and stemming

import gensim
from nltk.stem.porter import*
import spacy

# Learn two-word collocations from the token lists, then learn collocations
# again on the bigram-transformed corpus so three-word phrases can form.
bigram = gensim.models.Phrases(
    data_words,
    min_count=3,
    threshold=100,
)
trigram = gensim.models.Phrases(
    bigram[data_words],
    threshold=100,
)

# Phraser wrappers are the objects applied to documents in process_words().
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

def process_words(texts, stop_words=stop_words, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Form n-grams, lemmatise, stem and remove stopwords.

    Args:
        texts: iterable of token lists, one list per document.
        stop_words: collection of words to drop from the final output.
        allowed_postags: spaCy coarse POS tags to keep; tokens with any other
            tag are discarded.

    Returns:
        list[list[str]]: processed tokens, one list per input document.
    """
    # Apply the bigram model exactly once and the trigram model on top of it.
    # This mirrors how `trigram` was trained (on bigram[data_words]); the
    # previous code ran bigram_mod twice before trigram_mod.
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]

    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    nlp.max_length = 10000000

    # One stemmer for the whole run instead of a new instance per token.
    stemmer = PorterStemmer()
    stop_lookup = set(stop_words)

    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([stemmer.stem(token.lemma_) for token in doc if token.pos_ in allowed_postags])

    # Each stem list is re-tokenised from its str() form before filtering.
    # NOTE(review): simple_preprocess has a default maximum token length, so
    # long joined n-grams may be dropped here - confirm this is intended.
    return [
        [word for word in gensim.utils.simple_preprocess(str(doc)) if word not in stop_lookup]
        for doc in texts_out
    ]

data_ready = process_words(data_words)

In [None]:
#save ready data for input to model

import _pickle as pickle
# Persist the preprocessed token lists for the modelling cells.
# pickle.dump() returns None, so its result must not be assigned: the earlier
# `data = pickle.dump(...)` replaced the `data` DataFrame with None and broke
# the later `data['Abstract']` lookup.
with open('/content/drive/My Drive/data_ready.p', 'wb') as f:
    pickle.dump(data_ready, f)

In [None]:
#load ready data and make corpus from them and change the corpus with tfidf method
import _pickle as pickle
import gensim.corpora as corpora
from gensim import corpora, models

# Reload the token lists written by the preprocessing cell. The pickle is
# this notebook's own artefact; do not point this at untrusted files.
with open('/content/drive/My Drive/data_ready.p', 'rb') as f:
    data_ready = pickle.load(f)

texts = data_ready
# Token <-> integer id mapping built from all documents.
id2word = corpora.Dictionary(texts)
# Bag-of-words vector for every document.
corpus = list(map(id2word.doc2bow, texts))
# Re-weight the bag-of-words counts with TF-IDF; the wrapper is lazy.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [None]:
#evaluate models in defined range

from gensim.models import CoherenceModel
import numpy as np

def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """Train one LDA model per topic count and collect evaluation metrics.

    Topic counts are ``range(start, limit, step)``.

    Args:
        dictionary: gensim Dictionary used as the models' ``id2word`` mapping.
        corpus: corpus the models are trained and evaluated on.
        texts: tokenised documents handed to the 'c_v' coherence measure.
        limit: exclusive upper bound of the topic-count range.
        start: first topic count.
        step: increment between topic counts.

    Returns:
        tuple: ``(model_list, coherence_values, log_perplexity,
        eval_liklihood)``, four lists with one entry per topic count.
        They hold the trained model, ``CoherenceModel.get_coherence()``,
        ``model.log_perplexity(corpus)`` and ``model.bound(corpus)``.
    """
    coherence_values = []
    log_perplexity = []
    model_list = []
    eval_liklihood = []
    for num_topics in range(start, limit, step):
        model = gensim.models.LdaMulticore(corpus=corpus,
                                           # Use the dictionary passed in; this
                                           # used to read the global `id2word`
                                           # and ignore the parameter.
                                           id2word=dictionary,
                                           num_topics=num_topics,
                                           random_state=100,
                                           alpha='symmetric',
                                           eta=.001,
                                           iterations=100,
                                           workers=4,
                                           dtype=np.float64
                                           )
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, corpus=corpus, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
        log_perplexity.append(model.log_perplexity(corpus))
        eval_liklihood.append(model.bound(corpus))
        # Progress marker: the topic count that has just finished.
        print(num_topics)

    return model_list, coherence_values, log_perplexity, eval_liklihood

# Sweep topic counts 260, 265, ..., 285 on the TF-IDF corpus.
(model_list_tfidf,
 coherence_values_tfidf,
 log_perplexity_tfidf,
 eval_liklihood_tfidf) = compute_coherence_values(
    dictionary=id2word,
    corpus=corpus_tfidf,
    texts=data_ready,
    start=260,
    limit=290,
    step=5,
)



In [None]:
# save the models

# Save every model of the sweep under its topic count: model260, model265, ...
# (first count 260, step 5 - the values used for the sweep itself).
for position, model in enumerate(model_list_tfidf):
    i = 260 + 5 * position
    s = '/content/drive/MyDrive/models/model%s' % i
    model.save(s)

In [None]:
#plot coherence of each model

import matplotlib.pyplot as plt
%matplotlib inline

# Topic counts on the x axis must match the sweep that produced the values.
limit=290; start=260; step=5;
x = range(start, limit, step)
# Label the line itself: plt.legend(("name")) receives a plain string (the
# parentheses do not make a tuple), so only its first character was shown.
plt.plot(x, coherence_values_tfidf, label="coherence_values_tfidf")
plt.xlabel("Num Topics")
plt.ylabel("coherence_values_tfidf")
plt.legend(loc='best')
plt.show()

In [None]:
#plot log perplexity of each model

import matplotlib.pyplot as plt
%matplotlib inline

# Show graph
# Topic counts on the x axis must match the sweep that produced the values.
limit=290; start=260; step=5;
x = range(start, limit, step)
# Label the line itself: plt.legend(("name")) receives a plain string (the
# parentheses do not make a tuple), so only its first character was shown.
plt.plot(x, log_perplexity_tfidf, label="log_perplexity_tfidf")
plt.xlabel("Num Topics")
plt.ylabel("log_perplexity_tfidf")
plt.legend(loc='best')
plt.show()

In [None]:
#plot likelihood bound for each model

import matplotlib.pyplot as plt
%matplotlib inline

# Show graph
# Topic counts on the x axis must match the sweep that produced the values.
limit=290; start=260; step=5;
x = range(start, limit, step)
# Label the line itself: plt.legend(("name")) receives a plain string (the
# parentheses do not make a tuple), so only its first character was shown.
plt.plot(x, eval_liklihood_tfidf, label="eval_liklihood_tfidf")
plt.xlabel("Num Topics")
plt.ylabel("eval_liklihood_tfidf")
plt.legend(loc='best')
plt.show()

In [None]:
#print all topics with their 15 most relevant words

from pprint import pprint
#select the model and print the topic
# LdaMulticore is not imported by any earlier cell, so import it here;
# otherwise this cell raises NameError on a fresh kernel.
from gensim.models import LdaMulticore

optimal_model = LdaMulticore.load('/content/drive/MyDrive/models/model260')
# The earlier bare show_topics(formatted=False) call was dropped: its result
# was discarded, and only the last expression of a cell is displayed.
# Show every topic (-1) with its 15 highest-weighted words.
optimal_model.print_topics(-1, 15)

In [None]:
#details of each document

import pandas as pd

def format_topics_sentences(ldamodel, corpus, texts):
    """Build a per-document table of the dominant topic.

    Args:
        ldamodel: topic model; ``ldamodel[corpus]`` must yield, per document,
            a list of ``(topic_id, probability)`` pairs, and
            ``ldamodel.show_topic(topic_id)`` must return ``(word, weight)``
            pairs.
        corpus: corpus to run through the model.
        texts: original documents, in the same order as ``corpus``.

    Returns:
        pandas.DataFrame: one row per document with the columns
        'Dominant_Topic', 'Perc_Contribution' (rounded to 4 decimals) and
        'Topic_Keywords', followed by a column holding ``texts``. A document
        for which the model returns no topic gets NaN / None placeholders.
    """
    keywords_by_topic = {}  # topic_id -> "word1, word2, ..."
    records = []
    for row in ldamodel[corpus]:
        if not row:
            # Keep a placeholder row so later rows stay aligned with texts;
            # skipping it paired every later document with the wrong text.
            records.append((float('nan'), float('nan'), None))
            continue
        # Highest-probability topic; on ties the first one wins, matching a
        # stable descending sort.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join(word for word, _ in wp)
        records.append((float(int(topic_num)), round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame once; row-wise DataFrame.append is quadratic and no
    # longer exists in pandas 2.
    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add the original text as the last column. Resetting the index aligns
    # it by position with the rows built above.
    contents = pd.Series(texts).reset_index(drop=True)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df


# Dominant topic of every abstract under the selected model.
df_topic_sents_keywords = format_topics_sentences(
    ldamodel=optimal_model,
    corpus=corpus_tfidf,
    texts=data['Abstract'],
)

# reset_index() moves the row index into a leading column, which becomes
# the document number once the columns are renamed.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = [
    'Document_No',
    'Dominant_Topic',
    'Topic_Perc_Contrib',
    'Keywords',
    'Text',
]

# Write the table to Drive and display it.
df_dominant_topic.to_csv(
    '/content/drive/MyDrive/details of each document.csv',
    index=False,
)
df_dominant_topic


In [None]:
#details of each topic

import pandas as pd
import numpy as np

dataframe = pd.read_csv('/content/drive/MyDrive/details of each document.csv')
start = 0.0
end = 260.0  # number of topics in the selected model
# Documents that received a dominant topic.
all_count = dataframe[dataframe['Dominant_Topic'].notnull()].shape[0]
# The per-document CSV is written with a 'Keywords' column; 'Topic_Keywords'
# is still accepted when a file carries that header.
keyword_col = 'Topic_Keywords' if 'Topic_Keywords' in dataframe.columns else 'Keywords'
ls = []
while start < end:
    count = dataframe[dataframe['Dominant_Topic'] == start]
    n_docs = count.shape[0]
    # A topic can be dominant in no document; iloc[0] would then raise.
    keywords = count.iloc[0][keyword_col] if n_docs else ''
    share = round(n_docs / all_count, 4) if all_count else 0.0
    ls.append([start, n_docs, keywords, share])
    start += 1
new_dataframe = pd.DataFrame(ls, columns=['Dominant_topic', 'Number_of_documents', 'Topic_Keywords', 'percentage_of_topic_documents'])
new_dataframe.to_csv('/content/drive/MyDrive/details_of_topics.csv', index=False)
new_dataframe

In [None]:
#distribution of document lengths (words per document)

import matplotlib.pyplot as plt
import numpy as np

# Number of processed tokens in every document.
doc_lens = list(map(len, data_ready))

plt.figure(figsize=(16,7), dpi=160)
plt.hist(doc_lens, bins=1000, color='navy')

# Summary statistics written into the plot at fixed data coordinates.
for y_pos, caption, statistic in (
        (750, "Mean   : ", np.mean),
        (700, "Median : ", np.median),
        (650, "Stdev   : ", np.std)):
    plt.text(450, y_pos, caption + str(round(statistic(doc_lens))))

axis = plt.gca()
axis.set(xlim=(0, 500), ylabel='Number of Documents', xlabel='Document Word Count')
plt.tick_params(size=16)
plt.xticks(np.linspace(0, 500, 9))
plt.title('Distribution of Document Word Counts', fontdict=dict(size=22))
plt.savefig("/content/drive/MyDrive/figures/dodwc1.png")
plt.show()

In [None]:
#number of documents of each topic in year

import pandas as pd

df_data = pd.read_csv('/content/drive/MyDrive/data2.csv')
df_data_1 = pd.read_csv('/content/drive/MyDrive/details of each document.csv')
# Attach the publication year by row index and keep only documents that
# have a dominant topic.
df_data_1 = df_data_1.join(df_data['Year'])
df_data_1 = df_data_1[df_data_1['Dominant_Topic'].notnull()]

# One row per topic: [topic, count in 2010, ..., count in 2019].
temp_list = [
    [i] + [
        df_data_1[(df_data_1['Year'] == j) & (df_data_1['Dominant_Topic'] == float(i))].shape[0]
        for j in range(2010, 2020)
    ]
    for i in range(260)
]
new_df = pd.DataFrame(
    data=temp_list,
    columns=['topics', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019'],
)
new_df.to_csv('/content/drive/MyDrive/topics_per_year_count.csv', index=False)

In [None]:
#kmeans clustering(if did this don't run again)

import _pickle as pickle
import gensim.corpora as corpora
from gensim.matutils import corpus2csc,corpus2dense
from scipy.sparse import csc_matrix
import numpy as np
from sklearn.cluster import KMeans
from gensim import corpora, models
with open('/content/drive/MyDrive/data_ready.p', 'rb') as f:
    data_ready = pickle.load(f)


id2word = corpora.Dictionary(data_ready)
texts = data_ready
corpus = [id2word.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
num_docs = id2word.num_docs
num_terms = len(id2word.keys())

# Sparse TF-IDF matrix in terms x documents layout.
corpus_tfidf_sparse = corpus2csc(corpus_tfidf, num_terms=num_terms, num_docs=num_docs)

# Save the sparse matrix. pickle.dump() returns None, so the result is not
# assigned; `data = pickle.dump(...)` used to overwrite `data` with None.
with open('/content/drive/MyDrive/ccorpus_tfidf_sparse.p', 'wb') as f:
    pickle.dump(corpus_tfidf_sparse, f)


# Build the k-means model. The fixed seed (same value as the LDA models)
# makes cluster assignments reproducible between runs.
model = KMeans(n_clusters=260, random_state=100)
# Transpose so rows are documents, as KMeans expects samples in rows.
clusters = model.fit_predict(corpus_tfidf_sparse.T)

# Save the fitted k-means model.
with open('/content/drive/MyDrive/k_means', 'wb') as f:
    pickle.dump(model, f)

In [None]:
#k-means clustering details

import _pickle as pickle
from sklearn.cluster import KMeans
import csv

# Reload the fitted model. The pickle is this notebook's own artefact; do
# not point this at untrusted files.
with open('/content/drive/MyDrive/k_means', 'rb') as f:
    kmeans_model = pickle.load(f)
print(kmeans_model.labels_) #clusters for each document

# One single-cell CSV row per document.
k_means_result = [[result] for result in kmeans_model.labels_]
print(len(k_means_result))

# The context manager closes (and flushes) the file; it used to be opened
# and never closed. newline='' is what the csv module expects for output.
with open('/content/drive/MyDrive/k_means_results.csv', 'w', newline='') as file:
    write = csv.writer(file)
    write.writerows(k_means_result)

In [None]:
#linear regression

import pandas as pd
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

frame = pd.read_csv('/content/drive/MyDrive/topics_per_year_count.csv')

topic_label = 10  # row position of the topic to fit

# Every column after the first holds the counts of one year; the column
# names are the years.
X = frame.iloc[:, 1:]
# scikit-learn needs a 2-D feature matrix: one single-feature row per year.
x = [[year] for year in map(int, frame.columns[1:])]
y = list(X.iloc[topic_label])

regressor = LinearRegression()
regressor.fit(x, y)
# To predict another year: print(regressor.predict([[2020]]))

y_predict = regressor.predict(x)

plt.figure(figsize=(12,10))
plt.plot(x, y, 'co', label='data')
# Fitted regression line
plt.plot(x, y_predict, linewidth=3.0, label='predicted')
# Observed counts joined by a line
plt.plot(x, y, linewidth=3.0, label='trend', color='g')
plt.legend(loc='best')
plt.ylabel('number of documents for topic '+str(topic_label), color='g', fontsize=18)
plt.xlabel('Year', color='g', fontsize=18)
plt.xticks(color = 'y')
plt.yticks(color = 'y')
# Comment the next line out to skip saving the figure.
plt.savefig('/content/drive/MyDrive/linear_regression_model.png')
plt.show()

In [None]:
#nonlinear regression with sigmoid method

import numpy as np
import pandas as pd
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt

def sigmoid(X, Beta_1, Beta_2):
    """Logistic curve ``1 / (1 + exp(-Beta_1 * (X - Beta_2)))``.

    Args:
        X: scalar, list or array of x values.
        Beta_1: steepness of the curve.
        Beta_2: x position of the midpoint (where the value is 0.5).

    Returns:
        The curve evaluated element-wise at ``X``.
    """
    # Convert up front so plain Python lists work with plain float
    # parameters; `list - float` raises TypeError.
    X = np.asarray(X, dtype=float)
    y = 1 / (1 + np.exp(-Beta_1*(X-Beta_2)))
    return y


frame = pd.read_csv('/content/drive/MyDrive/topics_per_year_count.csv')

topic_label = 10 # row position of the topic to fit

X = frame.iloc[:, 1:]
# Keep the un-normalised years and counts for plotting. Rebuilding them as
# int(item * x_max) after the float division can truncate a value by one.
years = list(map(int, list(frame.columns[1:])))
counts = list(X.iloc[topic_label])

x_max = max(years)
y_max = max(counts)
if y_max == 0:
    raise ValueError('topic %s has no documents in any year; nothing to fit' % topic_label)

# Scale both axes by their maximum before fitting.
x = [item / x_max for item in years]
y = [item / y_max for item in counts]

popt, pcov = curve_fit(sigmoid, x, y)

# To predict another year: print(sigmoid(2020/x_max, *popt)*y_max)

y_hat = sigmoid(np.asarray(x), *popt)
plt.figure(figsize=(12,10))
plt.plot(years, counts, 'co', label='data')
# Fitted curve scaled back to document counts (no integer truncation).
plt.plot(years, [item * y_max for item in y_hat], linewidth=3.0, label='predicted')
plt.plot(years, counts, linewidth=3.0, label='trend', color='g')

plt.legend(loc='best')
plt.ylabel('number of documents for topic '+str(topic_label), color='g', fontsize=18)
plt.xlabel('Year', color='g', fontsize=18)
plt.xticks(color = 'y')
plt.yticks(color = 'y')
# Comment the next line out to skip saving the figure.
plt.savefig('/content/drive/MyDrive/nonlinear_regression_model'+str(topic_label)+'.png')
plt.show()

In [None]:
#information of document term matrix

from itertools import chain 
import _pickle as pickle
import csv

with open('/content/drive/MyDrive/data_ready.p', 'rb') as f:
    data_ready = pickle.load(f)

flatten_list = list(chain.from_iterable(data_ready))

# Matrix dimensions come from the data instead of hard-coded constants, so
# the report stays correct when the corpus or the preprocessing changes.
num_docs = len(data_ready)
num_terms = len(set(flatten_list))

# A cell of the document-term matrix is empty when the document does not
# contain the term.
sum_of_empty_blocks = 0
for i in data_ready:
    sum_of_empty_blocks += (num_terms - len(set(i)))

print('Document term matrix information:')
print('matrix size: %d*%d' % (num_docs, num_terms))
print('length of longest word:', len(max(flatten_list, key=len, default='')))
print('sum of empty blocks: ', sum_of_empty_blocks)
print('sum of full blocks:', num_docs*num_terms - sum_of_empty_blocks)

In [None]:
#show word cloud for each topic

from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors
from pprint import pprint
from gensim.models import LdaMulticore, tfidfmodel
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Same stopword list as the preprocessing cell, rebuilt so this cell can run
# on its own. Note that this assignment rebinds the STOPWORDS name imported
# from wordcloud above.
STOPWORDS = set(stopwords.words('english'))
stop_words = stopwords.words('english')
# Project-specific additions appended to the NLTK English list.
stop_words.extend(['This study aimed','This study aimed at finding','This study','The student of','For this purpose',
'The responses show','The responses','It was also found that','It was also','Although this survey','The paper has',
'The paper also','The paper has investigated','The findings also indicated that','The findings also','The finding',  
'In addition the','In addition to','the study observed','In addition the study observed','Though the','The user friendliness of',
'Accessibility','The purpose of this research is to study the','The purpose of this research','The research','The development of', 
'This paper aims','This paper aims to','The findings of this research','This paper presents','the Theory of','The method','The advantages of', 
'The scope of the','This article','We employed','we show','This article examines','This chapter describes','This chapter','we have','Http','www'])


# Load the selected 260-topic model from Drive.
optimal_model = LdaMulticore.load('/content/drive/MyDrive/models/model260')
# print(optimal_model.show_topics(260,10,formatted=True))

# Colour values of matplotlib's XKCD palette, in dictionary order.
cols = [color for name, color in mcolors.XKCD_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

# One WordCloud object is reused for every topic. color_func reads the
# global loop variable `i` at call time, so every word of the cloud drawn in
# iteration i gets colour cols[i]; it only works inside the loop below.
cloud = WordCloud(stopwords=stop_words,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

# Up to 260 topics, each as (topic_id, [(word, weight), ...]) with up to 100
# words per topic.
topics = optimal_model.show_topics(260,100,formatted=False)

# 130 rows x 2 columns = 260 axes, one per topic.
fig, axes = plt.subplots(130, 2, figsize=(10,500), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    # Make `ax` the current axes so the plt.gca() calls below act on it.
    fig.add_subplot(ax)
    # word -> weight mapping of topic i
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')


# Figure-level layout tweaks; plt.axis / plt.margins act on the current
# (last) axes only.
plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
# plt.savefig('/content/drive/MyDrive/word_cloud.png') # uncomment this line to save the figure
plt.show()

In [None]:
#visualize the data of lda model with corpus

!pip install pyLDAvis
import pyLDAvis.gensim
from gensim.models import LdaMulticore, tfidfmodel
from gensim import corpora, models
import _pickle as pickle
import gensim.corpora as corpora

# Rebuild the TF-IDF corpus from the saved token lists so this cell can run
# on its own.
with open('/content/drive/MyDrive/data_ready.p', 'rb') as f:
    data_ready = pickle.load(f)

texts = data_ready
id2word = corpora.Dictionary(texts)
corpus = list(map(id2word.doc2bow, texts))
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

optimal_model = LdaMulticore.load('/content/drive/MyDrive/models/model260')

# Prepare the interactive view with the dictionary stored in the model,
# then display it as the cell output.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(
    optimal_model,
    corpus_tfidf,
    dictionary=optimal_model.id2word,
)
vis


In [None]:
#theta values

import _pickle as pickle
import pandas as pd
from gensim import corpora, models
import gensim.corpora as corpora
from gensim.models import LdaMulticore, tfidfmodel
import numpy as np

with open('/content/drive/My Drive/data_ready.p', 'rb') as f:
    data_ready = pickle.load(f)

id2word = corpora.Dictionary(data_ready)
texts = data_ready
corpus = [id2word.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

optimal_model = LdaMulticore.load('/content/drive/MyDrive/models/model260')

# Document x topic matrix, filled in place. Adding rows with df.loc[i] = ...
# re-allocates the frame for every document, which is quadratic.
num_topics = optimal_model.num_topics
theta = np.zeros((len(corpus), num_topics))
for i, bow in enumerate(corpus_tfidf):
    # minimum_probability=-1 asks for every topic. Writing by topic id keeps
    # the columns right even if a topic is missing from the returned list.
    for topic_id, prob in optimal_model.get_document_topics(bow, minimum_probability=-1):
        theta[i, topic_id] = prob

df = pd.DataFrame(theta, columns=list(range(num_topics)))

df.to_csv('/content/drive/MyDrive/theta_values.csv')
df.head()

In [None]:
#phi values

import _pickle as pickle
import pandas as pd
from gensim import corpora, models
import gensim.corpora as corpora
from gensim.models import LdaMulticore, tfidfmodel
import numpy as np

optimal_model = LdaMulticore.load('/content/drive/MyDrive/models/model260')

# Topic x term matrix of the model's (unnormalised) lambda estimates.
topics_terms = optimal_model.state.get_lambda()


def _to_probabilities(row):
    """Scale one topic row so its entries sum to 1."""
    return row / row.sum()


topics_terms_proba = np.apply_along_axis(_to_probabilities, 1, topics_terms)

# Column headers: the word that belongs to each term id.
vocabulary_size = topics_terms_proba.shape[1]
words = [optimal_model.id2word[term_id] for term_id in range(vocabulary_size)]

# One row per topic, one column per word.
fram = pd.DataFrame(topics_terms_proba, columns=words)
fram.to_csv('/content/drive/MyDrive/phi_values.csv')
fram

In [None]:
#topic-to-topic distance matrix (Hellinger) of the selected model

import _pickle as pickle
import pandas as pd
from gensim import corpora, models
import gensim.corpora as corpora
from gensim.models import LdaMulticore, tfidfmodel
import numpy as np

def plot_difference_matplotlib(mdiff, title="", annotation=None):
    """Draw a topic-difference matrix as a heat map.

    Args:
        mdiff: 2-D matrix of pairwise differences.
        title: text placed above the plot.
        annotation: accepted for call compatibility; not used.
    """
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots(figsize=(20, 20))
    # origin='lower' puts index (0, 0) in the bottom-left corner.
    image = ax.imshow(mdiff, cmap='RdBu_r', origin='lower')
    ax.set_title(title)
    fig.colorbar(image, ax=ax)

optimal_model = LdaMulticore.load('/content/drive/MyDrive/models/model260')

# Compare the model with itself: pairwise Hellinger distance between its
# topics, computed with num_words=83.
corrolationofmodel, annotation = optimal_model.diff(
    optimal_model,
    distance='hellinger',
    num_words=83,
)
np.savetxt('/content/drive/MyDrive/corrolation.csv', corrolationofmodel)

plot_difference_matplotlib(corrolationofmodel, annotation=annotation)

In [None]:
#determine hot and cold topics

import _pickle as pickle
import pandas as pd
from gensim import corpora, models
import gensim.corpora as corpora
from gensim.models import LdaMulticore, tfidfmodel
import numpy as np
from scipy import stats
import csv

df_alpha = pd.read_csv('/content/drive/MyDrive/theta_values.csv', index_col='Unnamed: 0')
df_data = pd.read_csv('/content/drive/MyDrive/data2.csv')
# Attach the publication year by row index, then average theta per year.
df_alpha = df_alpha.join(df_data['Year'])
x = df_alpha.groupby(['Year']).mean()
# Regress against the years actually present in the data (the group index)
# rather than a hard-coded 2010-2019 list that must match in length and order.
z = x.index.to_numpy()
x.to_csv('/content/drive/MyDrive/theta_means.csv')
index = [str(i) for i in range(260)]
result = []
p_values = []
for ind in index:
    # Fit once per topic; element 0 is the slope, element 3 the p-value.
    fit = stats.linregress(z, x[ind])
    result.append(fit[0])
    p_values.append(fit[3])

all_result = [(i, slope) for i, slope in enumerate(result)]
p_level = 0.05 # significance level; the confidence level is 1 - p_level
result_slope_neg = [(i, neg) for i, neg in enumerate(result) if neg < 0]
result_slope_pos = [(i, pos) for i, pos in enumerate(result) if pos >= 0]
total_p_value = [(i, p) for i, p in enumerate(p_values) if p < p_level]
# Set lookup instead of scanning total_p_value for every topic.
significant_ids = {i for i, _ in total_p_value}
significance_pos = [item for item in result_slope_pos if item[0] in significant_ids]
significance_neg = [item for item in result_slope_neg if item[0] in significant_ids]
not_significance = list(set(all_result) - set(significance_pos + significance_neg))
print('p-level: '+str(p_level)+' or confidence level: '+str(1-p_level))
print('total positive trend: ',len(significance_pos))
print('total_negative trend: ', len(significance_neg))
print('total neutral:', len(not_significance))
# Topics are reported 1-based here; the stored tuples stay 0-based.
print('negative topics:', [(i[0]+1, i[1]) for i in significance_neg])
print('positive topics:', [(i[0]+1, i[1]) for i in significance_pos])
neutral = [(i[0]+1, i[1]) for i in not_significance]
neutral.sort(key=lambda x:x[0])
print('neutral topics:', neutral)

res = [[i] for i in result]
print('\nall slope results:', res)
# newline='' is what the csv module expects for output files.
with open('/content/drive/MyDrive/all_slope_result.csv', 'w', newline='') as file:
    write = csv.writer(file)
    write.writerows(res)

In [None]:
#save neutral (not significant) topics pictures

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression


# Years as a 2-D feature matrix: one single-feature row per year.
z = [[2010], [2011], [2012], [2013], [2014], [2015], [2016], [2017], [2018], [2019]]

f = pd.read_csv('/content/drive/MyDrive/theta_means.csv', index_col=['Year'])
theta_by_topic = f.T  # one row per topic, one column per year

# not_significance comes from the hot/cold topics cell; it holds
# (topic index, slope) tuples.
for j in [i[0] for i in not_significance]:
    regressor = LinearRegression()
    regressor.fit(z, theta_by_topic.iloc[j])
    y_predict = regressor.predict(z)
    plt.figure(figsize=(12,10))
    # Both lines are labelled so plt.legend() has entries to show.
    plt.plot(z, theta_by_topic.iloc[j], linewidth=3.0, color='g', label='mean theta')
    plt.plot(z, y_predict, linewidth=3.0, label='linear fit')
    plt.legend(loc='best')
    plt.ylabel('mean theta '+str(j+1), color='g', fontsize=18)
    plt.xlabel('Year', color='g', fontsize=18)
    plt.savefig('/content/drive/MyDrive/neutral/neutral_topic_'+str(j+1)+'.png')
    # Release the figure: one is created per topic and would otherwise stay
    # in memory. The figures are saved to Drive, not shown inline.
    plt.close()

In [None]:
#save negative topics pictures

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression


# Years as a 2-D feature matrix: one single-feature row per year.
z = [[2010], [2011], [2012], [2013], [2014], [2015], [2016], [2017], [2018], [2019]]

f = pd.read_csv('/content/drive/MyDrive/theta_means.csv', index_col=['Year'])
theta_by_topic = f.T  # one row per topic, one column per year

# significance_neg comes from the hot/cold topics cell; it holds
# (topic index, slope) tuples.
for j in [i[0] for i in significance_neg]:
    regressor = LinearRegression()
    regressor.fit(z, theta_by_topic.iloc[j])
    y_predict = regressor.predict(z)
    plt.figure(figsize=(12,10))
    # Both lines are labelled so plt.legend() has entries to show.
    plt.plot(z, theta_by_topic.iloc[j], linewidth=3.0, color='g', label='mean theta')
    plt.plot(z, y_predict, linewidth=3.0, label='linear fit')
    plt.legend(loc='best')
    plt.ylabel('mean theta '+str(j+1), color='g', fontsize=18)
    plt.xlabel('Year', color='g', fontsize=18)
    plt.savefig('/content/drive/MyDrive/neg/negative_topic_'+str(j+1)+'.png')
    # Release the figure: one is created per topic and would otherwise stay
    # in memory. The figures are saved to Drive, not shown inline.
    plt.close()

In [None]:
#save positive topics pictures

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression


# Years as a 2-D feature matrix: one single-feature row per year.
z = [[2010], [2011], [2012], [2013], [2014], [2015], [2016], [2017], [2018], [2019]]

f = pd.read_csv('/content/drive/MyDrive/theta_means.csv', index_col=['Year'])
theta_by_topic = f.T  # one row per topic, one column per year

# significance_pos comes from the hot/cold topics cell; it holds
# (topic index, slope) tuples.
for j in [i[0] for i in significance_pos]:
    regressor = LinearRegression()
    regressor.fit(z, theta_by_topic.iloc[j])
    y_predict = regressor.predict(z)
    plt.figure(figsize=(12,10))
    # Both lines are labelled so plt.legend() has entries to show.
    plt.plot(z, theta_by_topic.iloc[j], linewidth=3.0, color='g', label='mean theta')
    plt.plot(z, y_predict, linewidth=3.0, label='linear fit')
    plt.legend(loc='best')
    plt.ylabel('mean theta '+str(j+1), color='g', fontsize=18)
    plt.xlabel('Year', color='g', fontsize=18)
    plt.savefig('/content/drive/MyDrive/pos/positive_topic_'+str(j+1)+'.png')
    # Release the figure: one is created per topic and would otherwise stay
    # in memory. The figures are saved to Drive, not shown inline.
    plt.close()

In [None]:
#top 15 words of each topic with their probability

import _pickle as pickle
import pandas as pd
from gensim import corpora, models
import gensim.corpora as corpora
from gensim.models import LdaMulticore, tfidfmodel
import numpy as np
import csv

optimal_model = LdaMulticore.load('/content/drive/MyDrive/models/model260')

# For each of the 260 topics, its 15 highest-weighted (word, weight) pairs.
ls = [optimal_model.show_topic(i, 15) for i in range(260)]

# One CSV row per topic, one cell per (word, weight) pair.
with open('/content/drive/MyDrive/top_15_word_with_frequency.csv', 'w') as file:
    write = csv.writer(file)
    write.writerows(ls)

In [None]:
#word cloud of the highest-probability words across all topics

import _pickle as pickle
import pandas as pd
from gensim import corpora, models
import gensim.corpora as corpora
from gensim.models import LdaMulticore, tfidfmodel
import numpy as np
import csv

optimal_model = LdaMulticore.load('/content/drive/MyDrive/models/model260')

# Flat list of the top-15 (word, weight) pairs of all 260 topics, in topic
# order.
terms = [
    term
    for i in range(260)
    for term in optimal_model.show_topic(i, 15)
]

from os import path
import matplotlib.pyplot as plt
from wordcloud import WordCloud

def terms_to_wordcounts(terms, multiplier=1000):
    """Expand weighted terms into one space-separated text.

    Every word is repeated ``int(multiplier * weight)`` times.

    Args:
        terms: sequence of ``(word, weight)`` entries.
        multiplier: scale factor applied to each weight before truncating
            it to an integer repeat count.

    Returns:
        str: the repeated words of all entries, joined by single spaces. An
        entry whose repeat count is zero contributes an empty chunk.
    """
    chunks = []
    for entry in terms:
        repeats = int(multiplier * entry[1])
        chunks.append(' '.join([entry[0]] * repeats))
    return ' '.join(chunks)

# Turn the weighted terms into repeated words and build one cloud from the
# text; collocations=False stops WordCloud from also counting word pairs.
wordcloud_text = terms_to_wordcounts(terms)
wc = WordCloud(
    background_color='white',
    width=1024,
    height=764,
    collocations=False,
).generate(wordcloud_text)

plt.figure(figsize=(20, 15))
plt.imshow(wc)
plt.axis("off")
plt.savefig("terms1.png")
plt.show()