# Reading and Cleaning text

In [None]:
## Importing the necessary libraries
import pandas as pd
import re
import string
import nltk

## Widen the column display so long text cells are not cut short in the output
pd.options.display.max_colwidth = 100

## Path of the input csv file (blank placeholder; set it to the real file location)
file = r" "

## Load the csv file into a DataFrame with the pure-Python parser engine
data = pd.read_csv(file, engine='python')

## Initialise the English stopword list and the WordNet lemmatizer
stopword = nltk.corpus.stopwords.words('english')
wn = nltk.WordNetLemmatizer()
# stop = ['said','would']

##Create function to remove punctuation, remove numbers, tokenize, remove stopwords and lemmatize
def clean_text(text):
    """Normalise a raw document into a list of lemmatized tokens.

    Steps, in order: delete the characters in ``string.punctuation``,
    lowercase, delete ASCII digits, discard whitespace-separated words of two
    characters or fewer, split on runs of non-word characters, drop stopwords
    and lemmatize what is left.

    Relies on the module-level ``stopword`` collection and ``wn`` lemmatizer.

    Args:
        text: The document to clean. Anything that is not a ``str`` (for
            example the NaN pandas stores for an empty CSV cell) is treated
            as an empty document instead of raising ``TypeError``.

    Returns:
        A list of lemmatized token strings; empty when nothing survives.
    """
    if not isinstance(text, str):
        return []
    # One pass to delete punctuation instead of a per-character join.
    text = text.translate(str.maketrans('', '', string.punctuation)).lower()
    # The substitution result already is the digit-free text; no need to
    # re-filter the string character by character against it.
    text = re.sub(r'[0-9]+', '', text)
    text = " ".join(word for word in text.split() if len(word) > 2)
    ##text = " ".join([word for word in text.split() if word not in stop])
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    tokens = re.split(r'\W+', text)
    # re.split yields '' for empty input and around leftover non-word
    # characters (e.g. curly quotes); skip those so no empty token is emitted.
    return [wn.lemmatize(word) for word in tokens if word and word not in stopword]

## Store the cleaned token list of every 'News' entry in a new 'News_2' column
data['News_2'] = data['News'].apply(clean_text)
data.head()

# Vectorization - Convert to numeric form

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

## Apply CountVectorizer, with clean_text doing the tokenisation
count_vect = CountVectorizer(analyzer=clean_text)
X_counts = count_vect.fit_transform(data['News'])
print(X_counts.shape)

## Expand the sparse count matrix into a dense DataFrame, one column per term
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out()
else:
    feature_names = count_vect.get_feature_names()
X_counts_df = pd.DataFrame(X_counts.toarray(), columns=feature_names)
X_counts_df.head()
# To know frequency of specific word
# X_counts_df.facebook.sum()

# Word Frequencies

In [None]:
## Total count of every vocabulary term, summed over all documents
sum_words = X_counts.sum(axis=0)
words_freq = [
    (term, sum_words[0, col])
    for term, col in count_vect.vocabulary_.items()
]
# Most frequent terms first
words_freq.sort(key=lambda pair: pair[1], reverse=True)

## Keep the top 'X' frequency words (eg: here X=10)
dat = words_freq[:10]

dat_df = pd.DataFrame(dat)
dat_df.columns = ['word', 'freq']

## Bar chart of the top frequency words, saved to wordfreq.png
import matplotlib.pyplot as plt
plt.figure(figsize=(15, 10))
plt.bar(dat_df['word'], dat_df['freq'])
plt.xlabel("Words")
plt.ylabel("Frequency")
plt.savefig("wordfreq.png", bbox_inches='tight')
plt.show()

## Uncomment to print the top 'X' frequency words
# dat_df

# Generate wordcloud

In [None]:
##Create wordcloud
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

## Optional extra words to leave out of the cloud, e.g.:
##abc="said","mr","company","deal"
# stopwords = set(abc)
## ...then add "and word not in stopwords" to the filter below.

# Build the cloud from the actual counts. Passing str(words_freq) to
# generate() made WordCloud re-count the words of the printed list, in which
# every word occurs once, so the real frequencies were ignored.
# The STOPWORDS filter keeps the default stopword removal generate() applied.
frequencies = {word: int(freq) for word, freq in words_freq if word not in STOPWORDS}
wordcloud = WordCloud(
    max_words=100, background_color='white', width=600, height=300
).generate_from_frequencies(frequencies)
plt.figure(figsize=(6, 2), dpi=200)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.savefig("wordcloud.png", bbox_inches='tight')
plt.show()

# Sentiment Analysis

In [None]:
## Sentiment Analysis using TextBlob
from textblob import TextBlob
import seaborn as sns

bloblist_desc = list()

## Score the string form of every cleaned token list with TextBlob
news_str = data['News_2'].astype(str)
for row in news_str:
    blob = TextBlob(row)
    bloblist_desc.append((row, blob.sentiment.polarity, blob.sentiment.subjectivity))

## Build the frame once, after the loop: rebuilding it on every iteration was
## quadratic work and left polarity_desc undefined when there were no rows
polarity_desc = pd.DataFrame(bloblist_desc, columns=['sentence', 'polarity', 'subjectivity'])

def f(polarity_desc):
    """Return the sentiment label for one row, based on its 'polarity' value.

    "Positive" when the polarity is above zero, "Neutral" when it is exactly
    zero, and "Negative" in every other case.
    """
    if polarity_desc['polarity'] > 0:
        return "Positive"
    if polarity_desc['polarity'] == 0:
        return "Neutral"
    return "Negative"

## Label every row with its sentiment type
polarity_desc['Sentiment_Type'] = polarity_desc.apply(f, axis="columns")

## Write the first four input columns next to the sentiment scores to a csv file
df1 = pd.DataFrame(polarity_desc)
my_df2 = pd.DataFrame(data)
df2 = pd.DataFrame(my_df2.iloc[:, :4])
df = pd.concat([df2, df1], axis="columns")
df.to_csv("scores.csv", index=False)

## Count plot of the sentiment types, saved to sentiment.png
plt.figure(figsize=(10, 10))
sns.set_style("whitegrid")
ax = sns.countplot(data=polarity_desc, x="Sentiment_Type")
plt.savefig('sentiment.png')

# Topic Modeling - Latent Dirichlet Allocation (LDA)

In [None]:
import gensim
from gensim import corpora
import pyLDAvis
# pyLDAvis 3.3 renamed its gensim helper module to gensim_models; try the new
# name first and fall back to the old one on earlier releases.
try:
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    import pyLDAvis.gensim as gensimvis
import warnings
warnings.filterwarnings('ignore')


## Map every token to an integer id, then turn each document into a bag of words
dictionary = corpora.Dictionary(data['News_2'])
doc_term_matrix = [dictionary.doc2bow(rev) for rev in data['News_2']]
LDA = gensim.models.ldamodel.LdaModel

#num_topics indicates number of topics
lda_model = LDA(corpus=doc_term_matrix, id2word=dictionary, num_topics=5, random_state=100,
                chunksize=1000, passes=5)

# lda_model.print_topics()

# Visualize the topics and save the interactive view to lda.html
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, doc_term_matrix, dictionary)
pyLDAvis.save_html(vis, 'lda.html')
# NOTE(review): show() is documented to serve the view from a local web
# server; pyLDAvis.display(vis) may suit inline notebook output better - confirm.
pyLDAvis.show(vis)