<a href="https://colab.research.google.com/github/shreyakabra/TextAnalysisWithPython/blob/main/TextAnalysisWithPython.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import plotly.express as px
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from textblob import TextBlob
import spacy
from collections import defaultdict
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Load the 'en_core_web_sm' spaCy pipeline; it is called on article text
# by the named-entity extraction step further down.
NLP = spacy.load("en_core_web_sm")

# Read the articles dataset, decoding the file as Latin-1.
data = pd.read_csv(
    "articles.csv",
    encoding="latin-1",
)

# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,Article,Title
0,Data analysis is the process of inspecting and...,Best Books to Learn Data Analysis
1,The performance of a machine learning algorith...,Assumptions of Machine Learning Algorithms
2,You must have seen the news divided into categ...,News Classification with Machine Learning
3,When there are only two classes in a classific...,Multiclass Classification Algorithms in Machin...
4,The Multinomial Naive Bayes is one of the vari...,Multinomial Naive Bayes in Machine Learning


In [None]:
# Mount Google Drive when the notebook runs on Colab.
# The google.colab package only exists inside the Colab runtime, so the
# import is guarded: on any other kernel the mount is skipped instead of
# aborting a "Restart & Run All" with an ImportError.
try:
    from google.colab import drive
except ImportError:
    # Not on Colab: nothing to mount.
    drive = None
else:
    drive.mount('/content/drive')

In [None]:
# Combine all titles into a single string.
# Missing titles are dropped and the rest coerced to str, because str.join
# raises TypeError on a non-string (NaN) value.
texttitle = ' '.join(data['Title'].dropna().astype(str))

# Create the WordCloud object from the combined title text.
wordcloud = WordCloud(width=800, height=400, background_color='#F5F5DC').generate(texttitle)

# Plot the word cloud. to_array() hands Plotly an explicit image array
# rather than relying on implicit conversion of the WordCloud object.
f1 = px.imshow(wordcloud.to_array(), title="WordCloud of Titles")
f1.update_layout(showlegend=False)
f1.show()

In [None]:
# Sentiment analysis: score each article with TextBlob's polarity value.
def _article_polarity(article_text):
    """Return the TextBlob sentiment polarity for one article's text."""
    return TextBlob(article_text).sentiment.polarity


data['Sentiment'] = data['Article'].apply(_article_polarity)

# Histogram of the per-article polarity scores.
f2 = px.histogram(
    data,
    x='Sentiment',
    title='Sentiment Analysis Distribution',
)
f2.show()

In [None]:
# NER

def ExtractNamedEntities(text):
    """Group the named entities found in ``text`` by their label.

    The text is passed through the module-level ``NLP`` pipeline, and each
    entity in the resulting ``doc.ents`` is filed under its ``label_``.

    Args:
        text: The text to analyse.

    Returns:
        A plain ``dict`` mapping each entity label to the list of entity
        texts carrying that label, in the order they appear in ``doc.ents``.
        The dict is empty when no entities are found.
    """
    grouped = {}
    for found in NLP(text).ents:
        # setdefault creates the list the first time a label is seen.
        grouped.setdefault(found.label_, []).append(found.text)
    return grouped

# Run entity extraction on every article; each cell of the new column holds
# the {label: [entity texts]} dict returned by ExtractNamedEntities.
data['Named_Entities'] = data['Article'].apply(ExtractNamedEntities)

# Visualize NER
# Iterating a per-article dict yields its keys (the entity labels), so each
# label is counted once for every article in which it occurs.
entitycount = Counter(label for entities in data['Named_Entities'] for label in entities)

# most_common() orders rows by descending count, so head(10) below really is
# the ten most frequent labels rather than the first ten encountered.
# Passing explicit column names also keeps the frame well-formed when no
# entities were found at all.
entitydf = pd.DataFrame(entitycount.most_common(), columns=['Entity', 'Count'])

f3 = px.bar(entitydf.head(10), x='Entity', y='Count', title='Named Entity Recognition')
f3.show()

In [None]:
# Topic Modeling
# Vectorize the articles into term counts, then fit a 5-component LDA model.
# random_state is fixed so repeated runs give the same topics.
vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')
tm = vectorizer.fit_transform(data['Article'])
ldamodel = LatentDirichletAllocation(n_components=5, random_state=42)
ldatopicmatrix = ldamodel.fit_transform(tm)

# Visualize topics
# Label every article with the topic that has the highest value in its row
# of the fitted topic matrix.
topicnames = ["Topic " + str(i) for i in range(ldamodel.n_components)]
data['Dominant Topic'] = [topicnames[i] for i in ldatopicmatrix.argmax(axis=1)]

# Build the count table with explicit column names. The columns produced by
# value_counts().reset_index() differ between pandas 1.x ('index' plus the
# series name) and pandas 2.x (the series name plus 'count'), so relying on
# 'index' breaks on newer pandas.
topiccounts = (
    data['Dominant Topic']
    .value_counts()
    .rename_axis('Topic')
    .reset_index(name='Count')
)

fig = px.bar(topiccounts, x='Topic', y='Count', title='Topic Distribution')
fig.show()