In [7]:
import pandas as pd
import plotly.express as px
from wordcloud import  WordCloud
import matplotlib.pyplot as plt
from textblob import TextBlob
import spacy
from collections import Counter
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import  LatentDirichletAllocation

nlp = spacy.load('en_core_web_sm')

In [8]:
data = pd.read_csv('/content/drive/MyDrive/DATASETS/articles.csv', encoding='latin-1')
data.head()

Unnamed: 0,Article,Title
0,Data analysis is the process of inspecting and...,Best Books to Learn Data Analysis
1,The performance of a machine learning algorith...,Assumptions of Machine Learning Algorithms
2,You must have seen the news divided into categ...,News Classification with Machine Learning
3,When there are only two classes in a classific...,Multiclass Classification Algorithms in Machin...
4,The Multinomial Naive Bayes is one of the vari...,Multinomial Naive Bayes in Machine Learning


In [9]:
# Combine all titles into a single string
titles_text = ' '.join(data['Title'])

# Create a WordCloud object
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(titles_text)

# Plot the Word Cloud
fig = px.imshow(wordcloud, title='Word Cloud of Titles')
fig.update_layout(showlegend=False)
fig.show()

In [10]:
# Sentiment Analysis
data['Sentiment'] = data['Article'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Sentiment Distribution
fig = px.histogram(data, x='Sentiment', title='Sentiment Distribution')
fig.show()

In [11]:
# NER
def extract_named_entities(text):
    doc = nlp(text)
    entities = defaultdict(list)
    for ent in doc.ents:
        entities[ent.label_].append(ent.text)
    return dict(entities)

data['Named_Entities'] = data['Article'].apply(extract_named_entities)

# Visualize NER
entity_counts = Counter(entity for entities in data['Named_Entities'] for entity in entities)
entity_df = pd.DataFrame.from_dict(entity_counts, orient='index').reset_index()
entity_df.columns = ['Entity', 'Count']

fig = px.bar(entity_df.head(10), x='Entity', y='Count', title='Top 10 Named Entities')
fig.show()

In [24]:
from nltk.corpus import stopwords  # Import stopwords from NLTK

# Assuming you have a DataFrame 'data' with an 'Article' column

# Define a function for text cleaning (replace with your specific cleaning steps)
def clean_text(text):
  # Lowercase text
  text = text.lower()
  # Remove punctuation
  text = re.sub(r'[^\w\s]', '', text)  # Regular expression for alphanumeric and whitespace
  # Remove stopwords (optional)
  stop_words = stopwords.words('english')
  text = ' '.join([word for word in text.split() if word not in stop_words])
  return text

# Clean the articles
cleaned_articles = data['Article'].apply(clean_text)

# Create the CountVectorizer (consider adjusting parameters)
vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')
tf = vectorizer.fit_transform(cleaned_articles)

# Create the LDA model (adjust n_components as needed)
lda_model = LatentDirichletAllocation(n_components=5, random_state=42)
lda_topic_matrix = lda_model.fit_transform(tf)

# Get topic names
topic_names = ["Topic " + str(i) for i in range(lda_model.n_components)]

# Assign dominant topic to each article
data['Dominant_Topic'] = [topic_names[i] for i in lda_topic_matrix.argmax(axis=1)]

# Visualize topics
topic_distribution = data['Dominant_Topic'].value_counts().reset_index()
# Visualize topics
fig = px.bar(
    data,  # Use the entire DataFrame with 'Dominant_Topic' column
    x=data.index,  # Use the DataFrame's original index as x-axis
    y='Dominant_Topic',
    title='Topic Distribution'
)
fig.show()

ValueError: Value of 'x' is not the name of a column in 'data_frame'. Expected one of ['Dominant_Topic', 'count'] but received: index
 To use the index, pass it in directly as `df.index`.