In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.sentiment import SentimentIntensityAnalyzer
import spacy


In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\agnih\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\agnih\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\agnih\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\agnih\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [4]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 653.6 kB/s eta 0:00:20
     ---------------------------------------- 0.1/12.8 MB 1.2 MB/s eta 0:00:11
      --------------------------------------- 0.2/12.8 MB 1.7 MB/s eta 0:00:08
     - -------------------------------------- 0.4/12.8 MB 2.3 MB/s eta 0:00:06
     -- ------------------------------------- 0.7/12.8 MB 3.0 MB/s eta 0:00:04
     --- ------------------------------------ 1.1/12.8 MB 4.2 MB/s eta 0:00:03
     ------ --------------------------------- 2.1/12.8 MB 6.6 MB/s eta 0:00:02
     -------- ------------------------------- 2.7/12.8 MB 7.5 MB/s eta 0:00:02
     ---------- ----------------------------- 3.5/12.8 MB 8.9 MB/s eta 0:00:02
     ------------- ---------------------

In [5]:
nlp=spacy.load('en_core_web_sm')

In [6]:
df=pd.read_excel('assignment.xlsx')

In [7]:
df

Unnamed: 0,Article
0,"Retailers, the makers of foods marketed for we..."
1,"Move over, Ozempic — there’s a new drug in tow..."
2,Sept 14 (Reuters) - Bristol Myers Squibb (BMY....
3,Austin Wolcott was 18 years old and pretty sur...
4,"Cancer, often referred to as the “emperor of a..."
5,Nov 28 (Reuters) - The U.S. Food and Drug Admi...
6,Nov 21 (Reuters) - BeiGene (6160.HK) said on T...
7,Sept 19 (Reuters) - Drugmaker BeiGene (6160.HK...
8,BRUKINSA is the first and only BTK inhibitor a...
9,Whether you're looking for a quick bite to eat...


In [8]:
def clean_text(text):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    cleaned_tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.isalnum() and token.lower() not in stop_words]
    return " ".join(cleaned_tokens)


In [9]:
# Summarization Module
def extractive_summarization(text):
    sentences = sent_tokenize(text)
    word_frequencies = FreqDist()
    for sentence in sentences:
        for word in word_tokenize(sentence):
            word_frequencies[word.lower()] += 1
    max_frequency = max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word] = (word_frequencies[word] / max_frequency)
    sentence_scores = {}
    for sentence in sentences:
        for word in word_tokenize(sentence.lower()):
            if word in word_frequencies.keys():
                if len(sentence.split(' ')) < 30:
                    if sentence not in sentence_scores.keys():
                        sentence_scores[sentence] = word_frequencies[word]
                    else:
                        sentence_scores[sentence] += word_frequencies[word]
    summarized_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:5]
    return ' '.join(summarized_sentences)


In [10]:
# Mood Check Module
def get_mood(text):
    sia = SentimentIntensityAnalyzer()
    sentiment_score = sia.polarity_scores(text)
    if sentiment_score['compound'] >= 0.05:
        return "Positive"
    elif sentiment_score['compound'] <= -0.05:
        return "Negative"
    else:
        return "Neutral"

In [12]:
# Process articles and store results in a DataFrame
results = []
for index, row in df.iterrows():
    article_text = row['Article']
    cleaned_text = clean_text(article_text)
    summary = extractive_summarization(article_text)
    mood = get_mood(article_text)
    results.append({'Article': article_text, 'Cleaned Text': cleaned_text, 'Summary': summary, 'Mood': mood})

# Create DataFrame from results
results_df = pd.DataFrame(results)

# Display DataFrame
print(results_df)

                                              Article  \
0   Retailers, the makers of foods marketed for we...   
1   Move over, Ozempic — there’s a new drug in tow...   
2   Sept 14 (Reuters) - Bristol Myers Squibb (BMY....   
3   Austin Wolcott was 18 years old and pretty sur...   
4   Cancer, often referred to as the “emperor of a...   
5   Nov 28 (Reuters) - The U.S. Food and Drug Admi...   
6   Nov 21 (Reuters) - BeiGene (6160.HK) said on T...   
7   Sept 19 (Reuters) - Drugmaker BeiGene (6160.HK...   
8   BRUKINSA is the first and only BTK inhibitor a...   
9   Whether you're looking for a quick bite to eat...   
10  A federal judge in New York has dismissed a la...   
11  The future of fast food delivery is here.\n\nD...   
12  Yum Brands topped Wall Street estimates for th...   
13  If you fancy Taco Bell's Nacho Fries, the fast...   
14  Taco Bell is serving up its new Toasted Breakf...   
15  Oct 30 (Reuters) - McDonald's (MCD.N) beat Wal...   
16  Whether you dip it, drizzle

In [13]:
results_df.to_csv('article_analysis_results.csv', index=False)

In [15]:
df1=pd.read_csv('article_analysis_results.csv')

In [18]:
df1

Unnamed: 0,Article,Cleaned Text,Summary,Mood
0,"Retailers, the makers of foods marketed for we...",retailer maker food marketed weight loss type ...,"“The market potential is very, very significan...",Positive
1,"Move over, Ozempic — there’s a new drug in tow...",move ozempic new drug town eli lilly zepbound ...,Side effects of the weekly injectable include ...,Negative
2,Sept 14 (Reuters) - Bristol Myers Squibb (BMY....,sept 14 reuters bristol myers squibb said thur...,The New York-based company has been pressured ...,Negative
3,Austin Wolcott was 18 years old and pretty sur...,austin wolcott 18 year old pretty sure survive...,“You read how the drug extended the life of si...,Negative
4,"Cancer, often referred to as the “emperor of a...",cancer often referred emperor malady unyieldin...,"This approach, while potentially effective, in...",Negative
5,Nov 28 (Reuters) - The U.S. Food and Drug Admi...,nov 28 reuters food drug administration fda sa...,J&J unit Janssen and Legend Biotech's (LEGN.O)...,Negative
6,Nov 21 (Reuters) - BeiGene (6160.HK) said on T...,nov 21 reuters beigene said tuesday entered ag...,,Positive
7,Sept 19 (Reuters) - Drugmaker BeiGene (6160.HK...,sept 19 reuters drugmaker beigene said tuesday...,The drug is currently under review by the U.S....,Negative
8,BRUKINSA is the first and only BTK inhibitor a...,brukinsa first btk inhibitor approved follicul...,"Because new BTK is continuously synthesized, B...",Positive
9,Whether you're looking for a quick bite to eat...,whether looking quick bite eat experience quic...,"However, some locations may have reduced hours...",Positive
