In [1]:
from collections import Counter
from datetime import datetime as dt

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Articles used as the data source: two news pages from kun.uz and one from qalampir.uz.
# Each URL is downloaded and parsed by the scraping loop that iterates over `articles`.
articles = [
    'https://kun.uz/uz/news/2022/11/30/parda-ortidagi-xususiylashtirish-yirik-aktivlar-qanday-qilib-ofshor-kompaniyalarga-otib-ketmoqda',
    'https://kun.uz/uz/news/2022/12/02/haqiqatda-zarur-bolganlarga-gamxorlik-korsating',
    'https://qalampir.uz/uz/news/sizlar-kabi-men-%D2%B3am-aldandim-zha%D2%B3ongir-khuzhayev-teleuyindan-uzi-%D2%B3am-pul-ololmaganini-aytdi-72949'
]

# Task 1

In [3]:
# Parsing: download every article and keep the text of all of its <p> tags.
# `data` collects one record per article: source URL, access time, and text.
data = []

for article in articles:
  # Without a timeout requests waits forever on an unresponsive server,
  # which would freeze the whole notebook run.
  response = requests.get(article, timeout=30)
  # Fail loudly on 4xx/5xx instead of silently storing an error page as content.
  response.raise_for_status()
  html = response.text
  soup = BeautifulSoup(html, 'html.parser')
  all_p_tags = soup.find_all('p')
  # The article body is taken to be the text of every paragraph, space-joined.
  text = ' '.join(p.get_text() for p in all_p_tags)
  data.append(
      {
          'source_url': article,
          # Local (naive) timestamp of the moment the page was fetched.
          'access_datetime': dt.now(),
          'content': text
      }
  )

In [4]:
# Creating the dataframe: one row per scraped article, one column per record key
df = pd.DataFrame(data)

In [5]:
# Display the scraped articles
df

Unnamed: 0,source_url,access_datetime,content
0,https://kun.uz/uz/news/2022/11/30/parda-ortida...,2022-12-02 09:20:00.363609,Oxirgi yillarda davlat korxonalarini xususiyla...
1,https://kun.uz/uz/news/2022/12/02/haqiqatda-za...,2022-12-02 09:20:01.512848,3 dekabr - Xalqaro nogironlar kuni sanasi ERIE...
2,https://qalampir.uz/uz/news/sizlar-kabi-men-%D...,2022-12-02 09:20:03.216314,"Kecha, 28 noyabr kuni “Omadingizni bersin” tel..."


# Task 2

In [6]:
# Work on a copy of the article text column so the cleaning below reads from its own Series
content = df['content'].copy()

# Splitting words from text and removing punctuation symbols
# NOTE: the backslash in the default symbol set is written as an escaped
# backslash; the previous bare backslash-pipe was an invalid escape sequence
# (SyntaxWarning on Python 3.12+). The resulting characters are unchanged.
def remove_symbols(text, symbols='.,:;!?"”“%$()/\\|\n\t'):
  """Return `text` with every character found in `symbols` replaced by a space.

  Each matching character becomes exactly one space, so the result has the
  same length as `text` and neighbouring words stay separated.

  Args:
    text: The string to clean.
    symbols: Characters to blank out. Defaults to the punctuation, quote,
      slash, pipe, newline and tab characters this notebook strips.

  Returns:
    The cleaned string.
  """
  # One translation table handles all symbols in a single pass over the text.
  return text.translate(str.maketrans(symbols, ' ' * len(symbols)))

# Lower-case each article first, then blank out its punctuation symbols;
# the cleaned text is stored next to the raw text in a new 'word' column.
df['word'] = content.map(str.lower).map(remove_symbols)
df

Unnamed: 0,source_url,access_datetime,content,word
0,https://kun.uz/uz/news/2022/11/30/parda-ortida...,2022-12-02 09:20:00.363609,Oxirgi yillarda davlat korxonalarini xususiyla...,oxirgi yillarda davlat korxonalarini xususiyla...
1,https://kun.uz/uz/news/2022/12/02/haqiqatda-za...,2022-12-02 09:20:01.512848,3 dekabr - Xalqaro nogironlar kuni sanasi ERIE...,3 dekabr - xalqaro nogironlar kuni sanasi erie...
2,https://qalampir.uz/uz/news/sizlar-kabi-men-%D...,2022-12-02 09:20:03.216314,"Kecha, 28 noyabr kuni “Omadingizni bersin” tel...",kecha 28 noyabr kuni omadingizni bersin tel...


In [7]:
# Saving data: write the dataframe to dataset.csv without the row index
df.to_csv('dataset.csv', index=False)

# Task 3

In [8]:
# Counting word occurrences in all articles

# Joining all articles
full_text = ' '.join(df['word'])

# Splitting words and removing duplicates.
# Splitting on a single space yields an empty string wherever two spaces were
# adjacent. Subtracting it from the set (instead of calling set.remove) does
# not raise KeyError when the text happens to contain no empty token.
# sorted() returns a list in a reproducible order; a bare set's iteration
# order for strings can change between kernel restarts.
words = sorted(set(full_text.split(' ')) - {''})

In [9]:
# Counting and saving into csv

# Count whole tokens rather than substrings: full_text.count(word) also matched
# `word` inside longer words, which inflated the counts of short words.
# The text is split on single spaces, the same rule used to build `words`.
token_counts = Counter(full_text.split(' '))

# Build the frame in one step instead of appending a row per word with .loc,
# which re-allocates the frame on every iteration.
word_count = pd.DataFrame(
    {
        'word': list(words),
        # Counter returns 0 for a word that never occurs as a whole token.
        'count': [token_counts[word] for word in words],
    },
    columns=['word', 'count'],
)

# Most frequent words first.
word_count.sort_values('count', ascending=False).to_csv('words_count.csv', index=False)