In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
import numpy as np

# Web Scrapping

In [2]:
df = pd.read_csv('/content/cnn.csv')

In [3]:
def scrap_cnn(topic):
    base_url = f'https://edition.cnn.com/{topic}'
    response = requests.get(base_url)

    soup = BeautifulSoup(response.text, 'lxml')
    if (topic=='sport' or topic=='entertainment'):
        url_tags = soup.find_all('a', class_="container__link container__link--type-article container_lead-plus-headlines__link container_lead-plus-headlines__left container_lead-plus-headlines__light")
    else:
        url_tags = soup.find_all('a', class_="container__link container__link--type-article container_lead-plus-headlines__link")
    urls = [urljoin(base_url, a.get('href')) for a in url_tags]
    content_list = []
    for url in urls:
#         print(url)
        response = requests.get(url)
        if response.status_code == 200:
            article_soup = BeautifulSoup(response.text, 'lxml')
            headline = article_soup.find('h1', class_="headline__text inline-placeholder")
            text_li = article_soup.find_all('p', class_='paragraph inline-placeholder')
            if headline:
                content_list.append(headline.text.strip())
                text = ' '.join([text_elem.get_text(strip=True) for text_elem in text_li])
                content_list.extend(text.split('\n'))


    return content_list

In [4]:
def scrap_timesnow(topic):
    base_url = f'https://www.timesnownews.com/{topic}'
    response = requests.get(base_url)

    soup = BeautifulSoup(response.text, 'lxml')

    url_tags = soup.find_all('a', class_="undefined")
    urls = [urljoin(base_url, a.get('href')) for a in url_tags]
    content_list = []
    for url in urls:
        # print(url)
        response = requests.get(url)
        if response.status_code == 200:
            article_soup = BeautifulSoup(response.text, 'lxml')
            headline = article_soup.find('h1', class_="_1Fcx")
            text_li = article_soup.find_all('div', class_='_1884')
            text = ' '.join([text_elem.get_text(strip=True) for text_elem in text_li])
            if headline:
                content_list.append(headline.text.strip())
                text = ' '.join([text_elem.get_text(strip=True) for text_elem in text_li])
                content_list.extend(text.split('\n'))



    return content_list

In [5]:
topics = ['world','entertainment','business','sport','sports','business-economy','entertainment-news']

In [6]:
# for topic in topics:
#     scrap_timesnow(topic)


In [7]:
# for topic in topics:
#     headlines = scrap_cnn(topic)
#     headlines_list.extend(headlines)
#     print(len(headlines))

In [8]:
headlines_list = {}
for topic in topics:
    headlines = scrap_cnn(topic)
    headlines.extend(scrap_timesnow(topic))
    headlines_list[topic] = headlines

In [9]:
# for key in headlines_list.keys():
#     print("len of ",key,len(headlines_list[key]))

In [10]:
# len(headlines_list)

In [11]:
# headlines_list

In [12]:
df = pd.DataFrame(columns=['News','Topic'])

In [13]:
for key,value in headlines_list.items():
    df = df.append(pd.DataFrame({'News':value,'Topic':[key]*len(value)}),ignore_index=True)

  df = df.append(pd.DataFrame({'News':value,'Topic':[key]*len(value)}),ignore_index=True)
  df = df.append(pd.DataFrame({'News':value,'Topic':[key]*len(value)}),ignore_index=True)
  df = df.append(pd.DataFrame({'News':value,'Topic':[key]*len(value)}),ignore_index=True)
  df = df.append(pd.DataFrame({'News':value,'Topic':[key]*len(value)}),ignore_index=True)
  df = df.append(pd.DataFrame({'News':value,'Topic':[key]*len(value)}),ignore_index=True)
  df = df.append(pd.DataFrame({'News':value,'Topic':[key]*len(value)}),ignore_index=True)
  df = df.append(pd.DataFrame({'News':value,'Topic':[key]*len(value)}),ignore_index=True)


In [14]:
df

Unnamed: 0,News,Topic
0,‘Battle against time’ to find quake survivors ...,world
1,Scenes of devastation emerged along Japan’s we...,world
2,‘Battle against time’ to find quake survivors ...,world
3,Scenes of devastation emerged along Japan’s we...,world
4,South Korean opposition leader conscious after...,world
...,...,...
759,"Dharmatic Entertainment, which produces OTT c...",entertainment-news
760,​A Ranjith Cinema: A Failed Experiment With Truth,entertainment-news
761,"The title is a bit of a misnomer. However,A R...",entertainment-news
762,The Crown's Luther Ford Reveals Who Pushed Him...,entertainment-news


In [15]:
df['Topic'].isnull().sum()

0

In [16]:
df.to_csv('cnn.csv',index=False)


# Data Cleaning & Pre Processing

In [17]:
df.isnull().sum()

News     0
Topic    0
dtype: int64

In [18]:
 df.duplicated().sum()

152

In [19]:
df=df.drop_duplicates().reset_index(drop=True)

In [20]:
df

Unnamed: 0,News,Topic
0,‘Battle against time’ to find quake survivors ...,world
1,Scenes of devastation emerged along Japan’s we...,world
2,South Korean opposition leader conscious after...,world
3,South Korea’s main opposition party leader Lee...,world
4,Denmark’s Crown Princess Mary to become first ...,world
...,...,...
607,"Dharmatic Entertainment, which produces OTT c...",entertainment-news
608,​A Ranjith Cinema: A Failed Experiment With Truth,entertainment-news
609,"The title is a bit of a misnomer. However,A R...",entertainment-news
610,The Crown's Luther Ford Reveals Who Pushed Him...,entertainment-news


#### Replacing punctuations




In [21]:
df['news_processed'] = df['News'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)


In [22]:
df.isnull().sum()

News              0
Topic             0
news_processed    0
dtype: int64

#### Replacing small words

In [23]:
df['news_processed'] = df['news_processed'].apply(lambda row : ' '.join(word for word in row.split() if len(word)>2))

In [24]:
df.Topic.isnull().sum()

0

In [25]:
df['news_processed'] = [row.lower() for row in df['news_processed']]

In [26]:
df

Unnamed: 0,News,Topic,news_processed
0,‘Battle against time’ to find quake survivors ...,world,battle against time find quake survivors japan...
1,Scenes of devastation emerged along Japan’s we...,world,scenes devastation emerged along japans wester...
2,South Korean opposition leader conscious after...,world,south korean opposition leader conscious after...
3,South Korea’s main opposition party leader Lee...,world,south koreas main opposition party leader lee ...
4,Denmark’s Crown Princess Mary to become first ...,world,denmarks crown princess mary become first aust...
...,...,...,...
607,"Dharmatic Entertainment, which produces OTT c...",entertainment-news,dharmatic entertainment which produces ott con...
608,​A Ranjith Cinema: A Failed Experiment With Truth,entertainment-news,ranjith cinema failed experiment with truth
609,"The title is a bit of a misnomer. However,A R...",entertainment-news,the title bit misnomer howevera ranjith cinema...
610,The Crown's Luther Ford Reveals Who Pushed Him...,entertainment-news,the crowns luther ford reveals who pushed him ...


In [27]:
df.isnull().sum()

News              0
Topic             0
news_processed    0
dtype: int64

In [28]:
df['Topic'] = df['Topic'].replace({'world':'world','sports':'sport','business-economy':'business','entertainment-news':'entertainment'})

In [29]:
df['Topic'].isnull().sum()

0

#### Stop word removal

In [30]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk import word_tokenize
stop_words = set(stopwords.words('english'))
new_stopwords = set(['say', 'also', 'make', "one", 'take', 'get', 'like', 'go'])
stop_words = stop_words.union(new_stopwords)
def remove_stopwords(news):
    review_tokenized = word_tokenize(news)
    new_news = " ".join([i for i in review_tokenized if i.lower() not in stop_words])
    return new_news
df['news_processed'] = [remove_stopwords(r) for r in df['news_processed']]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
if 'say' in stop_words:
  print('yes')

yes


#### Lemmatisation

In [32]:
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [33]:
def nltk_to_wordnet(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

In [34]:
def lemmatize(sentence):
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    wordnet_tag = map(lambda x : (x[0],nltk_to_wordnet(x[1])),nltk_tagged)
    lem_sentence = []
    for word,tag in wordnet_tag:
        if tag is None:
            lem_sentence.append(word)
        else:
            lem_sentence.append(lemmatizer.lemmatize(word,tag))
    return ' '.join(lem_sentence)

In [35]:
df['news_processed'] = df['news_processed'].apply(lemmatize)

In [36]:
df

Unnamed: 0,News,Topic,news_processed
0,‘Battle against time’ to find quake survivors ...,world,battle time find quake survivor japan lift tsu...
1,Scenes of devastation emerged along Japan’s we...,world,scene devastation emerge along japans western ...
2,South Korean opposition leader conscious after...,world,south korean opposition leader conscious stab ...
3,South Korea’s main opposition party leader Lee...,world,south korea main opposition party leader lee j...
4,Denmark’s Crown Princess Mary to become first ...,world,denmark crown princess mary become first austr...
...,...,...,...
607,"Dharmatic Entertainment, which produces OTT c...",entertainment,dharmatic entertainment produce ott content ka...
608,​A Ranjith Cinema: A Failed Experiment With Truth,entertainment,ranjith cinema fail experiment truth
609,"The title is a bit of a misnomer. However,A R...",entertainment,title bit misnomer howevera ranjith cinemasic ...
610,The Crown's Luther Ford Reveals Who Pushed Him...,entertainment,crown luther ford reveals push audition prince...


In [37]:
import matplotlib.pyplot as plt
from nltk import FreqDist
%matplotlib inline

In [38]:
topics = ['world', 'entertainment', 'business', 'sport']

In [39]:
# df[df['Topic']==topic]['news_processed']

#### Word Embedding

In [40]:
# pip install gensim
from gensim.models import Word2Vec

In [41]:
tokenized_data = [word_tokenize(sentence) for sentence in df['news_processed']]

In [42]:
len(tokenized_data)

612

In [43]:
model = Word2Vec(sentences=tokenized_data, vector_size=200, window=5, sg=0, min_count=1)

In [44]:
numerical_data = []
for sentence in tokenized_data:
    sentence_vectors = [model.wv[word] for word in sentence if word in model.wv]
    numerical_data.append(sentence_vectors)

In [45]:
df['word2vec_representation'] = numerical_data


In [46]:
df['Topic'].value_counts()

entertainment    170
world            164
sport            140
business         138
Name: Topic, dtype: int64

In [47]:
vectors = df['word2vec_representation'].values

In [48]:
df['word2vec_representation'] = [np.mean(article, axis=0) for article in vectors]

In [49]:
# for i in range(len(df)):
#     if type(df['word2vec_representation'][i]) == type(np.array([])):
#         df['word2vec_representation'][i] = np.mean(df['word2vec_representation'][i], axis=0)

In [50]:
# def vectorize(list_of_docs, model):
#     """Generate vectors for list of documents using a Word Embedding

#     Args:
#         list_of_docs: List of documents
#         model: Gensim's Word Embedding

#     Returns:
#         List of document vectors
#     """
#     features = []

#     for tokens in list_of_docs:
#         zero_vector = np.zeros(model.vector_size)
#         vectors = []

#         if vectors:
#             vectors = np.asarray(vectors)
#             avg_vec = vectors.mean(axis=0)
#             features.append(avg_vec)
#         else:
#             features.append(zero_vector)
#     return features

# vectorized_docs = vectorize(vectors, model=model)
# len(vectorized_docs), len(vectorized_docs[0])

In [51]:
model.wv.most_similar("israel")

[('accord', 0.9998918175697327),
 ('include', 0.9998874664306641),
 ('say', 0.9998814463615417),
 ('city', 0.9998807311058044),
 ('time', 0.9998759627342224),
 ('two', 0.9998754858970642),
 ('report', 0.9998742341995239),
 ('see', 0.9998679161071777),
 ('share', 0.9998667240142822),
 ('come', 0.9998653531074524)]

In [52]:
df

Unnamed: 0,News,Topic,news_processed,word2vec_representation
0,‘Battle against time’ to find quake survivors ...,world,battle time find quake survivor japan lift tsu...,"[0.09868876, 0.0031383287, 0.09891381, 0.18133..."
1,Scenes of devastation emerged along Japan’s we...,world,scene devastation emerge along japans western ...,"[0.10078444, 0.0034007884, 0.10172553, 0.18559..."
2,South Korean opposition leader conscious after...,world,south korean opposition leader conscious stab ...,"[0.08801686, 0.0012792752, 0.08629523, 0.15506..."
3,South Korea’s main opposition party leader Lee...,world,south korea main opposition party leader lee j...,"[0.09002291, 0.0021016654, 0.09114499, 0.16483..."
4,Denmark’s Crown Princess Mary to become first ...,world,denmark crown princess mary become first austr...,"[0.11326799, 0.002031833, 0.111025736, 0.20036..."
...,...,...,...,...
607,"Dharmatic Entertainment, which produces OTT c...",entertainment,dharmatic entertainment produce ott content ka...,"[0.08639129, 0.0023234866, 0.08631297, 0.15685..."
608,​A Ranjith Cinema: A Failed Experiment With Truth,entertainment,ranjith cinema fail experiment truth,"[0.025679344, 0.0017707869, 0.024864566, 0.050..."
609,"The title is a bit of a misnomer. However,A R...",entertainment,title bit misnomer howevera ranjith cinemasic ...,"[0.07250749, 0.0013814443, 0.07213596, 0.13158..."
610,The Crown's Luther Ford Reveals Who Pushed Him...,entertainment,crown luther ford reveals push audition prince...,"[0.050335854, 0.0019363868, 0.050815266, 0.092..."


In [53]:
# max_length = max(len(vec) for vec in df['word2vec_representation'])
# padded_vectors = [np.pad(vec, ((0, max_length - len(vec)), (0, 0)), mode='constant') for vec in df['word2vec_representation']]


In [54]:
df.dropna(subset=['word2vec_representation'] ,inplace=True)

In [55]:
df['Topic'] = df['Topic'].map({'world':0, 'entertainment':1, 'business':2, 'sport':3})

#### Splitting

In [56]:
from sklearn.model_selection import train_test_split
X = df['word2vec_representation']
y = df['Topic']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [57]:
X_train

561    [0.11677571, 0.003136852, 0.11711702, 0.214614...
287    [0.123131245, 0.004002858, 0.12259665, 0.22354...
322    [0.12116873, 0.0017582187, 0.12001245, 0.21914...
132    [0.15646113, 0.005034458, 0.15924123, 0.288813...
174    [0.040207945, 0.0010403497, 0.04087956, 0.0718...
                             ...                        
71     [0.074521944, 0.0021130561, 0.07481841, 0.1357...
106    [0.070391774, 0.00096784777, 0.07157569, 0.130...
270    [0.07667209, 0.0043309415, 0.07758028, 0.14200...
435    [0.07184857, 0.002300909, 0.072740786, 0.13095...
102    [0.08282236, 0.0024875381, 0.08316349, 0.15038...
Name: word2vec_representation, Length: 489, dtype: object

In [58]:
X_train = np.array([np.array(vec_list) for vec_list in X_train])

In [59]:
X_test = np.array([np.array(vec_list) for vec_list in X_test])

In [60]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Assuming X contains your data
# Scale the data (optional, but recommended for PCA)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)



In [61]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

for depth in [1,2,3,4,5,6,7,8,9,10]:
  rf= RandomForestClassifier(max_depth=depth,n_estimators=100,max_features="sqrt")
  rf.fit(X_train, y_train)
  trainAccuracy = rf.score(X_train,y_train)
  valAccuracy = cross_val_score(rf, X_train, y_train, cv=10)
  print("Depth  : ", depth, " Training Accuracy : ", trainAccuracy, " Cross val score : " ,np.mean(valAccuracy))

Depth  :  1  Training Accuracy :  0.3783231083844581  Cross val score :  0.333375850340136
Depth  :  2  Training Accuracy :  0.47443762781186094  Cross val score :  0.3722789115646258
Depth  :  3  Training Accuracy :  0.5439672801635992  Cross val score :  0.3823554421768708
Depth  :  4  Training Accuracy :  0.656441717791411  Cross val score :  0.4008078231292517
Depth  :  5  Training Accuracy :  0.7995910020449898  Cross val score :  0.4129251700680272
Depth  :  6  Training Accuracy :  0.8895705521472392  Cross val score :  0.42721088435374144
Depth  :  7  Training Accuracy :  0.950920245398773  Cross val score :  0.46632653061224494
Depth  :  8  Training Accuracy :  0.9815950920245399  Cross val score :  0.4395833333333333
Depth  :  9  Training Accuracy :  0.9856850715746421  Cross val score :  0.45816326530612245
Depth  :  10  Training Accuracy :  0.9938650306748467  Cross val score :  0.45612244897959187


In [None]:
rf= RandomForestClassifier(max_depth=7,n_estimators=100,max_features="sqrt")
rf.fit(X_train, y_train)

rf.score(X_test,y_test)

In [None]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score
import numpy as np
for lr in [0.01,0.02,0.03,0.04,0.05,0.1,0.11,0.12,0.13,0.14,0.15,0.2,0.5,0.7,1]:
  model = xgb.XGBClassifier(learning_rate = lr, n_estimators=100, verbosity = 0)
  model.fit(X_train, y_train)
  print("Learning rate : ", lr," Train score : ", model.score(X_train,y_train)," Cross-Val score : ", np.mean(cross_val_score(model, X_train, y_train, cv=10)))


Learning rate :  0.01  Train score :  0.9693251533742331  Cross-Val score :  0.4232568027210884
Learning rate :  0.02  Train score :  0.9877300613496932  Cross-Val score :  0.4642006802721088
Learning rate :  0.03  Train score :  0.9938650306748467  Cross-Val score :  0.48052721088435363
Learning rate :  0.04  Train score :  0.9938650306748467  Cross-Val score :  0.4988520408163266
Learning rate :  0.05  Train score :  0.9959100204498977  Cross-Val score :  0.5009353741496598
Learning rate :  0.1  Train score :  0.9959100204498977  Cross-Val score :  0.5295493197278912
Learning rate :  0.11  Train score :  0.9959100204498977  Cross-Val score :  0.5029761904761905
