In [None]:
import time
import urllib
import urllib.parse

import pandas as pd
import requests
from bs4 import BeautifulSoup

## Collect Data

<p class="big">
<font size="4">
In this section we download article pages from the popular news site Ynet. <br><br>
We use BeautifulSoup to parse the HTML structure of each page. <br><br>
Try it Yourself »
</font>
</p>


In [None]:
# Extract text and image features from an article.
def site_article_feature_extract(url, timeout=10):
    """Download one article page and pull out its text and image features.

    Parameters
    ----------
    url : str
        Absolute URL of the article page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    dict
        ``headline`` and ``sub_headline`` (text of the matching header divs),
        ``article_text`` (the stripped text of every ``<p>`` joined by single
        spaces) and ``images`` (list of ``src`` values that contain
        ``http://images``).

    Raises
    ------
    requests.RequestException
        If the request fails, times out or returns an HTTP error status.
    ValueError
        If the page has no ``div`` with class ``block B3``.
    AttributeError
        If the headline or sub-headline element is missing from the page.
    """
    page = requests.get(url, timeout=timeout)
    # Surface HTTP errors here instead of trying to parse an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "lxml")

    # Remove all <script> elements so their source never ends up in the text.
    for script in soup("script"):
        script.decompose()

    # All features are read from the first 'block B3' container.
    page_content = soup.find('div', class_='block B3')
    if page_content is None:
        raise ValueError('no "block B3" article container found in {}'.format(url))

    features = {}
    features['headline'] = page_content.find('div', class_='art_header_title').get_text()
    features['sub_headline'] = page_content.find('div', class_='art_header_sub_title').get_text()
    features['article_text'] = " ".join(
        paragraph.get_text().strip() for paragraph in page_content.find_all('p')
    )
    # Use .get(): an <img> without a src attribute must not abort the whole article.
    features['images'] = [
        img['src']
        for img in page_content.find_all('img')
        if 'http://images' in img.get('src', '')
    ]

    return features

In [None]:
# Archive page listing the articles from January 2017.
site_url = 'http://www.ynet.co.il/home/0,7340,L-4269-141-344-201701-1,00.html'
# A timeout keeps the notebook from hanging forever on an unresponsive server.
page = requests.get(site_url, timeout=10)
# Fail here with the HTTP status rather than parse an error page into an
# empty list of article links further down.
page.raise_for_status()
soup = BeautifulSoup(page.content, "lxml")

In [None]:
# Articles in this site's archive are the <a> elements with class "smallheader".
archive_page = soup.find_all('a', class_='smallheader')
# Build absolute URLs. Anchors without an href are skipped: urljoin() would
# resolve them to the bare site root, which is not an article page.
article_urls = [
    urllib.parse.urljoin("http://www.ynet.co.il", article['href'])
    for article in archive_page
    if article.get('href')
]

In [None]:
# Scrape the articles one by one and collect their features.
article_columns = ['headline', 'sub_headline', 'article_text', 'images']
records = []       # one feature dict per successfully parsed article
failed_urls = []   # (url, error) pairs, reported after the loop

for i, url in enumerate(article_urls, start=1):
    # end='' together with the leading '\r' keeps the progress on a single line.
    print('\r extracting: {} , {} out of {}'.format(url, i, len(article_urls)),
          end='', flush=True)
    time.sleep(1)  # pause between requests (presumably to go easy on the server)
    try:
        records.append(site_article_feature_extract(url))
    except Exception as exc:
        # Best effort: a page that cannot be fetched or parsed is skipped, but
        # remembered so it can be reported. Catching Exception (not a bare
        # except) lets Ctrl-C still interrupt the loop.
        failed_urls.append((url, repr(exc)))

print()  # terminate the progress line
if failed_urls:
    print('skipped {} of {} articles:'.format(len(failed_urls), len(article_urls)))
    for failed_url, reason in failed_urls:
        print('  {} -> {}'.format(failed_url, reason))

# Build the frame once from all records instead of growing it row by row.
df_articles = pd.DataFrame(records, columns=article_columns)

In [None]:
# Preview the first rows of the scraped articles.
df_articles.head()

In [None]:
# Save the scraped articles; the preprocessing section reloads this file.
# NOTE(review): the 'images' column holds Python lists, which CSV stores as
# their text representation - they will presumably come back as plain strings
# when the file is read again; confirm if the image URLs are needed later.
df_articles.to_csv('./df_articles.csv', index=False, encoding='utf-8')

## Preprocess data

After downloading the raw data we need to preprocess it. <br>
For text analysis we usually "clean" the text by removing punctuation, extra whitespace and other symbols, leaving only words.

In [None]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import PCA
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Reload the articles written to ./df_articles.csv by the data-collection section.
df_articles = pd.read_csv('./df_articles.csv' ,encoding='utf-8')

In [None]:
# Concatenate all text fields into one column.
# NOTE(review): the right-hand side is blank (appears to be left for the reader
# as an exercise), so this cell is a syntax error until it is completed.
# Text fields that were empty in the CSV presumably load as NaN - confirm and
# handle them before concatenating.
df_articles['all_text'] = 

In [None]:
# Clean the text; the stated goal is to keep only letters, digits and spaces.
# NOTE(review): as written the character class contains nothing but '?', so
# only literal question marks are removed - looks like a placeholder for the
# reader to replace; confirm.
pattern = re.compile(u"[?????]")


def _strip_pattern_matches(row):
    """Return the row's 'all_text' value with every `pattern` match deleted."""
    return pattern.sub("", row['all_text'])


df_articles['all_text'] = df_articles.apply(_strip_pattern_matches, axis=1)

### explore the data

In [None]:
count_vectorizer = CountVectorizer()
# Exercise: fit this count vectorizer to obtain the frequency of each word in
# our dataset, then save the result as `df_words`, a pd.DataFrame with the two
# columns [word, count] (the following cells read `df_words['count']`).

In [None]:
# Show the five most frequent words, highest count first.
df_words.sort_values('count', ascending=False).head(5)

In [None]:
# Plot a histogram of the word frequencies.
# NOTE(review): `kind=` has no value (appears to be left for the reader to
# fill in), so this cell is a syntax error until a plot kind is supplied.
df_words['count'].plot(kind=)
plt.show()

## Extract Features

Here we will fit a <i>tf-idf</i> transformation to our data. <br>
Each document will be represented as a vector with one dimension per word in the vocabulary V. <br>
Each entry of a document's feature vector is the tf-idf score of the corresponding word in V for that sample (document). <br>
After extracting the <i>tf-idf</i> features, we reduce their dimensionality with a technique called <b>PCA</b>.

In [None]:
# TF-IDF vectorizer. `min_df` and `max_df` are still `?` placeholders (they
# appear to be left for the reader to choose); the cell does not parse until
# both are given values.
tf_idf = TfidfVectorizer(min_df=?, max_df=?)

In [None]:
# Exercise: compute the tf-idf matrix of the documents (right-hand side is blank).
tf_idf_vectors = 
# NOTE(review): the variable is called `svd` but holds a PCA instance, and
# `n_components` is still a `?` placeholder. TfidfVectorizer output is a sparse
# matrix - confirm that PCA in the installed scikit-learn accepts it, or
# convert it to a dense array first.
svd = PCA(n_components=?, random_state=2017)
# Replace the tf-idf matrix with its projection onto the fitted components.
tf_idf_vectors = svd.fit_transform(tf_idf_vectors)
# One column per component, named tf_idf_0, tf_idf_1, ...
df_vectors = pd.DataFrame(tf_idf_vectors, columns=["tf_idf_" +str(x) for x in range(tf_idf_vectors.shape[1])])
# Keep each article's headline next to its vector (used when building df_clusters).
df_vectors['headline'] = df_articles['headline']

## K-Means 

<font size=4> Finally!! </font> <br> <br>
After cleaning the text and extracting features we can go ahead and use K-Means to find clusters in our dataset

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Feature columns: every column of df_vectors except the 'headline' label.
features = [column for column in df_vectors.columns if column != 'headline']

In [None]:
# Number of clusters to look for - still a `?` placeholder (appears to be left
# for the reader to choose).
n_clusters = ?
# NOTE(review): the first argument is also a placeholder; presumably
# `n_clusters` is meant to be passed here - confirm.
# random_state is fixed so repeated runs give the same clustering.
kmeans = KMeans(?, random_state=2017)

In [None]:
# Fit k-means to our data, then use the predict method to get the cluster of
# each sample.
# NOTE(review): both `?` are placeholders for the reader. fit_transform() also
# returns the samples in cluster-distance space, which is discarded here -
# plain fit() would be enough if only the labels are needed; confirm.
kmeans.fit_transform(?)
clusters = ?

In [None]:
# Pair every headline with the cluster assigned to its document.
df_clusters = pd.DataFrame({
    'headline': df_vectors['headline'],
    'cluster': clusters,
})

In [None]:
# Let's see what we got: the headlines that were assigned to cluster 1.
df_clusters.loc[df_clusters['cluster'].eq(1)]