# Task 1:

Load the packages and data

In [7]:
#import libraries
import numpy as np #used to quickly perform mathematical calculations on vectors
import pandas as pd #allows us to work with data using Pandas dataframes
import re #regular expressions -- used in this assignment to clean the text data
import sqlite3 #used to interact with the database
from collections import Counter #used to quickly count letters and words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering, KMeans #used to perform agglomerative and divisive clustering
from sklearn.metrics.pairwise import cosine_similarity #used to compute cosine similarities between documents

In [2]:
#open a connection to the database and load every row of the Article table into a
#dataframe indexed by the table's 'id' column
conn = sqlite3.connect('Project 01 - Database.db')
try:
  sql = 'SELECT * FROM Article'
  df = pd.read_sql_query(sql, conn, index_col='id')
finally:
  #close the database connection even when the query above raises, so the
  #connection (and its lock on the database file) is never leaked
  conn.close()

# Task 2:

Preprocess (clean) each article’s text so that it is suitable for analysis.

In [3]:
#define a function that will clean the raw input text in preparation for analysis. Returns a tuple containing
#both the cleaned text and the total number of words in the cleaned text.
def get_clean_text(raw_text):
  """Clean one article's raw text and count the words that remain.

  Cleaning steps, in order:
    1. Periods are removed from period-separated acronyms ('U.S.A.' -> 'USA').
    2. Digits, underscores, punctuation and other non-word characters are
       treated as word separators.
    3. Every word is lowercased unless it looks like an acronym, i.e. it is
       two to six characters long and entirely uppercase.

  Parameters:
    raw_text: the article text as a str. Any non-str value (e.g. None or NaN
      coming from a NULL database column) is treated as an empty article.

  Returns:
    A tuple (clean_text, total_words) where clean_text is the cleaned words
    joined by single spaces and total_words is the number of those words.
  """
  #missing values are not strings and cannot be processed by the regular expressions below
  if not isinstance(raw_text, str):
    return ('', 0)
  #remove the periods from any period-separated acronyms (e.g., 'U.S.A', 'L.A.', etc.) in a single pass
  text = re.sub(r'(?:[A-Z]\.){2,}', lambda match: match.group(0).replace('.', ''), raw_text)
  #turn every run of non-word characters, digits and underscores into one space. \d also covers
  #non-ASCII decimal digits, which a plain [0-9] would let through because \w accepts them
  text = re.sub(r'[\W\d_]+', ' ', text)
  #split on whitespace; split() already ignores leading, trailing and repeated whitespace
  words = text.split()
  #keep short all-uppercase words (assumed to be acronyms) as they are and lowercase everything else
  cleaned_words = [
    word if (2 <= len(word) <= 6 and word.isupper()) else word.lower()
    for word in words
  ]
  #return the cleaned text and the number of words in the cleaned text
  return (' '.join(cleaned_words), len(cleaned_words))


In [4]:
#clean the raw text of each article and save the resulting cleaned text and total number of words for
#each article in new dataframe columns named 'clean_text' and 'total_words'.
#The (text, count) tuples are first collected into a dataframe that shares df's index so that each
#column keeps its own dtype (strings vs. integers); assigning the raw list of mixed tuples directly
#routes the values through a single object array, which can leave 'total_words' as an object column.
#This form also works when df has no rows.
df[['clean_text', 'total_words']] = pd.DataFrame(
  [get_clean_text(article_raw_text) for article_raw_text in df.raw_text],
  columns=['clean_text', 'total_words'],
  index=df.index,
)

# Task 3:

Compute the term frequency - inverse document frequency (TF-IDF) for each word in the vocabulary.

Term Frequency (TF):
For each word $w$ appearing in a document $d$, the TF for $w$ in $d$ is computed as $\mathrm{TF}(w, d) = \dfrac{F_{w,d}}{N_d}$, where $F_{w,d}$ is the number of times the word $w$ appears in document $d$, and $N_d$ is the total number of words in $d$.

Inverse Document Frequency (IDF):
For each word $w$ in the vocabulary, the IDF for $w$ is computed as $\mathrm{IDF}(w) = \log\left(\dfrac{N}{N_w}\right)$, where $N$ is the total number of documents in the corpus and $N_w$ is the number of documents in the corpus that contain the word $w$.

scikit-learn's TfidfVectorizer is a useful package here.

In [5]:
#fit the vectorizer on the cleaned articles to learn the vocabulary of unique words and to
#score every article against it. lowercase=False stops the vectorizer from lowercasing the
#uppercase acronyms that the cleaning step deliberately kept.
#NOTE: with its default settings TfidfVectorizer uses a smoothed IDF and L2-normalises each
#row, so the scores are not exactly the plain TF x IDF product.
vectorizer = TfidfVectorizer(lowercase=False)
article_tfidf_scores = vectorizer.fit_transform(df['clean_text']).toarray()
vocabulary = vectorizer.vocabulary_

#store one dense row of TF-IDF scores per article in the dataframe
df['tfidf_scores'] = list(article_tfidf_scores)

# Task 4:

Use Hierarchical Agglomerative Clustering to group the articles into 5 clusters, one for each news subject.

In [8]:
#group the articles into five clusters with hierarchical agglomerative clustering,
#then record each article's cluster assignment in the dataframe
model = AgglomerativeClustering(n_clusters=5)
cluster_ids = model.fit_predict(df['tfidf_scores'].to_list())
df['agglomerative_cluster_id'] = list(cluster_ids)

# Task 5:

Use Divisive Clustering to group the articles into 5 clusters, using a K-means approach.

In [9]:
#group the articles into five clusters with k-means (the divisive approach for this task),
#using a fixed random_state so the assignments are repeatable, then record each article's
#cluster assignment in the dataframe
model = KMeans(n_clusters=5, random_state=321)
cluster_ids = model.fit_predict(df['tfidf_scores'].to_list())
df['divisive_cluster_id'] = list(cluster_ids)



In [10]:
#Preview the cluster assignments produced by the two approaches. An ensemble of both, with a
#tie-breaker rule, could be a useful extension.
#Note that the cluster IDs are arbitrary labels: no ID was tied to a topic, so the same group of
#articles can carry a different ID under each approach.
df.head(n=5)

Unnamed: 0_level_0,category,raw_text,clean_text,total_words,tfidf_scores,agglomerative_cluster_id,divisive_cluster_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
6347,Politics,Hiding women away in the home hidden behind ve...,hiding women away in the home hidden behind ve...,454,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,2
13840,Sports,Celtic brushed aside Clyde to secure their pla...,celtic brushed aside clyde to secure their pla...,495,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3,1
14775,Unknown,"If you have finished Doom 3, Half Life 2 and H...",if you have finished doom half life and halo d...,499,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4,2
16641,Unknown,Controversial new UK casinos will be banned fr...,controversial new UK casinos will be banned fr...,274,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,2
17511,Unknown,Justine Henin-Hardenne lost to Elena Dementiev...,justine henin hardenne lost to elena dementiev...,318,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,1
