In [None]:
#import libraries
import numpy as np #used to quickly perform mathematical calculations on vectors
import re #regular expressions; used to clean the text data
import sqlite3 #used to interact with the database
import pandas as pd #allows us to work with data using Pandas dataframes
from collections import Counter #used to quickly count letters and words

In [None]:
#open a connection to the database
#NOTE(review): sqlite3.connect creates an empty database when the file is missing, in which
#case the query below fails because the Article table does not exist - confirm the path
conn = sqlite3.connect('Project 01 - Database.db')
try:
  #load all documents into a Pandas dataframe named 'df', and use the id column as the index
  sql = 'SELECT * FROM Article'
  df = pd.read_sql_query(sql, conn, index_col='id')
finally:
  #close the database connection even when the query above raises, so the handle never leaks
  conn.close()

In [None]:
#define a function that will clean the raw input text in preparation for analysis
def clean_text(raw_text):
  """Normalize raw text into lowercase words separated by single spaces.

  Digits, underscores, punctuation and other non-word symbols are each
  replaced by a space. Every run of whitespace (spaces, tabs, newlines) is
  then collapsed into one space and the ends are trimmed.

  Args:
    raw_text: the unprocessed text (str).

  Returns:
    The cleaned text (str). It is empty when nothing but removable
    characters and whitespace was present.
  """
  #convert the raw text to lowercase
  text = raw_text.lower()
  #replace digits and underscores with spaces (both count as word characters,
  #so the punctuation rule below would otherwise keep them)
  text = re.sub(r'[0-9_]', ' ', text)
  #replace anything else that isn't a word character or whitespace (e.g., punctuation, special symbols, etc.)
  text = re.sub(r'[^\w\s]', ' ', text)
  #collapse whitespace runs of any length and kind into a single space; a fixed number
  #of double-space replacements cannot handle very long runs, tabs, or newlines
  text = re.sub(r'\s+', ' ', text)
  #remove any leading or trailing space and return the clean text
  return text.strip()


#run clean_text over the raw text of every article and store the results, in row
#order, in a new 'clean_text' column of the dataframe
df['clean_text'] = list(map(clean_text, df['raw_text']))


#display the cleaned text of the first document in the dataframe
df['clean_text'].iloc[0]

'hiding women away in the home hidden behind veils is a backward view of islam president musharraf of pakistan has said during a visit to britain he was speaking to the bbc s newsnight programme a few hours before visiting the pakistani community in manchester my wife is travelling around she is very religious but she is very moderate said general musharraf it comes after pakistan s high commissioner to britain said some pakistanis should integrate more dr maleeha lodhi said people could not expect others to listen to their grievances if they isolated themselves gen musharraf told the bbc some people think that the women should be confined to their houses and put veils on and all that and they should not move out absolutely wrong the pakistani president was also asked whether he thought the war on terror had made the world less safe yes absolutely and i would add that unfortunately we are not addressing the core problems so therefore we can never address it in its totality he said we a

In [None]:
#build the corpus-wide vocabulary
#concatenate the cleaned text of every document, separated by single spaces
all_text = ' '.join(df['clean_text'])
#tokenise the combined text on whitespace
words = all_text.split()
#tally how often each distinct word occurs across the whole corpus
word_frequencies = Counter()
word_frequencies.update(words)
#the distinct words, in order of first appearance, form the vocabulary
vocabulary = [*word_frequencies]

In [None]:
#display the total number of unique words in the vocabulary; this is also the
#length of every per-document feature vector built from the vocabulary
len(vocabulary)

27762

In [None]:
#define a class that we can use to hold information about each document
class Document:
  """Record holding one document's labels and word-count features.

  Attributes:
    id: the document's unique ID number.
    category: the document's topic.
    predicted_category: starts as None; filled in after construction.
    total_words: the total number of words in the document.
    word_frequencies: raw frequencies for each word in the vocabulary.
    term_frequencies: starts as None; filled in after construction.
    tfidf_scores: starts as None; filled in after construction.
  """

  def __init__(self, id, category, word_frequencies, total_words):
    #values supplied by the caller
    self.id, self.category = id, category
    #not known at construction time
    self.predicted_category = None
    self.total_words, self.word_frequencies = total_words, word_frequencies
    #derived features, computed after construction
    self.term_frequencies = self.tfidf_scores = None

In [None]:
#put the vocabulary in sorted order so that feature positions are the same on every run
vocabulary.sort()
#collection that will hold one Document object per dataframe row
documents = []
for row in df.itertuples():
  #tokenise the (clean) text on whitespace and tally each distinct word
  words = row.clean_text.split()
  document_word_frequencies = Counter(words)
  #every token is tallied exactly once, so the number of tokens is the document's word total
  total_words = len(words)
  #build the raw-count feature vector over the WHOLE vocabulary rather than just the
  #words in this document, so that all documents get vectors of the same length with
  #corresponding elements. A Counter reports 0 for a word it has not seen, which
  #supplies the zero entries for vocabulary words absent from this document.
  vocabulary_word_frequencies = [
    document_word_frequencies[vocabulary_word] for vocabulary_word in vocabulary
  ]
  #record this document
  documents.append(
    Document(row.Index, row.category, vocabulary_word_frequencies, total_words)
  )

In [None]:
#for each document in the 'documents' collection
for document in documents:
  #compute length-normalised term frequencies for this document. The raw counts are
  #divided by the SQUARE ROOT of the document's word total (not by the word total
  #itself), so the values are damped counts rather than probabilities that sum to 1.
  #A document with no words has an all-zero count vector; dividing it by sqrt(0)
  #would give 0/0 = NaN for every element, and those NaNs would carry through to the
  #TF-IDF scores, the category averages and the distances. Clamping the word total
  #to at least 1 keeps such a document as an all-zero vector instead.
  document.term_frequencies = np.array(document.word_frequencies) / np.sqrt(max(document.total_words, 1))

  """


In [None]:
#calc idf scores for each vocab word: idf = log(number of documents / number of documents containing the word)
#document_frequencies[i] counts the documents in which vocabulary word i occurs at least once
document_frequencies = np.zeros(len(vocabulary), dtype=np.int64)
for d in documents:
  #one vectorised pass per document instead of a Python-level loop over every (word, document) pair
  document_frequencies += np.asarray(d.word_frequencies) > 0
#a word that occurs in no document has an undefined idf; fail loudly rather than
#letting numpy produce inf/NaN scores silently
if not document_frequencies.all():
  raise ZeroDivisionError('a vocabulary word does not occur in any document')
#keep the result as a plain list, one idf score per vocabulary word
idf_scores = np.log(len(documents) / document_frequencies).tolist()

In [None]:
#hold the idf scores as a numpy vector so they can be multiplied element-wise
idf_scores = np.array(idf_scores)
#each document's TF-IDF vector is its term frequencies times the idf scores, word by word
for d in documents:
  term_frequencies = np.array(d.term_frequencies)
  d.tfidf_scores = np.multiply(term_frequencies, idf_scores)

In [None]:
#per-topic accumulators, keyed by topic name:
#  category_tfidf_scores - the running sum (and, after the final loop, the average) of the
#                          TF-IDF vectors of the topic's documents; each entry starts as a
#                          zero vector with one element per vocabulary word
#  document_counts       - how many documents of that topic have been summed
topic_names = ('Business', 'Politics', 'Sports', 'Technology', 'Entertainment')
category_tfidf_scores = {topic: np.zeros(len(vocabulary)) for topic in topic_names}
document_counts = dict.fromkeys(topic_names, 0)
for d in documents:
  #documents whose topic is 'Unknown' contribute nothing to the averages
  if d.category == 'Unknown':
    continue
  document_counts[d.category] += 1
  #add this document's TF-IDF vector to the running sum for its topic
  category_tfidf_scores[d.category] += d.tfidf_scores
#turn each topic's summed vector into an average by dividing by the topic's document count
for category, document_count in document_counts.items():
  category_tfidf_scores[category] /= document_count

In [None]:
#define a function to compute the Euclidean distance between two points
#(where each point is defined as a vector)
def get_distance(point1, point2):
  """Return the Euclidean distance between two points given as numpy vectors."""
  #element-wise difference between the two points
  delta = point1 - point2
  #square root of the sum of the squared differences
  return np.sqrt(np.sum(delta * delta))

In [None]:
#for each article, predict the category whose average TF-IDF vector is nearest, and
#measure the accuracy of those predictions on the articles whose category is known
number_of_accurate_predictions = 0
number_of_known_articles = 0
for d in documents:
  min_distance = np.inf
  #reset the winner for every article, under the same name the loop assigns and reads
  #below; otherwise an article with no valid distance would silently inherit the
  #previous article's prediction
  best_category = None
  for category in category_tfidf_scores:
    distance = get_distance(d.tfidf_scores, category_tfidf_scores[category])
    #a NaN distance compares False here, so it can never be selected
    if distance < min_distance:
      min_distance = distance
      best_category = category
  #stays None when no category produced a valid distance
  d.predicted_category = best_category
  if d.category != 'Unknown':
    number_of_known_articles += 1
    if d.category == d.predicted_category:
      number_of_accurate_predictions += 1

#accuracy = fraction of known-category articles that were predicted correctly
if number_of_known_articles:
  print(number_of_accurate_predictions/number_of_known_articles)
else:
  print('no articles with a known category; accuracy is undefined')

  after removing the cwd from sys.path.


0.1836734693877551


In [None]:
#write one 'id,predicted_category' line for every article whose category is 'Unknown'
with open('Connolly, Sean.csv', 'w') as csvfile:
  csvfile.writelines(
    '{},{}\n'.format(d.id, d.predicted_category)
    for d in documents
    if d.category == 'Unknown'
  )