## This file contains functions used in the Naive Bayes Classifier code

Created by Patrick Steeves for Independent Study with Professor Kanungo <br>
George Washington University, 12/23/2017

In [None]:
import urllib.request
import zipfile
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import time
import re
import numpy as np
from collections import Counter
import math

Import data from GitHub

In [None]:
def importData():
    """
    Download the two zipped news CSV files from GitHub and load them.

    Each archive is saved to the current working directory, extracted there,
    and the CSV named after the archive is read with latin1 encoding.

    Returns:
        A single DataFrame with the rows of news1.csv followed by the rows of
        news2.csv, re-indexed from 0.
    """
    # Pull data from repository
    url = "https://github.com/psteeves/NLP-projects/raw/master/Naive%20Bayes%20Topic%20Classifier/Data/"

    frames = []
    for name in ('news1', 'news2'):
        archive_name = name + '.zip'
        urllib.request.urlretrieve(url + archive_name, filename=archive_name)

        # The context manager closes the archive even if extraction fails
        with zipfile.ZipFile(archive_name, 'r') as archive:
            archive.extractall()

        frames.append(pd.read_csv(name + '.csv', encoding='latin1'))

    return pd.concat(frames).reset_index(drop=True)

<br><br>Clean titles by stemming and removing stopwords

In [None]:
def cleanWords(df, title_col):
    """
    Clean headlines by stemming and removing stopwords.

    Every title is reduced to alphabetical characters, lower-cased, stripped
    of English stopwords and stemmed with the Porter stemmer.

    Args:
        df: DataFrame holding the headlines. It is modified in place: a
            'STOPPED_WORDS' column is added (or overwritten).
        title_col: Name of the column of df that contains the raw titles.

    Returns:
        The same DataFrame, where 'STOPPED_WORDS' holds the cleaned words of
        each title joined by commas.
    """
    start = time.time()   # Time how long cleaning takes
    stopped_words = []    # List of titles converted to stopped words

    print("Started cleaning headlines...")
    stemmer = PorterStemmer()
    # Build the stopword set once: reloading the list for every word is very slow
    stop_set = set(stopwords.words('english'))
    non_alpha = re.compile('[^a-zA-Z]+')
    n_titles = len(df)

    checkpoint = time.time()
    for n_done, title in enumerate(df[title_col], start=1):
        cleaned_title = non_alpha.sub(' ', title).lower()    # Only keep alphabetical characters
        words = [stemmer.stem(word) for word in cleaned_title.split() if word not in stop_set]   # Stem and filter stopwords
        stopped_words.append(','.join(words))   # Append cleaned words to list of cleaned titles

        if time.time() - checkpoint > 600:   # Update user on progress every 10min
            # Use the row position, not the index label, so the percentage is right for any index
            print("Done cleaning {:2.1f}% of headlines".format(100*n_done/n_titles))
            checkpoint = time.time()

    df['STOPPED_WORDS'] = stopped_words   # Add cleaned column to dataframe

    print("Took {:4.1f} minutes to clean titles".format((time.time()-start)/60))

    return df

<br><br>Classifier trained on a given series of titles and a matching series of topics

In [None]:
class NBClassifier:
    """
    Naive Bayes classifier trained to perform topic classification.

    Titles are handled as comma-separated strings of cleaned words. The data
    is kept in a two-column dataframe: column 0 holds the titles and column 1
    holds the categories.
    """
    def __init__(self, titles, categories, train_split = 1):
        """
        Store the data, split it into train/test sets and count the words.

        Args:
            titles: Series of cleaned titles (comma-separated words).
            categories: Series of topics, aligned with titles.
            train_split: Each row is assigned to the training set with this
                probability; the remaining rows form the test set. With the
                default of 1 every row is used for training.
        """
        self.data = pd.concat([titles,categories], axis=1)              # Dataframe of cleaned titles and cats
        train_idx = np.random.rand(len(self.data)) < train_split        # Training data rows
        self.train_data = self.data.loc[train_idx,:].copy()
        self.test_data = self.data.loc[~train_idx,:].copy()

        # Count every word occurrence over all titles (train and test)
        counts = Counter(word for row in titles for word in row.split(','))

        self.total_words = sum(counts.values())    # Number of words in titles, including duplicates
        self.word_count = dict(counts)
        self.common_words = {w:c for w,c in self.word_count.items() if c > 5}  # Only keep words that appear more than 5 times
        self.unique_words = self.common_words.keys()
        self.categories = set(categories)
        self.pdf = {}               # Word counts over all categories, to be trained later
        self.words_per_cat = {}     # Number of words per category

        self.train_accuracy = None
        self.test_accuracy = None
        self.trained = False        # Indicator if classifier has already been trained
        self.misclassified = None   # Misclassified titles from test set, or from training set if no train/test split


    def trainPDF(self):
        """
        Update word count and total number of words in each category,
        using the training data only.
        """
        train_cats = self.train_data.iloc[:,1]
        for i, cat in enumerate(self.categories, start=1):
            print("Creating PDf for topic {}/{}".format(i,len(self.categories)))
            cat_titles = self.train_data.loc[train_cats == cat].iloc[:,0]

            # Count every word of every title belonging to this category
            cat_counts = Counter(word for row in cat_titles for word in row.split(','))
            self.pdf[cat] = dict(cat_counts)
            self.words_per_cat[cat] = sum(cat_counts.values())

        self.trained = True


    def getProb(self, cat, word, laplace_smooth):
        """
        Compute probability of word conditioned on category, adding Laplacian smoothing.

        Args:
            cat: Category whose trained word counts are used.
            word: Word to look up; words never seen in the category count as 0.
            laplace_smooth: Smoothing constant added to the word count.

        Returns:
            (count of word in cat + laplace_smooth) /
            (words in cat + laplace_smooth * total words over all titles)
        """
        # NOTE(review): the smoothing term of the denominator uses the total
        # number of word occurrences over all titles, not the number of
        # distinct words -- confirm this is intended.
        count = self.pdf[cat].get(word, 0)
        return (count + laplace_smooth) / (self.words_per_cat[cat] + laplace_smooth*self.total_words)


    def predictCats(self, title, already_stopped = False):
        """
        Given a title, score every category for that title.

        Args:
            title: Title to classify.
            already_stopped: If True, title is taken to be a comma-separated
                string of cleaned words. Otherwise the title is cleaned first
                (letters only, lower-cased, stopwords removed, stemmed).

        Returns:
            Dictionary mapping each category to the sum of the log
            probabilities of the title's words in that category.
        """
        if already_stopped:   # If title fed is already stopped
            words = title.split(',')
        else:
            stemmer = PorterStemmer()
            stop_set = set(stopwords.words('english'))   # Load the stopword list once per title
            cleaned_title = re.sub('[^a-zA-Z]+',' ', title).lower()
            words = [stemmer.stem(word) for word in cleaned_title.split() if word not in stop_set]

        preds = {}
        for cat in self.categories:
            # Sum log probabilities for each category using Naive Bayes
            preds[cat] = sum(math.log(self.getProb(cat, word, 1)) for word in words)

        return preds


    def _predictFrame(self, frame):
        """
        Add a 'PREDICTED' column to frame holding the best-scoring category of each title.
        """
        predicted = []
        n_rows = len(frame)
        checkpoint = time.time()

        for n_done, title in enumerate(frame.iloc[:,0], start=1):
            scores = self.predictCats(title, already_stopped = True)
            predicted.append(max(scores, key = scores.get))

            if time.time() - checkpoint > 120:     # Update user on progress every 2 minutes
                # Row position over row count: correct for any index and never divides by zero
                print("{:2.1f}% complete".format(100*n_done/n_rows))
                checkpoint = time.time()

        frame['PREDICTED'] = predicted


    @staticmethod
    def _accuracy(frame):
        """
        Return the share of rows whose prediction equals column 1, or None for an empty frame.
        """
        if len(frame) == 0:
            return None
        return float((frame['PREDICTED'] == frame.iloc[:,1]).mean())


    def classifyData(self):
        """
        Using trained Classifier, predict training and testing data.

        Fills the 'PREDICTED' column of the train and test data and sets
        train_accuracy, test_accuracy (only when there is test data) and
        misclassified (wrong test rows, or wrong training rows when there is
        no test data). Does nothing but print a message if the classifier
        has not been trained.
        """
        if not self.trained:
            print("You must train the classifier first")
            return

        has_test = len(self.test_data) > 0

        print("Predicting train data")
        self._predictFrame(self.train_data)

        if has_test:
            print("Predicting test data")
            self._predictFrame(self.test_data)

        self.train_accuracy = self._accuracy(self.train_data)

        # Misclassified rows come from the test set when there is one
        scored = self.test_data if has_test else self.train_data
        if has_test:
            self.test_accuracy = self._accuracy(self.test_data)
        self.misclassified = scored.loc[scored['PREDICTED'] != scored.iloc[:,1]]