In [1]:
import os
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer # we won't be doing stemming and rather perform lemmatizing
from nltk.stem import WordNetLemmatizer
from stop_words import get_stop_words

from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv("./multiclass_classifier_data/boydstun_nyt_frontpage_dataset_1996-2006_0_pap2014_recoding_updated2018.csv").set_index("id").dropna(subset=['title'])

In [3]:
df.columns

Index(['year', 'majortopic', 'subtopic', 'title', 'summary'], dtype='object')

In [4]:
shuffled = df.sample(frac=1)

In [5]:
n_train = int(shuffled.shape[0] * .6)
n_develop = int(shuffled.shape[0] * .2)
n_test = int(shuffled.shape[0] * .2)

In [6]:
train_df = shuffled.iloc[:n_train].reset_index()
develop_df = shuffled.iloc[n_train:n_train+n_develop].reset_index()
test_df = shuffled.iloc[n_train+n_develop:n_train+n_develop+n_test].reset_index()

In [7]:
train_df

Unnamed: 0,id,year,majortopic,subtopic,title,summary
0,9747,1999,6,602,TERROR IN LITTLETON: THE INVESTIGATION;,columbine massacre; sheriff acknowledges not f...
1,28630,2006,20,2013,"Come October, Baby Will Make 300 Million or So",rapidly growing US population
2,21576,2003,19,1921,AFTEREFFECTS: IRAN'S INFLUENCE; CLERIC IN IRA...,Iran calls on Iraqi Shiites to take power
3,27411,2005,20,2012,"For Lobbyist, a Seat of Power Came With a Plate",Abramoff's dealings in Washington
4,18146,2002,19,1921,"A Test of Wills, A Deep Divide",arab-israeli conflict
...,...,...,...,...,...,...
18613,1759,1996,10,1003,THE FATE OF FLIGHT 800: THE AIRLINE; Slowness ...,TWA crash investigation
18614,17789,2002,16,1619,Walker Arrives in Virginia,American Taliban arrives in Virginia
18615,6659,1998,12,1201,Flush but Crime-Wary Cities Bid Up Pay for Pol...,Cities bid up pay to keep police chiefs
18616,23461,2003,21,2101,"INSIDE; A Vast Museum Opens, Not Without Pro...",National Air and Space museum opens


In [8]:
vocab = {}

# set of stopwords, we want to remove
stop_words = set(get_stop_words('en'))

# our lemmatizer
lemmatizer = WordNetLemmatizer()


for i, row in develop_df.iterrows():
    words = word_tokenize(row['title'])

    for word in words:
        word = word.lower()
        if word not in stop_words and word.isalpha():
            token = str(lemmatizer.lemmatize(word))
            if vocab.get(token):
                vocab[token] += 1
            else:
                vocab[token] = 1

In [9]:
vocab

{'second': 20,
 'special': 100,
 'report': 137,
 'death': 87,
 'lesson': 6,
 'policing': 2,
 'doctor': 27,
 'delivering': 1,
 'justice': 45,
 'mix': 3,
 'trial': 49,
 'error': 5,
 'morgue': 2,
 'ceaselessly': 1,
 'sifting': 3,
 'trace': 5,
 'nature': 5,
 'group': 26,
 'say': 183,
 'foe': 25,
 'bear': 5,
 'friendly': 1,
 'name': 19,
 'black': 30,
 'writer': 5,
 'seize': 4,
 'glamorous': 1,
 'ground': 24,
 'around': 12,
 'lit': 1,
 'buy': 14,
 'best': 10,
 'access': 10,
 'congress': 49,
 'moving': 5,
 'border': 19,
 'wed': 2,
 'rich': 11,
 'poor': 25,
 'town': 45,
 'hemingway': 1,
 'bullfight': 1,
 'tale': 8,
 'turn': 35,
 'denomination': 1,
 'one': 59,
 'cleric': 11,
 'fill': 14,
 'need': 17,
 'politics': 80,
 'strategy': 24,
 'state': 81,
 'florida': 39,
 'victory': 24,
 'sure': 5,
 'thing': 5,
 'campaign': 122,
 'impression': 1,
 'brainy': 1,
 'robot': 1,
 'start': 20,
 'stepping': 3,
 'daily': 6,
 'life': 58,
 'firm': 10,
 'fault': 6,
 'fairness': 2,
 'aid': 73,
 'fatal': 9,
 'nightc

In [10]:
sorted_x = sorted(vocab.items(), key=lambda kv: kv[1])
len(sorted_x)

7128

In [11]:
top100 = sorted_x[-100:]
top1000 = sorted_x[-1000:]
top2000 = sorted_x[-2000:]
top3000 = sorted_x[-3000:]
top5000 = sorted_x[-5000:]

In [13]:
num = [100]

for i, n in enumerate(num):
    vocab = sorted_x[-n:]
    vocab = list(map(lambda x: x[0], vocab))
    
    vector = pd.DataFrame(0, index=np.arange(train_df.shape[0]), columns=vocab)
    for i, row in train_df.iterrows():
        words = word_tokenize(row['title'])
        if len(words) == 0:
            print(row)
        for word in words:
            word = word.lower()
            for v in vocab:
                if word == v:
                    vector.at[i, v] += 1

In [14]:
y = train_df['majortopic']
X = vector

In [15]:
from sklearn.linear_model import LogisticRegression
X

Unnamed: 0,million,israeli,business,dead,said,testing,right,now,fear,show,...,president,overview,nation,war,clinton,plan,say,iraq,bush,new
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18613,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18614,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18615,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18616,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
clf = LogisticRegression(penalty='none', max_iter=10000).fit(X, y)

In [17]:
num = [100]

for i, n in enumerate(num):
    vocab = sorted_x[-n:]
    vocab = list(map(lambda x: x[0], vocab))
    
    vector2 = pd.DataFrame(0, index=np.arange(test_df.shape[0]), columns=vocab)
    for i, row in test_df.iterrows():
        words = word_tokenize(row['title'])
        if len(words) == 0:
            print(row)
        for word in words:
            word = word.lower()
            for v in vocab:
                if word == v:
                    vector.at[i, v] += 1

### No penalty

In [None]:
num = [2500, 5000, 7000]

for i, n in enumerate(num):
    vocab = sorted_x[-n:]
    vocab = list(map(lambda x: x[0], vocab))
    print("Using vocab size: %i" % n)
    vector = pd.DataFrame(0, index=np.arange(train_df.shape[0]), columns=vocab)
    for i, row in train_df.iterrows():
        if (i % 5000 == 0):
            print("Progress in processing training data: %f" % (i*1./train_df.shape[0]))
        words = word_tokenize(row['title'])
        if len(words) == 0:
            print(row)
        for word in words:
            word = word.lower()
            for v in vocab:
                if word == v:
                    vector.at[i, v] += 1
    vector2 = pd.DataFrame(0, index=np.arange(test_df.shape[0]), columns=vocab)
    for i, row in test_df.iterrows():
        if(i % 5000 == 0):
            print("Progress in processing testing data: %f" % (i*1./test_df.shape[0]))
        words = word_tokenize(row['title'])
        if len(words) == 0:
            print(row)
        for word in words:
            word = word.lower()
            for v in vocab:
                if word == v:
                    vector2.at[i, v] += 1
    y = train_df['majortopic']
    X = vector
    clf = LogisticRegression(penalty='none', max_iter=10000).fit(X, y)
    y_cal = pd.DataFrame(data=clf.predict(vector2), columns=['prediction'])
    x = pd.concat([test_df, y_cal], axis=1)
    correct = x[x['majortopic'] == x['prediction']].shape[0]
    total = x.shape[0]
    print("Accuracy: %.4f" % (correct/total*1.))
    
    topics = list(set(x['majortopic'].values.tolist()))
    precision = 0
    recall = 0
    p_n = 0
    r_n = 0
    for topic in topics:
        true_positive = x[(x['majortopic'] == topic) & (x['prediction'] == topic)].shape[0]
        all_predicted = x[x['prediction'] == topic].shape[0]
        all_correct = x[x['majortopic'] == topic].shape[0]
        if(all_predicted > 0):
            p_n += 1
            precision += (true_positive / all_predicted)
        if (all_correct > 0):
            r_n += 1
            recall += (true_positive / all_correct)
    print("Precision: %.4f" % (precision/p_n))
    print("Recall: %.4f" % (recall/r_n))
    print("F-Measure: %.4f" % (1./((1/(recall/r_n)) + (1/(precision/p_n)))))

Using vocab size: 2500
Progress in processing training data: 0.000000
Progress in processing training data: 0.268557


### L2 Penalty

In [18]:
num = [100, 1000, 2500, 5000, 7000]

for i, n in enumerate(num):
    vocab = sorted_x[-n:]
    vocab = list(map(lambda x: x[0], vocab))
    print("Using vocab size: %i" % n)
    vector = pd.DataFrame(0, index=np.arange(train_df.shape[0]), columns=vocab)
    for i, row in train_df.iterrows():
        if (i % 5000 == 0):
            print("Progress in processing training data: %f" % (i*1./train_df.shape[0]))
        words = word_tokenize(row['title'])
        if len(words) == 0:
            print(row)
        for word in words:
            word = word.lower()
            for v in vocab:
                if word == v:
                    vector.at[i, v] += 1
    vector2 = pd.DataFrame(0, index=np.arange(test_df.shape[0]), columns=vocab)
    for i, row in test_df.iterrows():
        if(i % 5000 == 0):
            print("Progress in processing testing data: %f" % (i*1./test_df.shape[0]))
        words = word_tokenize(row['title'])
        if len(words) == 0:
            print(row)
        for word in words:
            word = word.lower()
            for v in vocab:
                if word == v:
                    vector2.at[i, v] += 1
    y = train_df['majortopic']
    X = vector
    clf = LogisticRegression(penalty='l2', max_iter=10000).fit(X, y)
    y_cal = pd.DataFrame(data=clf.predict(vector2), columns=['prediction'])
    x = pd.concat([test_df, y_cal], axis=1)
    correct = x[x['majortopic'] == x['prediction']].shape[0]
    total = x.shape[0]
    print("Accuracy: %.4f" % (correct/total*1.))
    
    topics = list(set(x['majortopic'].values.tolist()))
    precision = 0
    recall = 0
    p_n = 0
    r_n = 0
    for topic in topics:
        true_positive = x[(x['majortopic'] == topic) & (x['prediction'] == topic)].shape[0]
        all_predicted = x[x['prediction'] == topic].shape[0]
        all_correct = x[x['majortopic'] == topic].shape[0]
        if(all_predicted > 0):
            p_n += 1
            precision += (true_positive / all_predicted)
        if (all_correct > 0):
            r_n += 1
            recall += (true_positive / all_correct)
    print("Recall: %.4f" % (recall/r_n))
    print("Precision: %.4f" % (precision/p_n))

Unnamed: 0,million,israeli,business,dead,said,testing,right,now,fear,show,...,president,overview,nation,war,clinton,plan,say,iraq,bush,new
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18613,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18614,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18615,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18616,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


array([20, 19, 19, ...,  3, 19, 16])

In [19]:
d = test_df

In [20]:
y_cal = pd.DataFrame(data=clf.predict(vector), columns=['prediction'])

In [21]:
x = pd.concat([d, y_cal], axis=1)
correct = x[x['majortopic'] == x['prediction']].shape[0]
total = x.shape[0]

In [22]:
correct/total*1.

0.09528413363411752