In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.externals import joblib
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import TruncatedSVD

In [2]:
labels = pd.read_pickle('labels_df')

In [3]:
sample = labels.ix[0:10]

In [4]:
explode_probabilities = lambda x: pd.Series({j: k['probability'] for j, k in x.items()})

labels = pd.concat([labels, labels.countries.apply(explode_probabilities)], axis=1)

In [5]:
non_country_columns = ['raw_text_url', 'title', 'toc_subject', 'topics', 'raw_text', 'entities', 'countries']
country_columns = sorted(list(set(labels.columns.tolist()) - set(non_country_columns)))

In [28]:
labels.head()

Unnamed: 0,raw_text_url,title,toc_subject,topics,raw_text,entities,pre-labeled countries,Afghanistan,Albania,Algeria,...,Uruguay,Uzbekistan,Vanuatu,"Venezuela, Bolivarian Republic of",Viet Nam,"Virgin Islands, British","Virgin Islands, U.S.",Yemen,Zambia,Zimbabwe
0,https://www.federalregister.gov/articles/text/...,Culturally Significant Objects Imported for Ex...,Culturally Significant Objects Imported for Ex...,[],\nSUMMARY: \nNotice is hereby given of the fol...,"[(Mauritshuis, ORGANIZATION), (United States, ...","{u'Canada': {u'count': 1, u'probability': 0.07...",0.002095,0.000932,0.001163,...,0.071429,0.000988,0.000697,0.071429,0.00719,0.000977,0.005605,0.001408,0.000666,0.001488
1,https://www.federalregister.gov/articles/text/...,Proposed Collection: Comment Request,,[],\nACTION: \nNotice and request for comments. \...,"[(Treasury, ORGANIZATION), (Treasury, ORGANIZA...","{u'United States': {u'count': 1, u'probability...",0.002207,0.001381,0.001117,...,7.9e-05,0.000997,0.000319,0.003269,0.009604,0.000657,0.004183,0.001346,0.000666,0.001565
2,https://www.federalregister.gov/articles/text/...,National Medal of Technology and Innovation Ca...,Calls for Nominations:,[],\nACTION: \nNotice and request for nominations...,"[(Department of Commerce, ORGANIZATION), (Unit...","{u'Canada': {u'count': 1, u'probability': 0.08...",0.002271,0.000751,0.001401,...,0.003621,0.002398,0.000784,0.003825,0.007944,0.001174,0.004918,0.001978,0.000666,0.001691
3,https://www.federalregister.gov/articles/text/...,Additional Designations of Individuals Pursuan...,Blocking or Unblocking of Persons and Property:,[],\nACTION: \nNotice. \nSUMMARY: \nThe U.S. Depa...,"[(U.S. Department of the Treasury, ORGANIZATIO...","{u'United Kingdom': {u'count': 1, u'probabilit...",0.008639,0.000797,0.001883,...,0.003442,0.005527,0.000299,0.008596,0.008004,0.003873,0.004249,0.002994,0.000666,0.0076
4,https://www.federalregister.gov/articles/text/...,Fisheries of the Northeastern United States; S...,Fisheries of the Northeastern United States:,[],\nACTION: \nTemporary rule; quota transfer. \n...,"[(NMFS, ORGANIZATION), (North Carolina, LOCATI...","{u'United States': {u'count': 7, u'probability...",0.001133,0.000193,0.001305,...,0.001923,0.001425,0.001711,0.002086,0.009294,0.00143,0.007493,0.002212,0.000666,0.001492


In [7]:
stopped = stopwords.words('english')

In [8]:
labels[['title', 'toc_subject']] = labels[['title', 'toc_subject']].fillna('')
documents = labels.title + ' ' + labels.toc_subject + ' ' + labels.topics.apply(lambda x: ' '.join(x)) + labels.raw_text

vec = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,3), stop_words=stopped)

# tfidf vectorizer automatically normalizes
# tfidf = vec.fit_transform(documents.tolist())

# joblib.dump(tfidf, 'tfidf.pkl')
tfidf = joblib.load('tfidf.pkl')

In [17]:
svd = TruncatedSVD(n_components=100, random_state=42)
# lsa = svd.fit_transform(tfidf)
# joblib.dump(lsa, 'lsa.pkl')
lsa = joblib.load('lsa.pkl')

In [18]:
def probability_model(country):
    # filling in the blanks with a predictive model
#     X = tfidf
    X = lsa # lose some accuracy to speed things up
    est = LinearSVC(penalty='l1', dual=False)
    est = CalibratedClassifierCV(est, cv=3) # increase cv for better accuracy
    y = labels[country].apply(lambda x: 1 if x>0.5 else 0)
    try:
        est.fit(X, y)
        prediction = est.predict(X)
        t = labels[[country]]
        t['labeled_class'] = y
        t['predicted_class'] = prediction
        t['probability'] = [i[1] for i in est.predict_proba(X)]
    #     temp = t[(t['predicted_class'] == 1) & (t['labeled_class']== 0)]
    #     print classification_report(y, prediction)
    #     print temp.shape
    #     print temp
        return t.probability
    except Exception as e:
        print(country, e)
        return y

In [19]:
for country in country_columns:
    t = probability_model(country)

    # keep the original probability where greater than the modeled probability
    labels[country] = labels[country].where(labels[country] > t, t)
    
    # returns errors where not enough pre-labeled data for that country

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(u'Andorra', ValueError('This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0',))
(u'Anguilla', ValueError('This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0',))
(u'Bhutan', ValueError('Requesting 3-fold cross-validation but provided less than 3 examples for at least one class.',))
(u'Bolivia, Plurinational State of', ValueError('This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0',))
(u'Bonaire, Sint Eustatius and Saba', ValueError('This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0',))
(u'Bosnia and Herzegovina', ValueError('Requesting 3-fold cross-validation but provided less than 3 examples for at least one class.',))
(u'Botswana', ValueError('Requesting 3-fold cross-validation but provided less than 3 examples for at least one class.',))
(u'Comoros', ValueError('Requesting 3-fold cross-

In [24]:
labels = labels.rename(columns={'countries':'pre-labeled countries'})

In [29]:
labels.head()

Unnamed: 0,raw_text_url,title,toc_subject,topics,raw_text,entities,pre-labeled countries,Afghanistan,Albania,Algeria,...,Uruguay,Uzbekistan,Vanuatu,"Venezuela, Bolivarian Republic of",Viet Nam,"Virgin Islands, British","Virgin Islands, U.S.",Yemen,Zambia,Zimbabwe
0,https://www.federalregister.gov/articles/text/...,Culturally Significant Objects Imported for Ex...,Culturally Significant Objects Imported for Ex...,[],\nSUMMARY: \nNotice is hereby given of the fol...,"[(Mauritshuis, ORGANIZATION), (United States, ...","{u'Canada': {u'count': 1, u'probability': 0.07...",0.002095,0.000932,0.001163,...,0.071429,0.000988,0.000697,0.071429,0.00719,0.000977,0.005605,0.001408,0.000666,0.001488
1,https://www.federalregister.gov/articles/text/...,Proposed Collection: Comment Request,,[],\nACTION: \nNotice and request for comments. \...,"[(Treasury, ORGANIZATION), (Treasury, ORGANIZA...","{u'United States': {u'count': 1, u'probability...",0.002207,0.001381,0.001117,...,7.9e-05,0.000997,0.000319,0.003269,0.009604,0.000657,0.004183,0.001346,0.000666,0.001565
2,https://www.federalregister.gov/articles/text/...,National Medal of Technology and Innovation Ca...,Calls for Nominations:,[],\nACTION: \nNotice and request for nominations...,"[(Department of Commerce, ORGANIZATION), (Unit...","{u'Canada': {u'count': 1, u'probability': 0.08...",0.002271,0.000751,0.001401,...,0.003621,0.002398,0.000784,0.003825,0.007944,0.001174,0.004918,0.001978,0.000666,0.001691
3,https://www.federalregister.gov/articles/text/...,Additional Designations of Individuals Pursuan...,Blocking or Unblocking of Persons and Property:,[],\nACTION: \nNotice. \nSUMMARY: \nThe U.S. Depa...,"[(U.S. Department of the Treasury, ORGANIZATIO...","{u'United Kingdom': {u'count': 1, u'probabilit...",0.008639,0.000797,0.001883,...,0.003442,0.005527,0.000299,0.008596,0.008004,0.003873,0.004249,0.002994,0.000666,0.0076
4,https://www.federalregister.gov/articles/text/...,Fisheries of the Northeastern United States; S...,Fisheries of the Northeastern United States:,[],\nACTION: \nTemporary rule; quota transfer. \n...,"[(NMFS, ORGANIZATION), (North Carolina, LOCATI...","{u'United States': {u'count': 7, u'probability...",0.001133,0.000193,0.001305,...,0.001923,0.001425,0.001711,0.002086,0.009294,0.00143,0.007493,0.002212,0.000666,0.001492


In [27]:
labels.to_pickle('predicted_countries_df')

In [9]:
labels = pd.read_pickle('predicted_countries_df')

In [25]:
row_list = []
for row in labels.iterrows():
    country_dict = {}
    for country in country_columns:
        probability = row[1][country]
        if probability > 0.5:
            country_dict[country] = probability
    row_list.append({'title': row[1].title, 'link': row[1].raw_text_url, 'probabilities': country_dict})

In [17]:
import json

In [26]:
with open('parsed.json', 'w') as f:
    json.dump(row_list, f, indent=4, sort_keys=True)