In [39]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.externals import joblib
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import TruncatedSVD

In [52]:
labels = pd.read_pickle('labels_df')

In [54]:
labels.ix[14].countries

{u'China': {'count': 2, 'probability': 1.0},
 u'Colombia': {'count': 1, 'probability': 0.25},
 u'Russian Federation': {'count': 1, 'probability': 0.16666666666666666},
 u'United Kingdom': {'count': 1, 'probability': 0.25},
 u'United States': {'count': 9, 'probability': 1.0}}

In [55]:
sample = labels.ix[0:10]

In [56]:
explode_probabilities = lambda x: pd.Series({j: k['probability'] for j, k in x.items()})

labels = pd.concat([labels, labels.countries.apply(explode_probabilities)], axis=1)

In [57]:
non_country_columns = ['raw_text_url', 'title', 'toc_subject', 'topics', 'raw_text', 'entities', 'countries']
country_columns = sorted(list(set(labels.columns.tolist()) - set(non_country_columns)))

In [58]:
labels.head()

Unnamed: 0,raw_text_url,title,toc_subject,topics,raw_text,entities,countries,Afghanistan,Albania,Algeria,...,Uruguay,Uzbekistan,Vanuatu,"Venezuela, Bolivarian Republic of",Viet Nam,"Virgin Islands, British","Virgin Islands, U.S.",Yemen,Zambia,Zimbabwe
0,https://www.federalregister.gov/articles/text/...,Culturally Significant Objects Imported for Ex...,Culturally Significant Objects Imported for Ex...,[],\nSUMMARY: \nNotice is hereby given of the fol...,"[(Mauritshuis, ORGANIZATION), (United States, ...","{u'Canada': {u'count': 1, u'probability': 0.07...",,,,...,0.071429,,,0.071429,,,,,,
1,https://www.federalregister.gov/articles/text/...,Proposed Collection: Comment Request,,[],\nACTION: \nNotice and request for comments. \...,"[(Treasury, ORGANIZATION), (Treasury, ORGANIZA...","{u'United States': {u'count': 2, u'probability...",,,,...,,,,,,,,,,
2,https://www.federalregister.gov/articles/text/...,National Medal of Technology and Innovation Ca...,Calls for Nominations:,[],\nACTION: \nNotice and request for nominations...,"[(Department of Commerce, ORGANIZATION), (Unit...","{u'Canada': {u'count': 1, u'probability': 0.08...",,,,...,,,,,,,,,,
3,https://www.federalregister.gov/articles/text/...,Additional Designations of Individuals Pursuan...,Blocking or Unblocking of Persons and Property:,[],\nACTION: \nNotice. \nSUMMARY: \nThe U.S. Depa...,"[(U.S. Department of the Treasury, ORGANIZATIO...","{u'United Kingdom': {u'count': 1, u'probabilit...",,,,...,,,,,,,,,,
4,https://www.federalregister.gov/articles/text/...,Fisheries of the Northeastern United States; S...,Fisheries of the Northeastern United States:,[],\nACTION: \nTemporary rule; quota transfer. \n...,"[(NMFS, ORGANIZATION), (North Carolina, LOCATI...","{u'United States': {u'count': 7, u'probability...",,,,...,,,,,,,,,,


In [59]:
stopped = stopwords.words('english')

In [60]:
labels[['title', 'toc_subject']] = labels[['title', 'toc_subject']].fillna('')
documents = labels.title + ' ' + labels.toc_subject + ' ' + labels.topics.apply(lambda x: ' '.join(x)) + labels.raw_text

vec = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,3), stop_words=stopped)

# tfidf vectorizer automatically normalizes
# tfidf = vec.fit_transform(documents.tolist())

# joblib.dump(tfidf, 'tfidf.pkl')
tfidf = joblib.load('tfidf.pkl')

In [61]:
svd = TruncatedSVD(n_components=100, random_state=42)
# lsa = svd.fit_transform(tfidf)
# joblib.dump(lsa, 'lsa.pkl')
lsa = joblib.load('lsa.pkl')

In [62]:
def probability_model(country):
    # filling in the blanks with a predictive model
#     X = tfidf
    X = lsa # lose some accuracy to speed things up
    est = LinearSVC(penalty='l1', dual=False)
    est = CalibratedClassifierCV(est, cv=3) # increase cv for better accuracy
    y = labels[country].apply(lambda x: 1 if x>0.5 else 0)
    try:
        est.fit(X, y)
        prediction = est.predict(X)
        t = labels[[country]]
        t['labeled_class'] = y
        t['predicted_class'] = prediction
        t['probability'] = [i[1] for i in est.predict_proba(X)]
    #     temp = t[(t['predicted_class'] == 1) & (t['labeled_class']== 0)]
    #     print classification_report(y, prediction)
    #     print temp.shape
    #     print temp
        return t.probability
    except Exception as e:
        print(country, e)
        return y

In [63]:
for country in country_columns:
    t = probability_model(country)

    # keep the original probability where greater than the modeled probability
    labels[country] = labels[country].where(labels[country] > t, t)
    
    # returns errors where not enough pre-labeled data for that country

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(u'Andorra', ValueError('Requesting 3-fold cross-validation but provided less than 3 examples for at least one class.',))
(u'Bhutan', ValueError('Requesting 3-fold cross-validation but provided less than 3 examples for at least one class.',))
(u'Bosnia and Herzegovina', ValueError('Requesting 3-fold cross-validation but provided less than 3 examples for at least one class.',))
(u'Botswana', ValueError('Requesting 3-fold cross-validation but provided less than 3 examples for at least one class.',))
(u'Faroe Islands', ValueError('Requesting 3-fold cross-validation but provided less than 3 examples for at least one class.',))
(u'Holy See (Vatican City State)', ValueError('This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0',))
(u'Jersey', ValueError('Requesting 3-fold cross-validation but provided less than 3 examples for at least one class.',))
(u'Lesotho', ValueError('Requesting 3-fold cross-validation but provided less than 3 examples fo

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [64]:
labels = labels.rename(columns={'countries':'pre-labeled countries'})

In [65]:
labels.head()

Unnamed: 0,raw_text_url,title,toc_subject,topics,raw_text,entities,pre-labeled countries,Afghanistan,Albania,Algeria,...,Uruguay,Uzbekistan,Vanuatu,"Venezuela, Bolivarian Republic of",Viet Nam,"Virgin Islands, British","Virgin Islands, U.S.",Yemen,Zambia,Zimbabwe
0,https://www.federalregister.gov/articles/text/...,Culturally Significant Objects Imported for Ex...,Culturally Significant Objects Imported for Ex...,[],\nSUMMARY: \nNotice is hereby given of the fol...,"[(Mauritshuis, ORGANIZATION), (United States, ...","{u'Canada': {u'count': 1, u'probability': 0.07...",0.002095,0.000932,0.001163,...,0.071429,0.000988,0.000697,0.071429,0.007562,0.000977,0.006194,0.001408,0.000666,0.001488
1,https://www.federalregister.gov/articles/text/...,Proposed Collection: Comment Request,,[],\nACTION: \nNotice and request for comments. \...,"[(Treasury, ORGANIZATION), (Treasury, ORGANIZA...","{u'United States': {u'count': 2, u'probability...",0.002207,0.001381,0.001117,...,7.9e-05,0.000997,0.000319,0.002957,0.010015,0.000657,0.005247,0.001346,0.000666,0.001565
2,https://www.federalregister.gov/articles/text/...,National Medal of Technology and Innovation Ca...,Calls for Nominations:,[],\nACTION: \nNotice and request for nominations...,"[(Department of Commerce, ORGANIZATION), (Unit...","{u'Canada': {u'count': 1, u'probability': 0.08...",0.002271,0.000751,0.001401,...,0.003621,0.002398,0.000784,0.003522,0.008085,0.001174,0.005724,0.001978,0.000666,0.001691
3,https://www.federalregister.gov/articles/text/...,Additional Designations of Individuals Pursuan...,Blocking or Unblocking of Persons and Property:,[],\nACTION: \nNotice. \nSUMMARY: \nThe U.S. Depa...,"[(U.S. Department of the Treasury, ORGANIZATIO...","{u'United Kingdom': {u'count': 1, u'probabilit...",0.008639,0.000797,0.001883,...,0.003442,0.005527,0.000299,0.008629,0.008179,0.003873,0.005274,0.002994,0.000666,0.0076
4,https://www.federalregister.gov/articles/text/...,Fisheries of the Northeastern United States; S...,Fisheries of the Northeastern United States:,[],\nACTION: \nTemporary rule; quota transfer. \n...,"[(NMFS, ORGANIZATION), (North Carolina, LOCATI...","{u'United States': {u'count': 7, u'probability...",0.001133,0.000193,0.001305,...,0.001923,0.001425,0.001711,0.00224,0.009529,0.00143,0.009279,0.002212,0.000666,0.001492


In [67]:
labels.to_pickle('predicted_countries_df')

In [68]:
labels = pd.read_pickle('predicted_countries_df')

In [78]:
row_list = []
for row in labels.iterrows():
    country_dict = {}
    for country in country_columns:
        probability = row[1][country]
        if probability > 0.3:
            country_dict[country] = probability
    row_list.append({'id': row[0], 'title': row[1].title, 'link': row[1].raw_text_url, 'probabilities': country_dict})

In [79]:
import json

In [80]:
with open('parsed.json', 'w') as f:
    json.dump(row_list, f, indent=4, sort_keys=True)