In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

In [3]:
X = pd.read_csv("data/theonion_processed.csv")
X.head()

Unnamed: 0.1,Unnamed: 0,heads,descs
0,0,inclement weather prevents liar from getting t...,PROVIDENCE RI—In spite of his best efforts to ...
1,1,mother comes pretty close to using word stream...,PATERSON NJ—Family sources told reporters Tues...
2,2,richard bransons globalwarming donation nearly...,LONDON—Analysts are predicting that the 3 bill...
3,3,shadow government getting too large to meet in...,COLUMBUS OH—With its membership swelling in re...
4,4,ford develops new suv that runs purely on gaso...,DEARBORN MI—The Ford Motor Company announced W...


In [17]:
tokens = X['descs']

In [18]:
cv = CountVectorizer(max_df=0.85)
X_cv = cv.fit_transform(tokens)


In [19]:
print(list(cv.vocabulary_.keys())[:10])

['providence', 'ri', 'spite', 'his', 'best', 'efforts', 'brave', 'ongoing', 'winter', 'storm']


In [20]:
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(X_cv)

TfidfTransformer()

In [21]:
cv.get_feature_names()

['00',
 '00000000001',
 '0000000001ounce',
 '0000001',
 '0000004',
 '000001',
 '000002',
 '00003',
 '00005',
 '000082567kln00067x',
 '001',
 '00100000',
 '00100111',
 '00101100',
 '00101101',
 '00101110',
 '002',
 '003',
 '005',
 '007',
 '01',
 '0100',
 '01000001',
 '01000100',
 '01001001',
 '01100001',
 '01100011',
 '01100100',
 '01100101',
 '01100110',
 '01100111',
 '01101000',
 '01101001',
 '01101011',
 '01101100',
 '01101101',
 '01101110',
 '01101111',
 '01110000',
 '01110010',
 '01110011',
 '01110100',
 '01110101',
 '01110110',
 '01110111',
 '01111001',
 '0115',
 '014',
 '02',
 '020',
 '025',
 '02milligram',
 '03',
 '03130',
 '04',
 '05',
 '06',
 '0600',
 '07',
 '0700',
 '072',
 '074',
 '075off',
 '07magnitude',
 '08',
 '08ounce',
 '08second',
 '09',
 '095400',
 '10',
 '100',
 '1000',
 '10000',
 '100000',
 '1000000',
 '100000aday',
 '10000degree',
 '10000degreefahrenheit',
 '10000foot',
 '10000milelong',
 '1000foottall',
 '1000gallon',
 '1000pound',
 '1000pump',
 '1000th',
 '1000w

In [22]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

def extract_top_n_from_vector(feature_names, sorted_items, top_n=10):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    if top_n is not None:
        sorted_items = sorted_items[:top_n]

    score_vals = []
    feature_vals = []

    for idx, score in sorted_items:
        fname = feature_names[idx]
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

In [23]:
feature_names = cv.get_feature_names()
keywords = []
for single_tokens in tokens:
    tfidf_vector = tfidf_transformer.transform(cv.transform([single_tokens]))
    
    sorted_items = sort_coo(tfidf_vector.tocoo())

    keywords.append(extract_top_n_from_vector(feature_names,sorted_items,10))


In [24]:
X['keywords']=keywords
print(X)

       Unnamed: 0                                              heads  \
0               0  inclement weather prevents liar from getting t...   
1               1  mother comes pretty close to using word stream...   
2               2  richard bransons globalwarming donation nearly...   
3               3  shadow government getting too large to meet in...   
4               4  ford develops new suv that runs purely on gaso...   
...           ...                                                ...   
10575       10575  polish rapper under fire for use of the word p...   
10576       10576       jews to celebrate rosh hashasha or something   
10577       10577  internal affairs investigator disappointed con...   
10578       10578  mars probe destroyed by orbiting spielberggate...   
10579       10579                 dad clarifies this not a food stop   

                                                   descs  \
0      PROVIDENCE RI—In spite of his best efforts to ...   
1      PATERSON

In [25]:

print("\n=====Words=====")
print(tokens[3])
print("\n===Keywords===")
for k in keywords[3]:
    print(k,keywords[3][k])


=====Words=====
COLUMBUS OH—With its membership swelling in recent months the mysterious organization that secretly pulls the levers of American power was forced to suspend its weekly meeting Monday having grown too big to fit inside Marriott Conference Room B To successfully carry out our clandestine operations and continue maintaining the ignorance of the masses we will now require the full amenities of Conference Room A said an unidentified man who is believed to have covertly orchestrated the economic collapse of Iceland last year We must postpone the Cataclysmic Event until such time as a more comfortable meeting space is available According to confidential records the hidden regimes enrollment has more than doubled since it gained free access to the Marriotts swimming pool and gym facilities

===Keywords===
marriotts 0.177
meeting 0.176
conference 0.166
levers 0.16
enrollment 0.16
covertly 0.16
marriott 0.157
regimes 0.151
postpone 0.151
orchestrated 0.151


In [26]:
X.to_csv("data/keyworded.csv")