In [18]:
import pickle
import random
import time

import annoy
import nltk
import numpy as np
import pandas as pd
from annoy import AnnoyIndex
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
from transformers import pipeline

In [2]:
# Load the speech corpus from a pickled pandas object.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
speech = pd.read_pickle("Partisan-Responses/search_dataset_small.pkl")

In [3]:
# Number of rows in the loaded corpus.
len(speech)

931553

In [4]:
# Preview the first rows to see which columns are available.
speech.head()

Unnamed: 0,speech,party,stemmed_speech,year
0,"Mr. President, I wish to report on behalf of m...",R,"mr. presid , i wish to report on behalf of mys...",98
1,"Mr. President, as most of my colleagues are aw...",D,"mr. presid , as most of my colleagu are awar ,...",98
2,"Madam President, I send to the desk a resoluti...",R,"madam presid , i send to the desk a resolut on...",98
3,"Madam President, before I send a concurrent re...",R,"madam presid , befor i send a concurr resolut ...",98
4,"Madam President, I ask unanimous consent that ...",R,"madam presid , i ask unanim consent that for t...",98


In [None]:
# Fit a TF-IDF model on the `stemmed_speech` column of the full corpus and
# report how long the fit took (wall-clock seconds).
start = time.time()
vectorizer = TfidfVectorizer(
    stop_words='english',  # drop scikit-learn's built-in English stop words
    min_df=5,              # ignore terms found in fewer than 5 documents
    max_df=.5,             # ignore terms found in more than half the documents
    ngram_range=(1, 2),    # unigrams and bigrams
)
tfidf = vectorizer.fit_transform(speech['stemmed_speech'])
end = time.time()
print(end - start)

In [None]:
# Dimensions of the fitted TF-IDF matrix (documents x vocabulary terms).
tfidf.shape

In [5]:
def lemmatize(phrase):
    """
    Tokenize ``phrase``, lowercase every token and reduce it to its Porter stem.

    Despite the function name this performs stemming, not lemmatization.
    NOTE(review): presumably this mirrors the preprocessing that produced the
    ``stemmed_speech`` column -- confirm against the dataset-building code.

    :param phrase: raw text to normalise
    :return: the stemmed tokens joined by single spaces
    """
    stemmer = PorterStemmer()
    lowered_tokens = (token.lower() for token in word_tokenize(phrase))
    return " ".join(map(stemmer.stem, lowered_tokens))

In [6]:
def search(question, topk=5):
    """
    Retrieve the speeches from the full corpus that best match a question.

    The question is stemmed with ``lemmatize``, projected into the TF-IDF space
    of ``vectorizer`` and scored against every row of ``tfidf`` with a sparse
    matrix product.

    :param question: free-text question
    :param topk: number of rows to return
    :return: the ``topk`` highest-scoring rows of ``speech``, best match first
    """
    query_vec = vectorizer.transform([lemmatize(question)])
    # (n_documents, 1) dense column holding one score per speech.
    score_column = (tfidf * query_vec.T).toarray()
    # Ascending argsort, reversed so the best-scoring row comes first.
    ranking = np.argsort(score_column, axis=0)[::-1, 0]
    return speech.iloc[ranking[:topk]]

In [20]:
question = "What reforms were adopted by the 110th Congress?"

# Time only the retrieval step.
start = time.time()
results = search(question, topk=10)
end = time.time()
print(end - start)

# Pair the question with every retrieved speech and run the question-answering
# pipeline over each (question, context) record.
qa_pipeline = pipeline("question-answering")
question_df = pd.DataFrame.from_records(
    [{'question': question, 'context': context} for context in results["speech"]]
)
preds = qa_pipeline(question_df.to_dict('records'))

# Highest-scoring answers first.
answer_df = (
    pd.DataFrame.from_records(preds)
    .sort_values(by="score", ascending=False)
)

2.9871609210968018


In [21]:
# Show the predicted answers, sorted by descending score.
answer_df

Unnamed: 0,score,start,end,answer
9,0.8759375,160,176,floor privileges
1,0.4887475,94,110,The AMERICA Act.
2,0.4428043,50,108,Republican Members of the Senate who will not ...
4,0.3855834,914,935,welfaretowork reform.
8,0.2647674,764,793,"banning gifts from lobbyists,"
3,0.2288905,184,189,rules
7,0.1644282,152,173,emergency designation
5,0.05213201,35,40,rules
6,2.50788e-07,1873,1953,stepping forward to defend freedom against the...
0,1.789373e-07,1102,1140,"charting a new course for our country,"


In [22]:
# Dump the full text of each retrieved speech, separated by a blank line.
for speech_text in results["speech"]:
    print(speech_text, end="\n\n")

Mr. Speaker, tonight I would like to welcome you. Mr. Speaker, and the American people to the Accountability Congress. Over the next 1 hour, my freshman colleagues and I will be claiming this hour to talk about the accomplishments of this 110th Congress. We have seen not only an auspicious and bold, brave, new agenda for the first 100 hours, but also the first 100 days. And we are not just going to talk about and celebrate the accomplishments of the last 100 days. We are going to talk about a vision for our country and talk about what will happen in the days to come. It is important. Mr. Speaker, that the American people know that by getting a new majority in the Congress that they have signed up to get a vision that is inclusive, that brings Americans all together, that makes for a safer America, a fairer economy, that makes for an economy where working people, middleclass people can strive and do well in our society. And joining me tonight with the members of the freshman class are a

In [7]:
# Keep only the rows whose `year` equals the string '111', then report how
# many rows remain.
speech_111 = speech.loc[speech['year'] == '111']
print(speech_111.shape[0])

55439


In [21]:
# Fit a separate TF-IDF model on the `speech_111` subset only, this time with
# n-grams up to length three, and report the wall-clock fit time in seconds.
start = time.time()
vectorizer_111 = TfidfVectorizer(
    stop_words='english',  # drop scikit-learn's built-in English stop words
    min_df=5,              # ignore terms found in fewer than 5 documents
    max_df=.5,             # ignore terms found in more than half the documents
    ngram_range=(1, 3),    # unigrams, bigrams and trigrams
)
tfidf_111 = vectorizer_111.fit_transform(speech_111['stemmed_speech'])
end = time.time()
print(end - start)

90.48930287361145


In [None]:
# Persist the fitted vectorizer together with its document-term matrix so the
# fit does not have to be repeated in a new session.
# Requires `import pickle` in the notebook's import cell; without it this cell
# fails with a NameError on a fresh kernel.
data = {
    'vec': vectorizer_111,
    'tfidf': tfidf_111
}
with open("tfidf_data_111.pkl", 'wb') as out_file:
    pickle.dump(data, out_file)

In [22]:
# Dimensions of the subset TF-IDF matrix (documents x vocabulary terms).
tfidf_111.shape

(55439, 374167)

In [23]:
def search(question, topk=5):
    """
    Retrieve the speeches from ``speech_111`` that best match a question.

    Same procedure as the earlier full-corpus ``search`` (which this
    definition replaces once the cell runs), but using ``vectorizer_111`` and
    ``tfidf_111``.

    :param question: free-text question
    :param topk: number of rows to return
    :return: the ``topk`` highest-scoring rows of ``speech_111``, best match first
    """
    query_vec = vectorizer_111.transform([lemmatize(question)])
    # (n_documents, 1) dense column holding one score per speech.
    score_column = (tfidf_111 * query_vec.T).toarray()
    # Ascending argsort, reversed so the best-scoring row comes first.
    ranking = np.argsort(score_column, axis=0)[::-1, 0]
    return speech_111.iloc[ranking[:topk]]

In [24]:
question = "What reforms were adopted by the 110th Congress?"

# Time only the retrieval step.
start = time.time()
results = search(question, topk=10)
end = time.time()
print(end - start)

# Pair the question with every retrieved speech and run the question-answering
# pipeline over each (question, context) record.
qa_pipeline = pipeline("question-answering")
question_df = pd.DataFrame.from_records(
    [{'question': question, 'context': context} for context in results["speech"]]
)
preds = qa_pipeline(question_df.to_dict('records'))

# Highest-scoring answers first; the bare last expression displays the frame.
answer_df = (
    pd.DataFrame.from_records(preds)
    .sort_values(by="score", ascending=False)
)
answer_df

0.15324115753173828


Unnamed: 0,score,start,end,answer
5,0.592445,395,411,procedural rules
3,0.532642,2744,2793,National Adoption Month and National Adoption Day
0,0.264767,764,793,"banning gifts from lobbyists,"
2,0.254573,504,586,Congress made significant advances in providin...
6,0.2011,113,127,"rules package,"
4,0.192795,502,588,Congress has made significant advances in prov...
1,0.164428,152,173,emergency designation
8,0.001296,854,922,staff members of congressional liaison offices...
9,0.000596,620,689,a family from their district that exemplifies ...
7,0.000212,334,426,strengthen the adoption tax credit and make we...


### Annoy

In [19]:
# Annoy needs the vector dimensionality up front: one dimension per TF-IDF
# vocabulary term (number of columns of `tfidf_111`).
VECTOR_LENGTH = tfidf_111.shape[1]
# Distance metric name passed to AnnoyIndex.
METRIC = 'angular'
print(VECTOR_LENGTH)

275835


In [None]:
# Create an Annoy index and add one item per speech, using the speech's TF-IDF
# row as its vector. Item ids are the row positions 0..len(speech_111)-1.
#
# NOTE(review): every item is stored as a dense VECTOR_LENGTH-dimensional
# vector, which is very large for a TF-IDF vocabulary; consider reducing the
# dimensionality first if memory becomes a problem.
annoy_index = AnnoyIndex(VECTOR_LENGTH, metric=METRIC)

item_counter = 0
for i in tqdm(range(len(speech_111))):
    # Densify ONE sparse row per iteration. Calling `tfidf_111.toarray()` here
    # would materialise the entire documents x vocabulary matrix on every
    # pass through the loop.
    row_vector = tfidf_111[i].toarray().ravel()
    annoy_index.add_item(item_counter, row_vector)
    item_counter += 1

# The next free item id is reserved for the question vector.
question_index = item_counter

In [None]:
# Embed the question in the same TF-IDF space as the indexed speeches and add
# it to the index under the reserved id `question_index`.
question = "What reforms were adopted by the 110th Congress?"
query = vectorizer_111.transform([lemmatize(question)])

# `transform` returns a sparse matrix with a single row, whereas `add_item`
# needs a flat sequence of floats, so densify and flatten the row first.
annoy_index.add_item(question_index, query.toarray().ravel())

In [None]:
# Build the index with 100 trees, then print how many items it holds.
annoy_index.build(n_trees=100)
nr_items = annoy_index.get_n_items()
print(nr_items)

In [None]:
# The question itself is an item of the index, so it is normally returned as
# its own nearest neighbour, and its id is not a valid row of `speech_111`.
# Ask for one extra neighbour and drop the question so that 10 speech ids
# remain.
neighbour_ids = annoy_index.get_nns_by_item(question_index, 10 + 1)
print([idx for idx in neighbour_ids if idx != question_index][:10])