In [1]:
import re
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
def preprocess(text):
	lemmatizer = WordNetLemmatizer()
	text = text.lower()
	text = re.sub(r'\s+', ' ', text).strip()
	text = re.sub(r'\b(?!1[0-9]{3}\b)(?!20[0-9]{2}\b)\d+\b', '', text)
	text = re.sub(r'\s+', ' ', text).strip() 
	tokens = [lemmatizer.lemmatize(t) for t in  word_tokenize(text)]
	return ' '.join(tokens)


data = pd.read_csv('results/cleaned_topics.csv', encoding='utf-8')
dict = {number: texte for number, texte in zip(data['Number'], data['texte'])}
vectorizer = TfidfVectorizer(stop_words="english", lowercase=True,norm='l1', smooth_idf= True, sublinear_tf=True , preprocessor=preprocess)
X = vectorizer.fit_transform(data['texte'])
feature_names = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(X.toarray(), columns=feature_names, index=data['Number'])
tfidf_df.to_csv('results/proba(terme sachant topic).csv', index=True, encoding='utf-8')



In [3]:
def generate_QS1_3(topic_number, proba_terme_sachant_topic):
	serie = proba_terme_sachant_topic.loc[topic_number]
	serie = serie[serie > 0].sort_values(ascending=False)
	terms = serie.index.tolist()

	#  pivot d'abord
	matrix = np.outer(serie, serie)
	triu_indices = np.triu_indices(len(serie), k=1)
	words = serie.index.to_numpy()
	word_pairs = list(zip(words[triu_indices[0]], words[triu_indices[1]]))
	joint_probs = matrix[triu_indices]
	joint_probs += np.random.normal(0, 0.001, joint_probs.shape)
	sorted_pairs = [pair for _, pair in sorted(zip(joint_probs, word_pairs), reverse=True)]
	pivot_t1, pivot_t2 = sorted_pairs[0]

	queries = []
	used_terms = set([pivot_t1, pivot_t2]) 
	for term1 in terms:
		if term1 in used_terms:
			continue
		used_terms.add(term1)
		t3_candidates = [t for t in terms if t not in used_terms]
		if not t3_candidates:
			continue
		t3 = t3_candidates[0]
		queries.append((term1,pivot_t1, pivot_t2, t3))
		used_terms.add(t3)

	return pd.DataFrame(queries, columns=['single term', 'pivot t1', 'pivot t2', 't3'])

topic_numbers = data['Number'].unique()
for topic_number in topic_numbers:
	queries = generate_QS1_3(topic_number, tfidf_df)
	topic =  data[data['Number'] == topic_number].iloc[0]
	display(str(topic['Number']) + " : " + str(topic["texte"]))
	display(queries)


'303 : Hubble Telescope Achievements Identify positive accomplishments of the Hubble telescope since it was launched in 1991'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,1991,hubble,telescope,accomplishment
1,achievement,hubble,telescope,launched
2,positive,hubble,telescope,wa


'307 : New Hydroelectric Projects Identify hydroelectric projects proposed or under construction by country and location Detailed description of nature extent purpose problems and consequences is desirable'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,consequence,hydroelectric,project,construction
1,description,hydroelectric,project,desirable
2,detailed,hydroelectric,project,location
3,nature,hydroelectric,project,problem
4,proposed,hydroelectric,project,extent
5,new,hydroelectric,project,country
6,purpose,hydroelectric,project,identify


'310 : Radio Waves and Brain Cancer Evidence that radio waves from radio towers or car phones affect brain cancer occurrence'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,brain,radio,wave,cancer
1,affect,radio,wave,car
2,occurrence,radio,wave,phone
3,tower,radio,wave,evidence


'314 : Marine Vegetation Commercial harvesting of marine vegetation such as algae seaweed and kelp for food and drug purposes'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,algae,marine,vegetation,commercial
1,food,marine,vegetation,harvesting
2,kelp,marine,vegetation,seaweed
3,drug,marine,vegetation,purpose


'322 : International Art Crime Isolate instances of fraud or embezzlement in the international art trade'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,crime,art,international,embezzlement
1,fraud,art,international,isolate
2,trade,art,international,instance


'325 : Cult Lifestyles Describe a cult by name and identify the cult members activities in their everyday life'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,everyday,cult,lifestyle,member
1,activity,cult,lifestyle,life


'330 : Iran Iraq Cooperation This query is looking for examples of cooperation or friendly ties between Iran and Iraq or ways in which the two countries could be considered allies'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,iraq,cooperation,iran,ally
1,considered,cooperation,iran,example
2,friendly,cooperation,iran,looking
3,query,cooperation,iran,tie
4,way,cooperation,iran,country


'336 : Black Bear Attacks A relevant document would discuss the frequency of vicious black bear attacks worldwide and the possible causes for this savage behavior'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,attack,bear,black,behavior
1,frequency,bear,black,possible
2,savage,bear,black,vicious
3,cause,bear,black,relevant
4,worldwide,bear,black,discus


'341 : Airport Security A relevant document would discuss how effective government orders to better scrutinize passengers and luggage on international flights and to step up screening of all carryon baggage has been'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,airport,carryon,order,luggage
1,better,carryon,order,security
2,scrutinize,carryon,order,screening
3,passenger,carryon,order,baggage
4,relevant,carryon,order,step
5,international,carryon,order,government
6,flight,carryon,order,effective
7,ha,carryon,order,discus


'344 : Abuses of EMail The availability of Email to many people through their job or school affiliation has allowed for many efficiencies in communications but also has provided the opportunity for abuses What steps have been taken worldwide by those bearing the cost of Email to prevent excesses'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,ha,email,abuse,availability
1,affiliation,email,abuse,school
2,provided,email,abuse,people
3,opportunity,email,abuse,bearing
4,job,email,abuse,excess
5,efficiency,email,abuse,cost
6,communication,email,abuse,allowed
7,prevent,email,abuse,step
8,taken,email,abuse,worldwide


'345 : Overseas Tobacco Sales Health studies primarily in the US have caused reductions in tobacco sales here but the economic impact has caused US tobacco companies to look overseas for customers What impact have the health and economic factors had overseas'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,tobacco,overseas,health,economic
1,impact,overseas,health,sale
2,caused,overseas,health,customer
3,look,overseas,health,primarily
4,reduction,overseas,health,study
5,company,overseas,health,factor


'347 : Wildlife Extinction The spotted owl episode in America highlighted US efforts to prevent the extinction of wildlife species What is not well known is the effort of other countries to prevent the demise of species native to their countries What other countries have begun efforts to prevent such declines'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,prevent,effort,wildlife,extinction
1,specie,effort,wildlife,country
2,america,effort,wildlife,begun
3,decline,effort,wildlife,demise
4,episode,effort,wildlife,highlighted
5,known,effort,wildlife,owl
6,spotted,effort,wildlife,native


'353 : Antarctica exploration Identify systematic explorations and scientific investigations of Antarctica current or planned'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,current,antarctica,exploration,investigation
1,planned,antarctica,exploration,systematic
2,scientific,antarctica,exploration,identify


'354 : journalist risks Identify instances where a journalist has been put at risk eg killed arrested or taken hostage in the performance of his work'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,arrested,journalist,risk,hostage
1,killed,journalist,risk,performance
2,work,journalist,risk,taken
3,ha,journalist,risk,instance


'362 : human smuggling Identify incidents of human smuggling'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,incident,human,smuggling,identify


'363 : transportation tunnel disasters What disasters have occurred in tunnels used for transportation'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,tunnel,disaster,transportation,occurred


'367 : piracy What modern instances have there been of old fashioned piracy the boarding or taking control of boats'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,boarding,piracy,modern,boat
1,control,piracy,modern,fashioned
2,taking,piracy,modern,old


'372 : Native American casino Identify documents that discuss the growth of Native American casino gambling'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,native,american,casino,gambling
1,growth,american,casino,discus
2,document,american,casino,identify


'374 : Nobel prize winners Identify and provide background information on Nobel prize winners'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,prize,nobel,winner,background
1,information,nobel,winner,provide


'375 : hydrogen energy What is the status of research on hydrogen as a feasible energy source'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,feasible,energy,hydrogen,research
1,source,energy,hydrogen,status


'378 : euro opposition Identify documents that discuss opposition to the introduction of the euro the European currency'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,currency,euro,opposition,european
1,introduction,euro,opposition,discus
2,document,euro,opposition,identify


'383 : mental illness drugs Identify drugs used in the treatment of mental illness'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,drug,illness,mental,treatment
1,used,illness,mental,identify


'389 : illegal technology transfer What specific entities have been accused of illegal technology transfer such as selling their products formulas etc directly or indirectly to foreign entities for other than peaceful purposes'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,entity,illegal,technology,transfer
1,directly,illegal,technology,formula
2,indirectly,illegal,technology,peaceful
3,selling,illegal,technology,specific
4,accused,illegal,technology,foreign
5,product,illegal,technology,purpose


'393 : mercy killing Identify documents that discuss mercy killings'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,discus,killing,mercy,document


'394 : home schooling Identify documents that discuss the education of children at home home schooling'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,child,home,schooling,education
1,discus,home,schooling,document


'397 : automobile recalls Identify documents that discuss the reasons for automobile recalls'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,reason,recall,automobile,discus
1,document,recall,automobile,identify


'399 : oceanographic vessels Identify documents that discuss the activities or equipment of oceanographic vessels'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,equipment,oceanographic,vessel,activity
1,discus,oceanographic,vessel,document


'401 : foreign minorities Germany What language and cultural differences impede the integration of foreign minorities in Germany'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,foreign,germany,minority,cultural
1,difference,germany,minority,impede
2,integration,germany,minority,language


'404 : Ireland peace talks How often were the peace talks in Ireland delayed or disrupted as a result of acts of violence'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,ireland,peace,talk,act
1,delayed,peace,talk,disrupted
2,result,peace,talk,violence


'408 : tropical storms What tropical storms hurricanes and typhoons have caused significant property damage and loss of life'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,hurricane,storm,tropical,property
1,significant,storm,tropical,typhoon
2,caused,storm,tropical,damage
3,life,storm,tropical,loss


'409 : legal Pan Am 103 What legal actions have resulted from the destruction of Pan Am Flight 103 over Lockerbie Scotland on December 21 1988'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,1988,legal,pan,action
1,december,legal,pan,destruction
2,lockerbie,legal,pan,resulted
3,scotland,legal,pan,flight


'416 : Three Gorges Project What is the status of The Three Gorges Project'

Unnamed: 0,single term,pivot t1,pivot t2,t3


'419 : recycle automobile tires What new uses have been developed for old automobile tires as a means of tire recycling'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,developed,tire,automobile,mean
1,recycle,tire,automobile,recycling
2,old,tire,automobile,new


'426 : law enforcement dogs Provide information on the use of dogs worldwide for law enforcement purposes'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,law,dog,enforcement,use
1,information,dog,enforcement,provide
2,purpose,dog,enforcement,worldwide


'427 : UV damage eyes Find documents that discuss the damage ultraviolet UV light from the sun can do to eyes'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,damage,eye,uv,light
1,sun,eye,uv,ultraviolet
2,discus,eye,uv,document


'433 : Greek philosophy stoicism Is there contemporary interest in the Greek philosophy of stoicism'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,greek,philosophy,stoicism,contemporary


'435 : curbing population growth What measures have been taken worldwide and what countries have been effective in curbing population growth'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,growth,curbing,population,measure
1,effective,curbing,population,taken
2,country,curbing,population,worldwide


'436 : railway accidents What are the causes of railway accidents throughout the world'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,cause,accident,railway,world


'439 : inventions scientific discoveries What new inventions or scientific discoveries have been made'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,scientific,discovery,invention,new


'443 : US investment Africa What is the extent of US government and private investment in subSaharan Africa'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,private,africa,investment,subsaharan
1,extent,africa,investment,government


'448 : ship losses Identify instances in which weather was a main or contributing factor in the loss of a ship at sea'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,contributing,ship,loss,main
1,sea,ship,loss,weather
2,wa,ship,loss,factor
3,instance,ship,loss,identify


'622 : price fixing Identify companies or corporations that have been accused or indicted of price fixing including the product or type of product involved'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,product,fixing,price,including
1,involved,fixing,price,type
2,accused,fixing,price,company
3,corporation,fixing,price,indicted


'625 : arrests bombing WTC Identify documents that provide information on the arrest andor conviction of the bombers of the World Trade Center WTC in February 1993'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,1993,arrest,wtc,andor
1,bomber,arrest,wtc,bombing
2,center,arrest,wtc,february
3,conviction,arrest,wtc,trade
4,world,arrest,wtc,information
5,provide,arrest,wtc,document


'638 : wrongful convictions Find documents that discuss freed prisoners who have been wrongfully convicted based on faulty forensic evidence poor police work or false testimony'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,based,wrongful,prisoner,convicted
1,testimony,wrongful,prisoner,poor
2,police,wrongful,prisoner,freed
3,forensic,wrongful,prisoner,faulty
4,false,wrongful,prisoner,wrongfully
5,evidence,wrongful,prisoner,work
6,conviction,wrongful,prisoner,discus


'639 : consumer online shopping What factors contributed to the growth of consumer online shopping'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,consumer,online,shopping,contributed
1,factor,online,shopping,growth


'648 : family leave law Identify documents that discuss details of a family leave law such as how long compensation if any for what reason allowed etc'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,law,family,leave,compensation
1,long,family,leave,allowed
2,reason,family,leave,discus
3,document,family,leave,identify


'650 : tax evasion indicted Identify individuals or corporations that have been indicted on charges of tax evasion of more than two million dollars in the US or UK'

Unnamed: 0,single term,pivot t1,pivot t2,t3
0,indicted,evasion,tax,charge
1,dollar,evasion,tax,individual
2,million,evasion,tax,uk
3,corporation,evasion,tax,identify
