# Quizmaster

## Classification component

In [1]:
import re

import numpy as np
import pandas as pd
from keras import utils
from keras.layers import Dense, Dropout, Activation
from keras.models import Sequential
from keras.preprocessing import text
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter

def classify_questions():
  labels = sorted(
      ['science-technology', 'for-kids', 'video-games', 'sports', 'music'])

  questions = pd.read_csv("data/train_dataset.csv", header=None,
                          encoding="iso-8859-1", sep=";",names= ['id', 'question', 'answer', 'topic'])

  REPLACE_BY_SPACE = re.compile('[/(){}\[\]|@,;]')
  BAD_SYMBOLS = re.compile('[^0-9a-z #+_]')
  STOPWORDS = set(stopwords.words('english'))


  def clean_text(text):
    text = text.lower()
    text = REPLACE_BY_SPACE.sub(' ', text)
    text = BAD_SYMBOLS.sub(' ', text)
    text = re.sub(r"\'s", " ", text)
    text = ' '.join(word for word in text.split() if
                    word not in STOPWORDS)
    return text

  questions['question'] = questions['question'].apply(clean_text)
  X = questions.question
  y = questions.topic
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                      random_state=42)

  # tokenizer
  max_words = 2000
  tokenize = text.Tokenizer(num_words=max_words, char_level=False)
  tokenize.fit_on_texts(X_train)
  x_train = tokenize.texts_to_matrix(X_train)
  x_test = tokenize.texts_to_matrix(X_test)

  # Encoder
  encoder = LabelEncoder()
  encoder.fit(y_train)
  y_train = encoder.transform(y_train)
  y_test = encoder.transform(y_test)

  num_classes = np.max(y_train) + 1
  y_train = utils.to_categorical(y_train, num_classes)
  y_test = utils.to_categorical(y_test, num_classes)


  batch_size = 64
  epochs = 2

  # Build the model
  model = Sequential()
  model.add(Dense(512, input_shape=(max_words,)))
  model.add(Activation('relu'))
  model.add(Dropout(0.5))
  model.add(Dense(num_classes))
  model.add(Activation('softmax'))
  model.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])

  model.fit(x_train, y_train,
            batch_size=batch_size,
            epochs=epochs,
            verbose=1,
            validation_data=(x_test,y_test))

  val_score = model.evaluate(x_train, y_train,
                             batch_size=batch_size, verbose=1)
  score = model.evaluate(x_test, y_test,
                         batch_size=batch_size, verbose=1)
  print('Train acc:', val_score[1])
  print('Test accuracy:', score[1])

  # Classify topics
  generated_questions = pd.read_csv("data/crowdanswers.tsv",
                                    encoding="utf-8", delimiter="\t",
                                    na_filter=False)
  generated_questions.columns = ['id', 'question', 'answer', 'difficulty',
                                 'opinion', 'factuality']

  tokens = generated_questions['question'].apply(clean_text)

  x_predict = tokenize.texts_to_matrix(tokens)
  result = model.predict_classes(x_predict, batch_size=1)
  predicted_labels = [labels[i] for i in result]
  output = pd.DataFrame(data={"id": generated_questions["id"],
                              "question": generated_questions["question"],
                              "answer": generated_questions["answer"],
                              "difficulty": generated_questions["difficulty"],
                              "opinion": generated_questions["opinion"],
                              "factuality": generated_questions["factuality"],
                              "topic": predicted_labels})
  output.to_csv('data/classified.csv', encoding='utf-8', index=False)

Using TensorFlow backend.


In [2]:
classify_questions()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Train on 7130 samples, validate on 1783 samples
Epoch 1/2
Epoch 2/2
Train acc: 0.9611500700259108
Test accuracy: 0.9119461573915273


In [3]:
#get most common item, in case of tie the first
def majority(lst):
  data = Counter(lst)
  return max(lst, key=data.get)

def get_class_questions():
  questions = pd.read_csv("data/classified.csv",
                          encoding="utf-8", sep=",", error_bad_lines=False)
  questions.groupby('question').filter(
      lambda x: x['factuality'].sum() < 1)
  questions = questions.groupby(['question', 'topic'], as_index=False)[
    'difficulty'].agg(majority)
  return questions


def get_class_question(difficulty, questions, topic):
  return questions[
    (questions.topic == topic) & (questions.difficulty == difficulty)]

def classification(difficulty, topic):
  questions = get_class_questions()
  filtered_questions = get_class_question(difficulty, questions, topic)

  for index, row in filtered_questions.iterrows():
    print(row['question'])    

In [4]:
# Get Questions according to difficulty and topic
classification("Medium", "music")

7 rings' is a song by which American singer?
How many strings does a violin have?
In 1995, this company released its first console, which went on to dominate the industry.
In which European city was Swedish pop group ABBA formed in 1972?
This guy had a hit song called Papirsklip - "Bent Mejding"? "Kim Larsen"? "Johnny Madsen"? "Jarl Friis Mikkelsen"?'
What Michael Jackson song had the first Music video on MTV?
What city are the Beatles from?
What city hosted the Beatles as the resident band at the Kaiserkeller and Top Ten Club?
What instrument is primarily identified with rock and roll?
What is Freddie Mercury's (lead singer of Queen) nationality?
What is the Best-Selling Albums in History?
What is the band containing the famous robotic duo called?
What is the famous song called from the movie Titanic? 
What is the first name of Amadeus Mozart?
What is the largest music festival in Budapest, Hungary called?
What is the name of Dexter's annoying sister?
What is the name of Lady Gaga's

## Convergence / Simulator

In [5]:
from collections import Counter
from random import randint

import numpy as np
import pandas as pd

answers = {
  "for-kids": {"false": 0, "true": 0},
  "science-technology": {"false": 0, "true": 0},
  "video-games": {"false": 0, "true": 0},
  "music": {"false": 0, "true": 0},
  "sports": {"false": 0, "true": 0},
}

skipped = []

#get most common item, in case of tie the first
def majority(lst):
  data = Counter(lst)
  return max(lst, key=data.get)

def get_answers():
  return answers

def get_next_question(g):
  return g.sample(n=1, replace=True,
           random_state=randint(0, 3000))

def calc_threshold(threshold, n):
  return np.mean(
      [answers[n].get("false"), answers[n].get("true")]) < threshold

# use answers from file + answers from classified data,since a few questions missing
def give_answer(answer,generated_answer,user_answer,n):
  false_answer2 = generated_answer.strip().lower() != user_answer.strip().lower()
  false_answer = answer.strip().lower() != user_answer.strip().lower()
  if (false_answer & false_answer2):
    answers[n]["false"] = answers[n].get("false") - 1
  else:
    answers[n]["true"] = answers[n].get("true") + 1

def get_conv_questions():
  questions = pd.read_csv("data/classified.csv",
                          encoding="utf-8", sep=",")
  questions = questions.groupby('question').filter(
      lambda x: x['factuality'].sum() < 1)
  questions = questions.drop_duplicates(subset='question', keep="last")
  return questions.groupby(['topic'])


def get_answers_to_questions():
  answers_to_questions = pd.read_csv("data/question_answer.csv",
                                     encoding="utf-8", sep=";")
  answers_to_questions.columns = ['question', 'answer']
  return answers_to_questions

def get_question_answer(answers_to_questions, question):
  return answers_to_questions[
    answers_to_questions["question"] == question][
    "answer"].to_string(index=False)

### Simulated users

In [6]:
import random

def decision(probability=0.9):
  return random.random() < probability

def polymath_user(probability):
  skipped = []
  answers_to_questions = get_answers_to_questions()
  questions = get_conv_questions()
  not_finished = True
  amount_of_questions = 0

  while not_finished:
    for n,g in questions:
      if n not in skipped:
        if calc_threshold(0, n):
          skipped.append(n)
          continue
        question = get_next_question(g)
        answer_generated = question["answer"].to_string(index=False)
        question_string = question.to_string(index=False)

        answer = get_question_answer(answers_to_questions,question_string)
        # hope for the best that the answers are right or available
        if len(skipped) <= 4:
          amount_of_questions += 1
          if decision(probability):
            if not answer_generated:
              user_answer = answer
              give_answer(answer, answer_generated, user_answer, n)
            else:
              user_answer = answer_generated
              give_answer(answer, answer_generated, user_answer, n)
          else:
            give_answer(answer, answer_generated, "False Answer", n)
        if len(skipped) == 4:
          print("Amount of questions answered:", amount_of_questions)
          print("Finished topic was:", n)
          not_finished = False


def topic_expert_user(probability, topic):
  skipped = []
  answers_to_questions = get_answers_to_questions()
  questions = get_conv_questions()
  not_finished = True
  amount_of_questions = 0

  while not_finished:
    for n,g in questions:
      if n not in skipped:
        if calc_threshold(-1, n):
          skipped.append(n)
          continue
        question = get_next_question(g)
        answer_generated = question["answer"].to_string(index=False)
        question_string = question.to_string(index=False)

        answer = get_question_answer(answers_to_questions,question_string)
        # hope for the best that the answers are right or available
        if n == topic:
          if len(skipped) <= 4:
            if decision(probability):
              amount_of_questions +=1
              if not answer_generated:
                user_answer = answer
                give_answer(answer, answer_generated, user_answer, n)
              else:
                user_answer = answer_generated
                give_answer(answer, answer_generated, user_answer, n)
            else:
              give_answer(answer, answer_generated, "False Answer", n)
        else:
          give_answer(answer, answer_generated, "False Answer", n)
        if len(skipped) == 4:
          print("Amount of questions answered:", amount_of_questions)
          print("Finished Topic was:", n)
          not_finished = False

In [7]:
## Can end up running quite long according to probabilty of answering questions right
polymath_user(0.5)
topic_expert_user(0.8, "music")

Amount of questions answered: 19
Finished topic was: science-technology
Amount of questions answered: 0
Finished Topic was: science-technology


## Friend Recommender

In [8]:
import nltk
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from sklearn.feature_extraction.text import TfidfVectorizer

def takeCorrelation(elem):
  return elem[1]

STEMMER = nltk.stem.porter.PorterStemmer()

def get_friends_questions():
  return pd.read_csv("data/classified.csv",
                     encoding="utf-8", sep=",")

def stem_tokens(tokens, stemmer=STEMMER):
  return [stemmer.stem(item) for item in tokens]


def tokenizer(text):
  tokens = nltk.word_tokenize(text)
  return stem_tokens(tokens)

def euclidean_distance(x, y):
  return np.sqrt(np.sum((x - y) ** 2))


def get_recommendations_feature(cutoff_k, questions, user_id):
  print("Feature based recommendation")
  tfidf = TfidfVectorizer(tokenizer=tokenizer, stop_words='english')
  tfs = tfidf.fit_transform(questions['question'])
  # add column for vector
  questions['tfsvector'] = list(tfs.toarray())
  user_features = questions[questions['id'] == int(user_id)]['tfsvector'].mean()
  # the questions which a user likes
  questions = questions.groupby(['id'])
  friends_questions = []
  for n, g in questions:
    g = g[g['opinion'] > 1]['tfsvector'].mean()
    friends_questions.append((n, euclidean_distance(user_features, g)))
  friends_questions.sort(key=takeCorrelation, reverse=True)
  for friend in friends_questions[:int(cutoff_k)]:
    print(friend)


def get_recommendations_topic(cutoff_k, questions, user_id):
  friends = []
  print("Topic based recommendation:")
  user_features = questions[questions['id'] == int(user_id)].groupby(['topic'])[
    'opinion'].mean()
  questions = questions.groupby(['id'])
  for n, g in questions:
    if n == int(user_id):
      continue
    # when classification not works properly then some users have only 4 topics
    if len(g.groupby(['topic'])['opinion'].mean()) < 5:
      continue
    corr, p_value = pearsonr(user_features,
                             g.groupby(['topic'])['opinion'].mean())
    friends.append((n, corr))
  friends.sort(key=takeCorrelation, reverse=True)
  for friend in friends[:int(cutoff_k)]:
    print(friend)

In [9]:
# cutoff, questions, user_id
get_recommendations_feature(5,get_friends_questions(),3)

Feature based recommendation


  'stop_words.' % sorted(inconsistent))


(4, 0.36076689057979405)
(11, 0.3386440771631049)
(27, 0.3282917165082846)
(24, 0.30563175815672294)
(31, 0.297454486813683)


In [10]:
# cutoff, questions, user_id
get_recommendations_topic(5,get_friends_questions(),4)

Topic based recommendation:
(18, 0.9777715853271727)
(34, 0.9195235405587819)
(11, 0.9146789084322638)
(36, 0.913861921714508)
(12, 0.7790063508395929)


## User Preference

In [11]:
def get_users_questions():
  questions = pd.read_csv("data/classified.csv",
                          encoding="utf-8", sep=",")
  return questions

def get_preferance_id(questions, userid):
  user_questions = questions[questions['id'] == int(userid)]
  return user_questions


def get_prefereance_filtered_qustions(questions, user_preferance):
  questions = questions[questions['topic'] == user_preferance].head(10)
  return questions


def get_user_preferance(user_questions):
  user_questions = user_questions.groupby('topic')['opinion'].mean()
  user_preferance = user_questions.idxmax()
  return user_preferance

In [12]:
# Get favourite Topic according to user_id
user_questions = get_preferance_id(get_users_questions(),5)
user_preferance = get_user_preferance(user_questions)
get_prefereance_filtered_qustions(get_users_questions(),user_preferance)

Unnamed: 0,id,question,answer,difficulty,opinion,factuality,topic
9,0,For which song did Toy Story get nominated to ...,We belong together,Hard,3,0,music
10,0,"At what age did Jimmy Hendrix, Janis Joplin an...",27,Easy,3,0,music
12,0,What's the name of the most famous Italian plu...,Mario and Luigi,Easy,3,0,music
24,0,What was the Beatles biggest selling single?,She Loves you,Medium,3,0,music
25,0,What was George Michaels first solo hit called?,Careless Whisper,Medium,3,0,music
26,0,In which decade was Dolly Parton born?,1940's,Hard,3,0,music
37,0,Which famous Disney movie features a song call...,Lion King,Easy,3,0,music
38,0,What is the name of the toy cowboy in Toy Story?,Woody,Easy,3,0,music
39,0,Freddie Mercury was the lead vocalist of which...,Queen,Easy,3,0,music
40,0,7 rings' is a song by which American singer?,Ariana Grande,Easy,3,0,music


## Difficulty / Simulator

In [13]:
answered = {"Easy": False, "Medium": False, "Hard": False}

# get most common item, in case of tie the first
def majority(lst):
  data = Counter(lst)
  return max(lst, key=data.get)

def prepare_diff():
  questions = pd.read_csv("data/classified.csv",
                          encoding="utf-8", sep=",")
  answers_to_questions = pd.read_csv("data/question_answer.csv",
                                     encoding="utf-8", sep=";")
  answers_to_questions.columns = ['question', 'answer']
  questions = questions.groupby('question').filter(
      lambda x: x['factuality'].sum() < 1)
  grouped_questions = questions.groupby(['question', 'topic'], as_index=False)[
    'difficulty'].agg(majority)
  questions = questions.drop_duplicates(subset='question', keep="last")
  return answers_to_questions, grouped_questions, questions


def sample_question(questions_to_answer):
  return questions_to_answer.sample(n=1, replace=True,
                                    random_state=randint(0, 10000))


def check_answer(answer, answered, generated_answer, user_answer):
  true_answer2 = generated_answer.strip().lower() == user_answer.strip().lower()
  true_answer = answer.strip().lower() == user_answer.strip().lower()
  if (true_answer) or (true_answer2):
    if answered["Medium"] == True:
      answered["Hard"] = True
    if answered["Easy"] == True:
      answered["Medium"] = True
    if answered["Easy"] == False:
      answered["Easy"] = True


def get_answers_to_questions_diff(answers_to_questions, question):
  return answers_to_questions[
    answers_to_questions["question"] == question][
    "answer"].to_string(index=False)


def get_answered_diff():
  return answered


def get_questions_to_answer(answered, questions):
  questions_to_answer = questions[questions['difficulty'] == "Easy"]
  if answered["Easy"] == True:
    questions_to_answer = questions[questions['difficulty'] == "Medium"]
  if answered["Medium"] == True:
    questions_to_answer = questions[questions['difficulty'] == "Hard"]
  return questions_to_answer

### Simulator

In [14]:
def diffuculty_user(probability):
  answers_to_questions, grouped_questions, questions = prepare_diff()

  answered = get_answered_diff()

  not_finished = True

  while not_finished:
    questions_to_answer = get_questions_to_answer(answered, grouped_questions)

    random_topic_question = sample_question(questions_to_answer)

    print("Difficulty:", random_topic_question["difficulty"].to_string(index=False))
    question = random_topic_question["question"].to_string(index=False)


    generated_answer = questions[questions['question'] == question]['answer'].to_string(index=False)

    answer = get_answers_to_questions_diff(answers_to_questions, question)

    print(question)
    if decision(probability):
      if not generated_answer:
        user_answer = answer
        check_answer(answer, answered, generated_answer, user_answer)
      else:
        user_answer = generated_answer
        check_answer(answer, answered, generated_answer, user_answer)
    else:
      check_answer(answer, answered, generated_answer, "Wrong!!!")
    answered = get_answered_diff()
    if answered["Hard"] == True:
      print("Finished arrived at hard questions")
      not_finished = False

In [15]:
# Probability of answering a question right
diffuculty_user(0.3)

Difficulty: Easy
What is a baby dog called?
Difficulty: Medium
Who invented the telephone?
Difficulty: Medium
Who is the latest official Disney princess?
Difficulty: Hard
Which console has sold the most
Difficulty: Hard
In which year did IT Act came into force in In...
Difficulty: Hard
Who won the Turing award in 2005
Difficulty: Hard
What is the name of the purple Teletubby?
Finished arrived at hard questions
