# Web Science Project

## Random Forest classifier

In [2]:
#Imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

# Load the training set. The file is read as semicolon-separated ISO-8859-1
# text without a header row, so pandas assigns positional column labels first.
questions = pd.read_csv(
    "data/train_dataset.csv",
    sep=";",
    encoding="iso-8859-1",
    header=None,
)

# Replace the positional labels with descriptive column names.
questions.columns = ["id", "question", "answer", "topic"]

In [4]:
# Music questions
# Take an explicit copy of the filtered rows: pandas flags column assignment on
# a filtered selection of `questions` with SettingWithCopyWarning.
music = questions[questions['topic'] == 'music'].copy()
print(len(music.index))
# Character count of each text; `.str.len()` yields NaN for missing text
# instead of raising.
music['question_length'] = music['question'].str.len()
# Measured on the answer text (not the question) so the two columns differ.
music['answer_length'] = music['answer'].str.len()
print(np.mean(music['question_length']))
print(np.std(music['question_length']))

3675
155.84244897959184
65.00327103848835


In [5]:
# Sport questions
# Take an explicit copy of the filtered rows: pandas flags column assignment on
# a filtered selection of `questions` with SettingWithCopyWarning.
sports = questions[questions['topic'] == 'sports'].copy()
print(len(sports.index))
# Character count of each text; `.str.len()` yields NaN for missing text
# instead of raising.
sports['question_length'] = sports['question'].str.len()
# Measured on the answer text (not the question) so the two columns differ.
sports['answer_length'] = sports['answer'].str.len()
print(np.mean(sports['question_length']))
print(np.std(sports['question_length']))

2357
139.90666100975815
50.6033325978995


In [6]:
# Science questions
# Take an explicit copy of the filtered rows: pandas flags column assignment on
# a filtered selection of `questions` with SettingWithCopyWarning.
science = questions[questions['topic'] == 'science-technology'].copy()
print(len(science.index))
# Character count of each text; `.str.len()` yields NaN for missing text
# instead of raising.
science['question_length'] = science['question'].str.len()
# Measured on the answer text (not the question) so the two columns differ.
science['answer_length'] = science['answer'].str.len()
print(np.mean(science['question_length']))
print(np.std(science['question_length']))

1844
135.76030368763557
53.10247763862941


In [7]:
# Kids questions
# Take an explicit copy of the filtered rows: pandas flags column assignment on
# a filtered selection of `questions` with SettingWithCopyWarning.
kids = questions[questions['topic'] == 'for-kids'].copy()
print(len(kids.index))
# Character count of each text; `.str.len()` yields NaN for missing text
# instead of raising.
kids['question_length'] = kids['question'].str.len()
# Measured on the answer text (not the question) so the two columns differ.
kids['answer_length'] = kids['answer'].str.len()
print(np.mean(kids['question_length']))
print(np.std(kids['question_length']))

574
133.1567944250871
51.9573312024634


In [8]:
# Game questions
# Take an explicit copy of the filtered rows: pandas flags column assignment on
# a filtered selection of `questions` with SettingWithCopyWarning.
games = questions[questions['topic'] == 'video-games'].copy()
print(len(games.index))
# Character count of each text; `.str.len()` yields NaN for missing text
# instead of raising.
games['question_length'] = games['question'].str.len()
# Measured on the answer text (not the question) so the two columns differ.
games['answer_length'] = games['answer'].str.len()
print(np.mean(games['question_length']))
print(np.std(games['question_length']))

463
118.54211663066954
34.718706988572016


In [43]:
# Classifier Model (Random Forest)

# Fixed seed so the shuffle and the forest give the same result on every run.
SEED = 42

# Strip punctuation (anything that is neither a word character nor whitespace).
# The pattern is a raw string and `regex=True` is explicit: pandas >= 2.0 treats
# the pattern as a literal string by default, which would remove nothing here.
questions['question'] = questions['question'].str.replace(r'[^\w\s]', '', regex=True)
questions['answer'] = questions['answer'].str.replace(r'[^\w\s]', '', regex=True)

# Shuffle, then split 80% / 10% / 10% into train / validate / test.
# Plain positional slices replace `np.split`, which is deprecated for DataFrames.
shuffled = questions.sample(frac=1, random_state=SEED)
train_end = int(.8 * len(shuffled))
validate_end = int(.9 * len(shuffled))
train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]

y = train[['topic']]

# Build the vocabulary from BOTH text columns. The second positional argument
# of `CountVectorizer.fit` is `y` and is ignored, so passing the answers there
# would leave words that only occur in answers out of the vocabulary.
vectorizer = CountVectorizer(analyzer="word")
vectorizer.fit(pd.concat([train['question'], train['answer']]))
# One bag-of-words row per sample: question counts plus answer counts.
train_data_features = vectorizer.transform(train['question']).toarray() + vectorizer.transform(train['answer']).toarray()

forest = RandomForestClassifier(n_estimators=300, random_state=SEED)
forest = forest.fit(train_data_features, y['topic'])

In [30]:
# Test Data
# Vectorise the held-out split like the training data: question counts and
# answer counts are added element-wise into a single dense feature matrix.
test_data_features = np.add(
    vectorizer.transform(test['question']).toarray(),
    vectorizer.transform(test['answer']).toarray(),
)

# Predict a topic for every test row and save the (id, topic) pairs.
result = forest.predict(test_data_features)
output = pd.DataFrame({"id": test["id"], "topic": result})
output.to_csv('final.csv', index=False)
# print(output)

In [46]:
from sklearn.metrics import accuracy_score,recall_score,confusion_matrix
# print(accuracy_score(test["topic"],result))

## Neural Classifier

## Crowdsourcing classification 

In [47]:
# Crowdsourced questions: tab-separated text read as ISO-8859-1.
# `na_filter=False` keeps empty fields as empty strings instead of NaN.
# NOTE(review): the recorded output of this cell shows a garbled apostrophe in
# one question, so the file may not actually be ISO-8859-1 -- confirm.
generated_questions = pd.read_csv(
    "data/sample_crowdsourcing.tsv",
    delimiter="\t",
    encoding="ISO-8859-1",
    na_filter=False,
)
generated_questions.columns = ["id", "question", "answer", "difficulty", "opinion", "factuality"]

print(generated_questions.head(5))

# Convert the two rating columns to numbers; unparseable values become NaN.
generated_questions = generated_questions.assign(
    opinion=pd.to_numeric(generated_questions["opinion"], errors="coerce"),
    factuality=pd.to_numeric(generated_questions["factuality"], errors="coerce"),
)

# generated_questions["opinion"] = generated_questions['opinion'].astype('int')

# grouped_questions = generated_questions.groupby(['question'])

# Keep every row of a question whose factuality ratings, summed over all of its
# rows, stay below 1 (NaN ratings are skipped by `sum`).
generated_question = generated_questions.groupby('question').filter(
    lambda group: group['factuality'].sum() < 1
)

def majority(arr):
    """Return the most frequent value in ``arr``.

    When several distinct values share the highest count, the tied values are
    added together with the builtin ``sum`` and that total is returned instead
    of any single one of them.

    Raises ``ValueError`` for an empty input (no maximum count exists).
    """
    uniques, freqs = np.unique(arr, return_counts=True)
    # All values whose count equals the highest count, in sorted order.
    top = uniques[freqs == freqs.max()]
    if len(top) == 1:
        return top[0]
    return sum(top)
        

# print(generated_questions.dtypes)
# print(generated_questions.head(7))

# print(generated_questions.groupby(['question'])['difficulty'].agg(majority))

# Vectorise the crowdsourced rows with the vocabulary fitted on the training
# data (question counts + answer counts) and predict a topic for each row.
generated_data_features = np.add(
    vectorizer.transform(generated_question['question']).toarray(),
    vectorizer.transform(generated_question['answer']).toarray(),
)
result = forest.predict(generated_data_features)

# Original columns plus the predicted topic, written out as a UTF-8 CSV.
output = pd.DataFrame(
    data={
        "id": generated_question["id"],
        "question": generated_question["question"],
        "answer": generated_question["answer"],
        "difficulty": generated_question["difficulty"],
        "opinion": generated_question["opinion"],
        "factuality": generated_question["factuality"],
        "topic": result,
    }
)
output.to_csv('generated.csv', encoding='utf-8', index=False)


    id                                           question  \
0    0  Which food contain more vitamin C; A lemon or ...   
1    0                   What is the largest human organ?   
2    0  How many dwarf planets are in the solar system...   
3    0  In which country were the first Olympic Games ...   
4    0  Which is the only country to have played in ea...   
5    0  Which popular fitness method was invented by a...   
6    0  What is the name of the cartoon where there ar...   
7    0       Who is the worldÃ¢ÂÂs largest land animal?   
8    0                 How many colours are in a rainbow?   
9    0                      Where is the group abba from?   
10   0  Which one direction member left the group in 2...   
11   0  What singer holds the world record for most wo...   
12   0                        What year did Wii come out?   
13   0  What is the highest selling video game console...   
14   0  What is the first female fighting game charact...   
15   0                  