# Web Science Project

## Random Forest classifier

In [2]:
#Imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords

questions = pd.read_csv("data/train_dataset.csv", header=None, encoding="iso-8859-1", sep=";")

# Name columns
questions.columns = ['id', 'question', 'answer', 'topic']

In [2]:
#Music Questions
music = questions[questions['topic'] == 'music']
print(len(music.index))
music.loc[:, 'question_length'] = music.loc[:,'question'].apply(len)
music.loc[:, 'answer_length'] = music.loc[:,'question'].apply(len)
print(np.mean(music['question_length']))
print(np.std(music['question_length']))

3675
155.84244897959184
65.00327103848835


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [3]:
# Sport Questions
sports = questions[questions['topic'] == 'sports']
print(len(sports.index))
sports.loc[:, 'question_length'] = sports.loc[:,'question'].apply(len)
sports.loc[:, 'answer_length'] = sports.loc[:,'question'].apply(len)
print(np.mean(sports['question_length']))
print(np.std(sports['question_length']))

2357
139.90666100975815
50.6033325978995


In [4]:
#Science Questions
science = questions[questions['topic'] == 'science-technology']
print(len(science.index))
science.loc[:, 'question_length'] = science.loc[:,'question'].apply(len)
science.loc[:, 'answer_length'] = science.loc[:,'question'].apply(len)
print(np.mean(science['question_length']))
print(np.std(science['question_length']))

1844
135.76030368763557
53.10247763862941


In [5]:
# Kids Questions
kids = questions[questions['topic'] == 'for-kids']
print(len(kids.index))
kids.loc[:, 'question_length'] = kids.loc[:,'question'].apply(len)
kids.loc[:, 'answer_length'] = kids.loc[:,'question'].apply(len)
print(np.mean(kids['question_length']))
print(np.std(kids['question_length']))

574
133.1567944250871
51.9573312024634


In [6]:
# Game questions
games = questions[questions['topic'] == 'video-games']
print(len(games.index))
games.loc[:, 'question_length'] = games.loc[:,'question'].apply(len)
games.loc[:, 'answer_length'] = games.loc[:,'question'].apply(len)
print(np.mean(games['question_length']))
print(np.std(games['question_length']))

463
118.54211663066954
34.718706988572016


In [7]:
# Classifier Model (Random Forest)

# 80% , 10%, 10%
train, validate, test = np.split(questions.sample(frac=1), [int(.8 * len(questions)), int(.9 * len(questions))])

train['question'] = train['question'].str.replace('[^\w\s]','')
train['answer'] = train['answer'].str.replace('[^\w\s]','')

y = train[['topic']]

vectorizer = CountVectorizer(analyzer="word")
vectorizer.fit(train['question'], train['answer'])
train_data_features = vectorizer.transform(train['question']).toarray()

forest = RandomForestClassifier(n_estimators = 300)
forest = forest.fit(train_data_features, y['topic'])

KeyboardInterrupt: 

In [None]:
# Test Data
test_data_features = vectorizer.transform(test['question']).toarray() + vectorizer.transform(test['answer']).toarray()

result = forest.predict(test_data_features)
output = pd.DataFrame(data={"id":test["id"], "topic":result})
output.to_csv('final.csv', index=False)
 # print(output)

In [None]:
from sklearn.metrics import accuracy_score,recall_score,confusion_matrix
# print(accuracy_score(test["topic"],result))

## Neural Classifier

In [None]:
from sklearn.preprocessing import LabelEncoder

import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(train['question'])

X_train = tokenizer.texts_to_matrix(train['question'])
X_test = tokenizer.texts_to_matrix(test['question'])

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

from keras.preprocessing.sequence import pad_sequences

maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)



from keras.models import Sequential
from keras import layers

max_words = 100

model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()


encoder = LabelEncoder()
encoder.fit(train["topic"])
y_train = encoder.transform(train["topic"])
y_test = encoder.transform(test["topic"])



num_classes = 5
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)

model.fit(X_train, y_train, epochs=2, batch_size=128)

loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))


## Crowdsourcing classification 

In [None]:
generated_questions = pd.read_csv("data/sample_crowdsourcing.tsv", encoding="ISO-8859-1", delimiter="\t", na_filter=False)
generated_questions.columns = ['id', 'question', 'answer', 'difficulty', 'opinion', 'factuality']

print(generated_questions.head(5))

generated_questions["opinion"] = pd.to_numeric(generated_questions["opinion"], errors='coerce')
generated_questions["factuality"] = pd.to_numeric(generated_questions["factuality"], errors='coerce')

# generated_questions["opinion"] = generated_questions['opinion'].astype('int')

# grouped_questions = generated_questions.groupby(['question'])

generated_question = generated_questions.groupby('question').filter(lambda x: x['factuality'].sum() < 1)

def majority(arr):
    (values,counts) = np.unique(arr,return_counts=True)
    ind=np.argmax(counts)
    if len(values[counts == counts.max()]) > 1:
        return sum(values[counts == counts.max()])
    else:
        return values[ind]
        

# print(generated_questions.dtypes)
# print(generated_questions.head(7))

#print(generated_questions.groupby(['question'])['difficulty'].agg(majority))

generated_data_features = vectorizer.transform(generated_question['question']).toarray()
result = forest.predict(generated_data_features)
output = pd.DataFrame(data={"id":generated_question["id"], "question":generated_question["question"], "answer":generated_question["answer"], "difficulty":generated_question["difficulty"], "opinion":generated_question["opinion"], "factuality":generated_question["factuality"], "topic":result})
output.to_csv('generated.csv',encoding='utf-8', index=False)


FileNotFoundError: File b'data/classified.csv' does not exist