In [None]:
import json
import pandas as pd


In [None]:
# File holding the training data; a later cell reads its 'questions' entry.
train_data = 'small.buzztrain.json'

# Decode explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open(train_data, encoding='utf-8') as f:
    train = json.load(f)

In [None]:
# Pull the 'questions' entry out of the loaded JSON and tabulate it.
questions = train['questions']
df = pd.DataFrame(data=questions)

In [None]:
# Read the previously exported guess table (note: the file name has no
# ".csv" extension) and preview its first rows.
data_df = pd.read_csv(filepath_or_buffer='output_data_csv')
data_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,question,correct_answer,guess,buzz
0,0,.One film produced in this country takes place...,Kingdom of Sweden [or Konungariket Sverige],Battleship_Potemkin,False
1,1,.One film produced in this country takes place...,Kingdom of Sweden [or Konungariket Sverige],David,False
2,2,.One film produced in this country takes place...,Kingdom of Sweden [or Konungariket Sverige],Battleship_Potemkin,False
3,3,.One film produced in this country takes place...,Kingdom of Sweden [or Konungariket Sverige],Martin_Luther,False
4,4,.One film produced in this country takes place...,Kingdom of Sweden [or Konungariket Sverige],Martin_Luther,False
...,...,...,...,...,...
195,195,.One of the most famous cult practices associa...,Artemis,Bear,False
196,196,.One of the most famous cult practices associa...,Artemis,Bear,False
197,197,.One of the most famous cult practices associa...,Artemis,Bear,False
198,198,.One of the most famous cult practices associa...,Artemis,Bear,False


In [None]:
def sentence_count(str):
    """Return the number of '.'-separated fragments in ``str``.

    Every period adds one fragment, so a text with no period yields 1 and
    a leading or trailing '.' contributes an extra (empty) fragment.
    """
    # N separators always split a string into N + 1 pieces.
    return str.count(".") + 1

def guess_word_count(str):
    """Return the number of '_'-separated parts in the guess ``str``."""
    # One more part than there are underscores.
    return str.count("_") + 1

def guess_has_paren(str):
    """Return 1 if ``str`` contains '(' or ')', otherwise 0."""
    if "(" in str:
        return 1
    if ")" in str:
        return 1
    return 0

def guess_length(str):
    """Return the length of the guess ``str`` in characters."""
    n_chars = len(str)
    return n_chars

# Derive one numeric feature column per helper: the question text feeds the
# sentence count, the guess string feeds the other three.
data_df["sentence_count"] = data_df["question"].map(sentence_count)
data_df["guess_word_count"] = data_df["guess"].map(guess_word_count)
data_df["guess_has_paren"] = data_df["guess"].map(guess_has_paren)
data_df["guess_length"] = data_df["guess"].map(guess_length)

In [None]:
from transformers import pipeline

# Pin the checkpoint instead of relying on the library's implicit default,
# so the entity labels stay reproducible across transformers versions.
# NOTE(review): this is believed to be the checkpoint transformers falls back
# to for the "ner" task (its label set matches the I-PER / I-LOC / I-ORG /
# I-MISC labels seen in this notebook) -- confirm against the installed version.
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")

def guess_entity(text):
  """Run the NER pipeline on ``text`` and return its first result.

  Returns the first element of the pipeline output (in this notebook a dict
  with keys such as 'entity' and 'score'), or "" when the pipeline finds no
  entity at all.
  """
  entities = ner_pipeline(text)
  # The per-row debug print was removed: this function is applied to every
  # row of the frame and the output flooded the notebook.
  if not entities:
    return ""
  return entities[0]


# Tag every guess with its first NER result ("" when nothing was detected).
data_df["guess_entity"] = data_df["guess"].map(guess_entity)




In [None]:
# Preview the frame with the newly added feature columns.
data_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,question,correct_answer,guess,buzz,sentence_count,guess_word_count,guess_has_paren,guess_length,guess_entity
0,0,.One film produced in this country takes place...,Kingdom of Sweden [or Konungariket Sverige],Battleship_Potemkin,False,2,2,0,19,"{'entity': 'I-MISC', 'score': 0.90084857, 'ind..."
1,1,.One film produced in this country takes place...,Kingdom of Sweden [or Konungariket Sverige],David,False,3,1,0,5,"{'entity': 'I-PER', 'score': 0.9964824, 'index..."
2,2,.One film produced in this country takes place...,Kingdom of Sweden [or Konungariket Sverige],Battleship_Potemkin,False,4,2,0,19,"{'entity': 'I-MISC', 'score': 0.90084857, 'ind..."
3,3,.One film produced in this country takes place...,Kingdom of Sweden [or Konungariket Sverige],Martin_Luther,False,5,2,0,13,"{'entity': 'I-PER', 'score': 0.99532074, 'inde..."
4,4,.One film produced in this country takes place...,Kingdom of Sweden [or Konungariket Sverige],Martin_Luther,False,6,2,0,13,"{'entity': 'I-PER', 'score': 0.99532074, 'inde..."


In [None]:
def parse_entity(entity):
  """Reduce one NER result to its label string.

  Args:
    entity: a result dict carrying an "entity" key, the empty string used
      when no entity was found, or a label string that has already been
      extracted.

  Returns:
    The value stored under "entity" for a dict; any string input is
    returned unchanged.
  """
  # Strings are either the "" placeholder or a label produced by an earlier
  # run of this cell. Passing them through keeps the cell re-runnable instead
  # of failing with "string indices must be integers".
  if isinstance(entity, str):
    return entity
  return entity["entity"]

# Replace each stored NER result with its label via parse_entity.
data_df["guess_entity"] = data_df["guess_entity"].map(parse_entity)


TypeError: string indices must be integers

In [None]:
# One-hot encode the entity label; the "" (no entity) value becomes a
# column whose name is the empty string.
dummies = pd.get_dummies(data=data_df.loc[:, "guess_entity"])

In [None]:
# Display the rows ordered by the buzz flag, True first.
data_df.sort_values(by='buzz', ascending=False)

Unnamed: 0.1,Unnamed: 0,question,correct_answer,guess,buzz,sentence_count,guess_word_count,guess_has_paren,guess_length
6839,6839,One porous form of this element is useful for ...,{carbon},Carbon,True,5,1,0,6
5047,5047,.This ruler set up a system of authority and l...,{Charlemagne} [or Charles I; or Charles the {G...,Charlemagne,True,3,1,0,11
5049,5049,.This ruler set up a system of authority and l...,{Charlemagne} [or Charles I; or Charles the {G...,Charlemagne,True,5,1,0,11
5050,5050,.This ruler set up a system of authority and l...,{Charlemagne} [or Charles I; or Charles the {G...,Charlemagne,True,6,1,0,11
5051,5051,.This ruler set up a system of authority and l...,{Charlemagne} [or Charles I; or Charles the {G...,Charlemagne,True,7,1,0,11
...,...,...,...,...,...,...,...,...,...
2442,2442,.While in college she was head of a science an...,Margaret Mead,James_Baldwin,False,4,2,0,13
2441,2441,.While in college she was head of a science an...,Margaret Mead,James_Baldwin,False,3,2,0,13
2440,2440,.While in college she was head of a science an...,Margaret Mead,Cat,False,2,1,0,3
2434,2434,.This school greatly influenced Geido and the ...,Zen Buddhism,Hudson_River_School,False,3,3,0,19


In [None]:
# Target: the buzz flag.
Y = data_df.loc[:, "buzz"]

# Design matrix: the four hand-crafted numeric features followed by the
# one-hot entity columns, aligned side by side.
X = pd.concat(
    [
        data_df[["sentence_count", "guess_word_count", "guess_has_paren", "guess_length"]],
        dummies,
    ],
    axis=1,
)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable. Note that this rebinds the name `train`.
train, test, train_labels, test_labels = train_test_split(
    X,
    Y,
    random_state=0,
    test_size=0.2,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Logistic regression with class weighting set to 'balanced', fitted on
# the training split.
model = LogisticRegression(
    random_state=0,
    class_weight='balanced',
)
model.fit(train, train_labels)

In [None]:
# Predict on the held-out rows and report plain accuracy.
test_pred = model.predict(test)
accuracy_score(y_true=test_labels, y_pred=test_pred)

0.6074561403508771

In [None]:
# Print each feature column next to its learned weight (first row of coef_).
coefficients = model.coef_

for feature_name, coef in zip(X.columns, coefficients[0]):
    print(f"{feature_name}: {coef}")

sentence_count: 0.27348637152316546
guess_word_count: 0.17981194231629538
guess_has_paren: -1.1791533084966264
guess_length: -0.016520933637106012
: 0.04409446309716384
I-LOC: -0.5859127041006555
I-MISC: 0.31285392317579835
I-ORG: 0.33520477393222003
I-PER: -0.10022708449378315


In [None]:
import pickle

# Persist the fitted classifier to disk. The stray bare `pickle.load`
# expression that trailed this cell did nothing and was removed.
# Only unpickle this file from a source you trust.
with open('logreg_buzzer_model.pkl', 'wb') as f:
    pickle.dump(model, f)