<a href="https://colab.research.google.com/github/seb19283/COMP550-Final-Project/blob/main/COMP_550_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
import gzip
import pandas as pd
import json
import numpy as np 

# Margot

In [None]:
def run_margot(text, timeout=None):
  """Send `text` to the MARGOT web service and return the raw response body.

  Args:
    text: The document to analyse; sent as the "text" field of a JSON payload.
    timeout: Optional timeout forwarded to `requests.post` (seconds, or a
      (connect, read) tuple). The default `None` keeps the previous behaviour
      of waiting indefinitely for the server.

  Returns:
    The response body as a string. The HTTP status code is not checked here,
    so an error page is returned as-is.
  """
  data = {'text': text}
  headers = {'content-type': 'application/json'}
  response = requests.post(
      'https://penelope.vub.be/margot-api/track-arguments',
      json=data,
      headers=headers,
      timeout=timeout,
  )
  return response.text

def margot_to_json(response):
  """Decode a JSON string and return the value stored under its "document" key.

  Args:
    response: JSON text, e.g. the string returned by `run_margot`.

  Returns:
    Whatever the top-level "document" entry holds.

  Raises:
    json.JSONDecodeError: if `response` is not valid JSON.
    KeyError: if the decoded object has no "document" key.
  """
  parsed = json.loads(response)
  return parsed['document']

# IMDb Dataset

In [None]:
# Load the IMDB data (one JSON record per line) from the project repository.
# It is processed in a similar way to the Amazon dataset.
IMDB_file = 'https://raw.githubusercontent.com/seb19283/COMP550-Final-Project/main/margot.json'
IMDB = pd.read_json(
    IMDB_file,
    orient='values',
    lines=True,
)
IMDB

# Margot Vec

In [None]:
import numpy as np
# MARGOT Feature Vectorization
# [avg_claim, avg_evidence, max_claim, max_evidence,
#  num_c_over0, num_e_over0, pctg_c_over0, pctg_e_over0]

def margot_Vec(dataframe):
  """Build an 8-column feature matrix from the "margot" column of `dataframe`.

  Each entry of `dataframe["margot"]` is turned into a DataFrame and must then
  expose "claim_score" and "evidence_score" columns. Per entry the features are:

    0. mean claim score
    1. mean evidence score
    2. max claim score
    3. max evidence score
    4. number of rows with claim score > 0
    5. number of rows with evidence score > 0
    6. fraction (0..1) of rows with claim score > 0
    7. fraction (0..1) of rows with evidence score > 0

  Entries that produce an empty DataFrame yield an all-zero feature row.

  Args:
    dataframe: Object whose "margot" item is iterable, one entry per sample.

  Returns:
    A float numpy array of shape (len(dataframe), 8).
  """
  v = np.zeros((len(dataframe), 8))
  # Iterate positionally: indexing with dataframe["margot"][i] is a label
  # lookup and fails (or picks the wrong row) when the index is not 0..n-1,
  # e.g. after filtering or shuffling the frame.
  for i, entry in enumerate(dataframe["margot"]):
    d = pd.DataFrame(entry)
    if d.empty:
      # Row i is already zero-initialised.
      continue

    claim = d["claim_score"]
    evidence = d["evidence_score"]
    num_c_over0 = np.sum(claim > 0)
    num_e_over0 = np.sum(evidence > 0)

    v[i] = [
        np.average(claim),
        np.average(evidence),
        np.max(claim),
        np.max(evidence),
        num_c_over0,
        num_e_over0,
        num_c_over0 / len(d),
        num_e_over0 / len(d),
    ]
  return v

# Build the MARGOT feature matrix for each dataset.
# NOTE(review): `amazon` is not defined in the visible part of this notebook —
# confirm it is loaded before this cell runs, otherwise this raises NameError.
amazon_vec = margot_Vec(amazon)
IMDB_vec = margot_Vec(IMDB)

In [None]:
import scipy
def appendVec(target, vec):
  """Horizontally stack `vec` onto the right of `target` as a sparse matrix.

  Args:
    target: Left-hand block (e.g. a sparse text-encoding matrix).
    vec: Right-hand block with the same number of rows as `target`.

  Returns:
    The sparse result of `scipy.sparse.hstack((target, vec))`.
  """
  # Import the submodule explicitly: a bare `import scipy` does not guarantee
  # that `scipy.sparse` is loaded on every SciPy version.
  from scipy.sparse import hstack
  return hstack((target, vec))

# Model


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Text encodings without the MARGOT vector: (BOW counts, TF-IDF, binary counts)
def transform_frame(dataframe,textName):
  """Encode the `textName` column of `dataframe` in three ways.

  All three vectorizers use bigrams only (ngram_range=(2,2)) and the built-in
  English stop-word list, and each one is fitted on the whole column.

  Args:
    dataframe: Container indexable by `textName`, yielding the raw documents.
    textName: Name of the text column.

  Returns:
    A tuple (X_bag, X_Tfidf, X_bayes): bigram counts, TF-IDF weights and
    binary (presence/absence) bigram indicators.
  """
  documents = dataframe[textName]
  shared_options = {'ngram_range': (2, 2), 'stop_words': 'english'}

  X_bag = CountVectorizer(**shared_options).fit_transform(documents)
  X_Tfidf = TfidfVectorizer(**shared_options).fit_transform(documents)
  X_bayes = CountVectorizer(binary=True, **shared_options).fit_transform(documents)
  return (X_bag, X_Tfidf, X_bayes)

In [None]:
# Encode the IMDB "review" column; `b` holds the (X_bag, X_Tfidf, X_bayes)
# tuple returned by transform_frame.
b = transform_frame(IMDB,"review")

In [None]:
# Hyperparameter: reviews whose helpfulness is below this value get label 0,
# every other review gets label 1.
IMDB_useful_threshold = 0.5
Y_IMDB = [
    0 if score < IMDB_useful_threshold else 1
    for score in IMDB["helpfulness"]
]

In [None]:
# Class balance: how many labels equal 1, and how many do not.
ones = Y_IMDB.count(1)
zeros = len(Y_IMDB) - ones

print(ones, zeros)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
import random
import sys
def eval(X, Y, model, random_state):
    """Fit a fresh `model` on an 80/20 split and score it on the held-out part.

    Note: this function shadows Python's built-in `eval` for the rest of the
    notebook.

    Args:
        X: Feature matrix.
        Y: Labels aligned with the rows of `X`.
        model: Zero-argument callable (e.g. an estimator class) returning an
            object with `fit` and `predict`.
        random_state: Seed forwarded to `train_test_split`.

    Returns:
        A tuple (f1, accuracy, recall, precision) computed on the test split.
        Precision is reported as 1 when there are no positive predictions
        (zero_division=1).
    """
    # 80% train, remaining 20% test.
    split = train_test_split(X, Y, train_size=0.8, random_state=random_state)
    train_X, test_X, train_Y, test_Y = split

    fitted = model().fit(train_X, train_Y)
    predicted = fitted.predict(test_X)

    return (
        f1_score(test_Y, predicted),
        accuracy_score(test_Y, predicted),
        recall_score(test_Y, predicted),
        precision_score(test_Y, predicted, zero_division=1),
    )

def report_and_apply_margot(model,dataset_Name,x,label,margot_vec,random_state=None):
  """Print metrics for `model` on MARGOT features, text features, and both.

  The same split seed is used for all three evaluations so the results are
  comparable with each other.

  Args:
    model: Zero-argument callable returning an estimator (e.g. a class).
    dataset_Name: Label used in the printed report lines.
    x: Text feature matrix.
    label: Target labels aligned with the rows of `x` and `margot_vec`.
    margot_vec: MARGOT feature matrix.
    random_state: Optional seed for the train/test split. When None (the
      default) a random seed is drawn, as before; pass an int for
      reproducible reports.
  """
  if random_state is None:
    random_state = random.randint(0, 2**32 - 1)

  # Instantiate once just to read the estimator's class name, instead of
  # building a throwaway model for every printed line.
  model_name = type(model()).__name__

  print(f"{model_name}  on {dataset_Name} margot ONLY. Result(f1,acc,rec,prec) : {eval(margot_vec,label,model,random_state)}")
  print(f"{model_name}  on {dataset_Name}. Result(f1,acc,rec,prec) : {eval(x,label,model,random_state)}")
  print(f"{model_name}  on {dataset_Name} with margot. Result(f1,acc,rec,prec) : {eval(appendVec(x,margot_vec),label,model,random_state)}")
  

def getResult(encodings,label,dataset_Name,margot_vec):
  """Report every model on the BOW and TF-IDF encodings, with and without MARGOT.

  Models are run in the order LogisticRegression, MultinomialNB,
  RandomForestClassifier, MLPClassifier, with a blank line printed between
  models. For MultinomialNB the MARGOT features are min-max scaled first
  (presumably because that model needs non-negative inputs — confirm).

  Args:
    encodings: Tuple (X_bag, X_Tfidf, X_bayes); the third item is unpacked
      but not used here.
    label: Target labels.
    dataset_Name: Prefix for the names shown in the report lines.
    margot_vec: MARGOT feature matrix.
  """
  X_bag, X_Tfidf, _unused_binary = encodings

  # (estimator class, whether the MARGOT features are min-max scaled)
  plan = (
      (LogisticRegression, False),
      (MultinomialNB, True),
      (RandomForestClassifier, False),
      (MLPClassifier, False),
  )
  text_encodings = (("BOW", X_bag), ("Tfidf", X_Tfidf))

  for position, (estimator, needs_scaling) in enumerate(plan):
    if position > 0:
      print()
    for suffix, features in text_encodings:
      if needs_scaling:
        margot_features = MinMaxScaler().fit_transform(margot_vec)
      else:
        margot_features = margot_vec
      report_and_apply_margot(estimator, dataset_Name+suffix, features, label, margot_features)



In [None]:
# Run every model/encoding combination on the IMDB data and print the metrics.
getResult(b,Y_IMDB,"IMDB",IMDB_vec)