In [None]:
import argparse
import ast
import codecs
import json
import logging
import multiprocessing
import os
import pickle
import re
import shutil
import sys
import time
from collections import namedtuple, OrderedDict
from datetime import datetime
from random import shuffle, randint

import gensim
import gensim.models.doc2vec
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from smart_open import open
from tqdm.auto import tqdm

In [None]:
def _load_stopwords(stopwords_path):
  """Read a UTF-8 stopword file (one word per line) into a set of lower-cased words."""
  # The context manager closes the file handle deterministically.
  with codecs.open(stopwords_path, "r", "utf-8") as handle:
    return {line.strip().lower() for line in handle}


def _tokenize(doc, stopwords):
  """Turn one raw document string into the token list used for Doc2Vec.

  Newlines and the danda ("।") become spaces, every remaining character that is
  neither a word character nor whitespace becomes a space, and the text is split on
  whitespace. Stopwords and single-character tokens are dropped.
  """
  text = doc.replace("\n", " ").replace("।", " ")
  tokens = re.sub(r'[^\w\s]', " ", text).split()
  # NOTE(review): stopwords are lower-cased on load but tokens are compared as-is,
  # so the filter is case-sensitive for cased scripts -- confirm this is intended.
  return [w for w in tokens if w not in stopwords and len(w) > 1]


def doc2vec(X_train,y_train,X_test,y_test,svc_c=0,num_epochs=100,alpha=0.015):
  """Train a Doc2Vec model on X_train and report classifier accuracy on X_test.

  The training documents are tokenised, tagged with their position in X_train and
  used to train a Doc2Vec model. Training vectors are read back from the model by
  tag; test vectors are inferred. An SVC and a GradientBoostingClassifier are then
  fitted on the training vectors and their test accuracy is printed.

  Args:
    X_train: iterable of raw training document strings.
    y_train: labels for X_train, in the same order.
    X_test: iterable of raw test document strings.
    y_test: labels for X_test, in the same order.
    svc_c: regularisation parameter C for the SVC. A falsy value (0 or None)
      means "use scikit-learn's default C".
    num_epochs: number of Doc2Vec training epochs.
    alpha: initial Doc2Vec learning rate (decays to 0.001 during training).

  Returns:
    None. Progress messages and the two accuracies are printed.

  Raises:
    AssertionError: if gensim's optimised (compiled) routines are unavailable.
  """
  stopwords_path='../utils/stopwords.txt'
  vocab_min_count=5
  vector_size=200
  min_alpha=0.001
  window=5
  negative = 5
  hs = 0

  assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"
  stopwords = _load_stopwords(stopwords_path)
  cores = multiprocessing.cpu_count()

  # Tag each document with its position so its trained vector can be looked up
  # again after the corpus has been shuffled.
  docs = [
    TaggedDocument(words=_tokenize(doc, stopwords), tags=[i])
    for i, doc in enumerate(X_train)
  ]

  # Build the model exactly once, after the corpus is assembled, so it does not
  # depend on the corpus loop having executed at least one iteration.
  model = Doc2Vec(dm=1, dm_mean=1, vector_size=vector_size, window=window, negative=negative, hs=hs,
                  min_count=vocab_min_count, workers=cores)
  model.build_vocab(docs)
  shuffle(docs)
  print("Training")
  model.train(docs, total_examples=len(docs),
              epochs=num_epochs, start_alpha=alpha, end_alpha=min_alpha,report_delay=60)

  # Tags are 0..len(docs)-1, so the lookup order matches the order of y_train.
  Xtr = [model.dv.get_vector(i) for i in range(len(docs))]
  # NOTE(review): infer_vector is called without epochs/alpha, so it relies on the
  # model's own settings rather than num_epochs/alpha -- confirm this is intended.
  Xte = [model.infer_vector(_tokenize(doc, stopwords)) for doc in X_test]

  print("Classifying")
  # Honour svc_c when it is set; fall back to the library default for 0/None,
  # neither of which SVC accepts as C.
  classifiers = (
    ("SVC Accuracy: ", SVC(C=svc_c) if svc_c else SVC()),
    ("GradientBoostingClassifier Accuracy: ", GradientBoostingClassifier()),
  )
  for label, clf in classifiers:
    clf.fit(Xtr, y_train)
    y_pred = clf.predict(Xte)
    print(label,classification_report(y_test, y_pred,output_dict=True)['accuracy'])

In [None]:
input_path = "../data/data.csv"
df=pd.read_csv(input_path)
# The 'sentences' column stores Python literals as text and must be parsed back into
# objects. ast.literal_eval only accepts literals, so a crafted CSV cell cannot run
# arbitrary code the way eval() would.
df['sentences']=df['sentences'].apply(ast.literal_eval)

In [None]:
# Collapse each row's sequence of sentences into one space-separated string.
X = df['sentences'].map(" ".join)
# Target labels, taken from the same frame so they share X's index.
y = df['label']

In [None]:
# Hold out 20% of the documents for evaluation; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  test_size=0.2,
  random_state=42,
)

In [None]:
# Train the document embeddings and print the test accuracy of both classifiers.
doc2vec(
  X_train,
  y_train,
  X_test,
  y_test,
  svc_c=None,
  num_epochs=100,
  alpha=0.015,
)