In [64]:
import pandas as pd
import numpy as np
import nltk
import scipy
from nltk import word_tokenize
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
nltk.download('punkt')
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score, precision_recall_fscore_support

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Reading Data

In [65]:
def read_data(file_name) -> pd.DataFrame:
  """Load the CSV at ``file_name`` into a DataFrame.

  Args:
    file_name: path or file-like object, forwarded unchanged to ``pd.read_csv``.

  Returns:
    The parsed table, read with pandas' default CSV options.
  """
  frame = pd.read_csv(file_name)
  return frame

# Feature Generation

In [66]:
class NameFeatureGenerator:
  """Character n-gram TF-IDF featurizer for identifier-like names.

  Names are normalized (underscores removed, lower-cased) and then vectorized
  with a ``TfidfVectorizer`` using the ``"char"`` analyzer. The vectorizer is
  fitted once, in the constructor, on the names passed there.
  """

  def __init__(self, names: np.ndarray, ngram_range=(3, 4), max_features=1000):
    """Fit the TF-IDF vocabulary on ``names``.

    Args:
      names: iterable of name strings to learn the n-gram vocabulary from.
      ngram_range: (min_n, max_n) character n-gram sizes; defaults to the
        previously hard-coded (3, 4).
      max_features: cap on the vocabulary size; defaults to the previously
        hard-coded 1000.
    """
    normalized_names = self.tokenize(names)
    self.vectorizer = TfidfVectorizer(
        ngram_range=ngram_range, max_features=max_features, analyzer="char")
    self.vectorizer.fit(normalized_names)

  def generate_features(self, names: np.ndarray) -> "scipy.sparse.csr_matrix":
    """Return the TF-IDF matrix for ``names`` using the fitted vocabulary.

    The result is whatever ``TfidfVectorizer.transform`` returns, i.e. a SciPy
    sparse matrix with one row per name (not a dense ``np.ndarray``).
    """
    return self.vectorizer.transform(self.tokenize(names))

  def tokenize(self, names: np.ndarray) -> list:
    """Normalize each name: drop every underscore and lower-case the rest.

    Despite the method name, no splitting into tokens is kept; the pieces are
    re-joined so that e.g. ``"First_Name"`` becomes ``"firstname"``.
    """
    return [name.replace("_", "").lower() for name in names]

In [67]:
class EnumFeatureGenerator:
  """Binarized (indicator) encoding of a categorical property via ``LabelBinarizer``."""

  def __init__(self, property_name, values: np.array):
    """Remember the property name and fit a ``LabelBinarizer`` on ``values``."""
    binarizer = LabelBinarizer()
    binarizer.fit(values)
    self.encoder = binarizer
    self.property_name = property_name

  def generate_features(self, values: np.array):
    """Encode ``values`` with the fitted binarizer and return the result."""
    encoded = self.encoder.transform(values)
    return encoded

  def get_label_from_features(self, features: np.array):
    """Map encoded ``features`` back to labels via the binarizer's inverse transform."""
    decoded = self.encoder.inverse_transform(features)
    return decoded


In [68]:
class LabelFeatureGenerator:
  """Integer encoding of a categorical property via ``LabelEncoder``."""

  def __init__(self, property_name, values: np.array):
    """Remember the property name and fit a ``LabelEncoder`` on ``values``."""
    label_encoder = LabelEncoder()
    label_encoder.fit(values)
    self.encoder = label_encoder
    self.property_name = property_name

  def generate_features(self, values: np.array):
    """Encode ``values`` with the fitted encoder and return the result."""
    encoded = self.encoder.transform(values)
    return encoded

  def get_label_from_features(self, features: np.array):
    """Map encoded ``features`` back to labels via the encoder's inverse transform."""
    decoded = self.encoder.inverse_transform(features)
    return decoded

# Preprocessing and Transformations

In [69]:
def get_all_features(input_data: pd.DataFrame):
  """Build the model inputs and targets from a raw data frame.

  Reads the "Name", "Datatype" and "Glossary" columns of ``input_data`` (the
  previous version ignored this argument and read a notebook-global ``df``).

  Args:
    input_data: frame containing the "Name", "Datatype" and "Glossary" columns.

  Returns:
    A ``(merged_features, glossary_labels)`` tuple:
      merged_features: CSR matrix with the name features and the datatype
        features stacked side by side, one row per input row.
      glossary_labels: encoded "Glossary" values from LabelFeatureGenerator.

  NOTE(review): every generator is fitted on the full frame here, before the
  train/test split done in the train_model_* helpers, so vocabulary and label
  statistics of the test rows are visible at fit time.
  """
  names_arr = np.array(input_data["Name"])
  name_feature_gen = NameFeatureGenerator(names_arr)
  name_features = name_feature_gen.generate_features(names_arr)

  datatypes_arr = np.array(input_data["Datatype"])
  datatype_feature_gen = EnumFeatureGenerator("Datatype", datatypes_arr)
  dt_features = datatype_feature_gen.generate_features(datatypes_arr)

  # Column-wise concatenation; converted to CSR so rows can be sliced later.
  merged_features = scipy.sparse.hstack((name_features, dt_features)).tocsr()

  glossary_arr = np.array(input_data["Glossary"])
  glossary_labelizer = LabelFeatureGenerator("Glossary", glossary_arr)
  glossary_labels = glossary_labelizer.generate_features(glossary_arr)
  # Only the heading is printed (no shape follows); kept unchanged so the
  # output of this function stays the same as before.
  print("Glossary labels shape")
  return merged_features, glossary_labels

In [80]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.svm import SVC

In [71]:
# Load the sample dataset and turn it into a feature matrix plus encoded labels.
df = read_data("sample_data/sample_dataset.csv")
X_features, Y_output = get_all_features(df)

# Report the shapes, each under its heading line.
print("All features shape", X_features.shape, sep="\n")
print("Output labels are", Y_output.shape, sep="\n")

Glossary labels shape
All features shape
(200, 244)
Output labels are
(200,)


# Defining Models

In [164]:
def train_model_ada_boost(X_features, Y_output, test_size=0.3, random_state=None):
  """Fit an AdaBoost classifier on a random split and predict the held-out rows.

  Args:
    X_features: feature matrix, one row per sample.
    Y_output: target labels aligned with the rows of ``X_features``.
    test_size: forwarded to ``train_test_split``; defaults to the previously
      hard-coded 0.3.
    random_state: seed forwarded to both ``train_test_split`` and
      ``AdaBoostClassifier``. ``None`` (default) keeps the previous unseeded
      behaviour; pass an int to make the split and the metrics reproducible.

  Returns:
    ``(y_test, predictions)``: the held-out labels and the predictions for them.
  """
  X_train, X_test, y_train, y_test = train_test_split(
      X_features, Y_output, test_size=test_size, random_state=random_state)

  classifier = AdaBoostClassifier(n_estimators=100, random_state=random_state)
  classifier.fit(X_train, y_train)

  predictions = classifier.predict(X_test)
  return y_test, predictions

In [259]:
def train_model_one_v_all(X_features, Y_output, test_size=0.2, random_state=None):
  """Fit a one-vs-rest SVC on a random split and predict the held-out rows.

  Args:
    X_features: feature matrix, one row per sample.
    Y_output: target labels aligned with the rows of ``X_features``.
    test_size: forwarded to ``train_test_split``; defaults to the previously
      hard-coded 0.2.
    random_state: seed forwarded to ``train_test_split``. ``None`` (default)
      keeps the previous unseeded behaviour; pass an int to make the split and
      the metrics reproducible.

  Returns:
    ``(y_test, predictions)``: the held-out labels and the predictions for them.
  """
  X_train, X_test, y_train, y_test = train_test_split(
      X_features, Y_output, test_size=test_size, random_state=random_state)

  # One binary SVC (default settings) per class, wrapped by OneVsRestClassifier.
  classifier = OneVsRestClassifier(SVC())
  classifier.fit(X_train, y_train)

  predictions = classifier.predict(X_test)
  return y_test, predictions

# AdaBoost Classifier

In [165]:
# Train AdaBoost on a fresh random split; keep the held-out labels and the predictions for them.
original_answer, predicted_answer = train_model_ada_boost(X_features, Y_output)

In [166]:
# One 2x2 confusion matrix per class, comparing the held-out labels with the AdaBoost predictions.
multilabel_confusion_matrix(original_answer, predicted_answer)

array([[[37,  0],
        [14,  9]],

       [[39,  0],
        [ 0, 21]],

       [[30, 14],
        [ 0, 16]]])

In [167]:
# Held-out ground-truth labels (y_test) returned by train_model_ada_boost.
original_answer

array([1, 2, 0, 0, 0, 1, 0, 0, 2, 0, 1, 0, 1, 2, 2, 2, 0, 1, 0, 1, 0, 0,
       0, 1, 2, 1, 2, 2, 1, 2, 0, 2, 1, 0, 2, 2, 1, 0, 1, 1, 0, 0, 0, 2,
       1, 0, 1, 1, 0, 0, 2, 2, 0, 1, 1, 0, 1, 1, 2, 1])

In [168]:
# AdaBoost predictions for the same held-out rows.
predicted_answer

array([1, 2, 2, 2, 0, 1, 2, 2, 2, 0, 1, 2, 1, 2, 2, 2, 0, 1, 0, 1, 0, 2,
       0, 1, 2, 1, 2, 2, 1, 2, 0, 2, 1, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2,
       1, 2, 1, 1, 2, 2, 2, 2, 0, 1, 1, 0, 1, 1, 2, 1])

In [169]:
# Macro-averaged precision of the AdaBoost predictions on the held-out rows.
precision_score(original_answer, predicted_answer, average="macro")

0.8444444444444444

In [170]:
# Macro-averaged (precision, recall, f-score, support) for AdaBoost; support shows as None in the output below.
precision_recall_fscore_support(original_answer, predicted_answer, average="macro")

(0.8444444444444444, 0.7971014492753623, 0.7527173913043478, None)

# One-vs-Rest Multiclass Classifier with a Support Vector Classifier

In [284]:
# Train the one-vs-rest SVC on a fresh random split; this rebinds original_answer/predicted_answer,
# replacing the AdaBoost results assigned earlier.
original_answer, predicted_answer = train_model_one_v_all(X_features, Y_output)

In [285]:
# One 2x2 confusion matrix per class, comparing the held-out labels with the one-vs-rest SVC predictions.
multilabel_confusion_matrix(original_answer, predicted_answer)

array([[[20,  0],
        [ 2, 18]],

       [[31,  0],
        [ 0,  9]],

       [[27,  2],
        [ 0, 11]]])

In [286]:
# Held-out ground-truth labels (y_test) returned by train_model_one_v_all.
original_answer

array([0, 2, 2, 0, 2, 1, 1, 2, 0, 0, 0, 2, 0, 1, 2, 0, 2, 0, 2, 1, 2, 1,
       0, 0, 1, 1, 0, 0, 2, 1, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0])

In [287]:
# One-vs-rest SVC predictions for the same held-out rows.
predicted_answer

array([0, 2, 2, 0, 2, 1, 1, 2, 0, 0, 0, 2, 0, 1, 2, 0, 2, 0, 2, 1, 2, 1,
       0, 0, 1, 1, 2, 0, 2, 1, 1, 0, 0, 0, 0, 2, 0, 2, 0, 0])

In [288]:
# Macro-averaged precision of the one-vs-rest SVC predictions on the held-out rows.
precision_score(original_answer, predicted_answer, average="macro")

0.9487179487179488

In [289]:
# Macro-averaged (precision, recall, f-score, support) for the one-vs-rest SVC; support shows as None in the output below.
precision_recall_fscore_support(original_answer, predicted_answer, average="macro")

(0.9487179487179488, 0.9666666666666667, 0.9546783625730995, None)