In [94]:
import pandas as pd
import numpy as np
import os
import re
import string
import unicodedata
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

import warnings 
warnings.filterwarnings('ignore')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


# 1. lower case
def lower_case(text):
    return text.lower()

# 2. remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# 3. remove stop words
def remove_stop_words(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [w for w in word_tokens if not w in stop_words]
    return ' '.join(filtered_text)

# 4. remove numbers
def remove_numbers(text):
    return re.sub(r'\d+', '', text)

# 5. remove short words
def remove_short_words(text):
    word_tokens = word_tokenize(text)
    filtered_text = [w for w in word_tokens if len(w) > 2]
    return ' '.join(filtered_text)

# 6. lemmatize
def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    word_tokens = word_tokenize(text)
    lemmatized_text = [lemmatizer.lemmatize(w) for w in word_tokens]
    return ' '.join(lemmatized_text)

# 8. remove non-ascii characters
def remove_non_ascii(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

# 9. remove extra spaces
def remove_extra_spaces(text):
    return re.sub(' +', ' ', text)

# 16. remove extra non-breaking spaces
def remove_extra_non_breaking_spaces(text):
    return re.sub('\xa0+', '', text)

[nltk_data] Downloading package stopwords to /home/kamal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/kamal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/kamal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [95]:
def process_title(text):
    text = remove_punctuation(text)
    text = remove_stop_words(text)
    text = lower_case(text)
    text = lemmatize(text)
    text = remove_extra_non_breaking_spaces(text)

    return text

def process_keywords(text):
    # split the keywords based on ";"
    tokens = text.split(";")
    tokens = [remove_stop_words(t) for t in tokens]
    tokens = [remove_non_ascii(t) for t in tokens ]

    
    tokens = [t.split("/") for t in tokens]
    tokens = list(itertools.chain.from_iterable(tokens)) 
    
    tokens = [remove_extra_non_breaking_spaces(t) for t in tokens]
    tokens = [lemmatize(t) for t in tokens ]

    tokens = [lower_case(t) for t in tokens]

    return " ".join(tokens) 

def process_target(label):
    label_map={
            "Relevant":1,
            "Not relevant":0
    }
    return label_map[label]

In [96]:
import itertools
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer


class TFIDFVectorizer:
    def __init__(self, **params):
        self.params = params
        
    def vectorize(self, sentences):
        vectorizer = TfidfVectorizer(**self.params)
        vectors = vectorizer.fit_transform(sentences)
        
        return vectors, vectorizer

class PreProcessor:
    def __init__(self, **params):
        self.params = params
        self.vectorizer = TFIDFVectorizer(**params["vectorizer"])
        
        self.train_transform = self.params["train_transform"]
        self.test_transform = self.params["test_transform"]
        
    def serializer(self, object=None, path=None, mode="save"):
        if mode=="save":
            with open(path, 'wb') as handle:
                pickle.dump(object, handle, protocol=pickle.HIGHEST_PROTOCOL)
            return True
            
        if mode=="load":
            with open(path, 'rb') as handle:
                object = pickle.load(handle)
            return object
            
    def __dimension_check(self, ndim):
        if ndim < self.params["vectorizer"]["max_features"]:
            raise Exception (f"""Maximum features is more than number of TFIDF tokens, {ndim} < {self.params["vectorizer"]["max_features"]}""")
    
    def process(self, df, mode):      
        feature_vectors = []

        if mode=="train":

            feature_vectorizers = {}
            
            for column, transform in self.train_transform.items():
                df[column] = df[column].apply(transform)
                
                if column!="target":
                    vecs, vectorizer = self.vectorizer.vectorize(df[column].values)
                    self.__dimension_check(vecs.toarray().shape[1])                    
                    
                    feature_vectors.append(vecs.toarray())
                    feature_vectorizers[column] = vectorizer
                else:
                    y = df[column].values

            if self.serializer(feature_vectorizers, self.params["vectorizer_checkpoint"], "save"):
                print("Vectorizers successfully saved")
                    
        elif mode=="test":
            print("Loading saved vectorizers")
            

            if not os.path.exists(self.params["vectorizer_checkpoint"]):
                raise Exception("Vectorizers missing for test")

            feature_vectorizers = self.serializer(path = self.params["vectorizer_checkpoint"], mode="load")
                
            for column, transform in self.test_transform.items():
                df[column] = df[column].apply(transform)

                if column!="target":
                    vecs = feature_vectorizers[column].transform(df[column].values)
                    self.__dimension_check(vecs.toarray().shape[1])
                    
                    feature_vectors.append(vecs.toarray())
                else:
                    y = df[column].values
        else:
            raise Exception("Preprocessing mode is not identified")
                    
        X = np.stack(feature_vectors, axis=1)
        X = np.reshape(X, (X.shape[0], -1))
        
        return X, y

In [97]:
params = {
    "vectorizer":{
        "max_features":3475, 
        "ngram_range":(1,2)
    },
    "train_transform":{
        "title":process_title,
        "keywords":process_keywords,
        "target":process_target
    },
    "test_transform":{
        "title":process_title,
        "keywords":process_keywords,
        "target":process_target    
    },
    "vectorizer_checkpoint": "vectorizers.pkl"
}

processor = PreProcessor(**params)

In [98]:
data = pd.read_excel("../data/raw/20230821_full_data.xlsx")

test = pd.read_excel("../data/raw/test_ids.xlsx")
test_ids = list(test["id"].values)

train = data[~data["id"].isin(test_ids)]
test = data[data["id"].isin(test_ids)]

In [99]:
train_X, train_y = processor.process(train, mode="train")

Vectorizers successfully saved


In [100]:
test_X, test_y = processor.process(test, mode="test")

Loading saved vectorizers


In [101]:
train_X.shape, train_y.shape, test_X.shape, test_y.shape

((1595, 6950), (1595,), (786, 6950), (786,))

In [102]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0).fit(train_X, train_y)

In [103]:
from sklearn.metrics import accuracy_score

def model_accuracy(predictions, actual):
    return accuracy_score(predictions, actual)

clf.score(train_X, train_y)

0.9028213166144201

In [104]:
predictions = clf.predict(test_X)

accuracy = model_accuracy(predictions, test_y)
accuracy

0.7239185750636132

In [105]:
clf.predict(test_X[:2, :])

array([1, 1])

In [130]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=0, criterion="entropy")
clf.fit(train_X, train_y)

In [131]:
clf.score(train_X, train_y)

1.0

In [132]:
predictions = clf.predict(test_X)

accuracy = model_accuracy(predictions, test_y)
accuracy

0.7442748091603053