In [1]:
import csv
import fastText
import nltk
import os
import pandas as pd
import string

nltk.download("stopwords")

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to /home/ano/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Define the file paths used throughout the notebook:
the input directory from which the raw CSV files are read, and the output directory where the processed files are stored.
Also set the path of the trained classifier model and of the fastText train/validation files.

In [2]:
# Directories: raw CSV files are read from INPUT_DIR, results are written to OUTPUT_DIR.
INPUT_DIR = 'data/input_csv/'
OUTPUT_DIR = 'data/preprocessed_csv'

# Pre-trained classifier and the fastText train / validation files.
MODEL_PATH = 'model/model_ted.bin'
TRAIN_FILE_PATH = 'data/ted.train'
VALIDATION_FILE_PATH = 'data/ted.valid'

# Every entry found in INPUT_DIR (sub-directories included), prefixed with the directory.
INPUT_FILES = [
    os.path.join(INPUT_DIR, entry) for entry in os.listdir(INPUT_DIR)
]
# Text file that receives the fastText-formatted data.
OUTPUT_FILE = os.path.join(OUTPUT_DIR, 'ted-new.txt')

In [3]:
def data_stemming(data):
    """Apply Porter stemming to every whitespace-separated word.

    eg:
    "python","pythoner","pythoning","pythoned" will be reduced to "python"

    args:
        data: pandas Series of strings.

    returns:
        Series of the same length in which each value is the stemmed
        words joined back together with single spaces.
    """
    porter = PorterStemmer()

    def _stem_sentence(sentence):
        # split() without arguments also collapses repeated whitespace
        stems = []
        for token in sentence.split():
            stems.append(porter.stem(token))
        return ' '.join(stems)

    return data.apply(_stem_sentence)


def data_cleaning(data):
    """Remove punctuation and English stopwords, lowercase, then stem.

    args:
        data: pandas Series of text (e.g. the ``title`` column).
            Missing values are treated as empty strings.

    returns:
        Series of the same length holding the cleaned, stemmed text.
    """
    # Missing titles would otherwise reach .split() as float NaN and crash.
    data = data.fillna('').astype(str)

    # Delete punctuation character by character instead of through a regex:
    # Series.str.replace treats the pattern literally unless regex=True is
    # passed (the default since pandas 2.0), and the unescaped character class
    # built from string.punctuation never matched a backslash anyway.
    punctuation_table = str.maketrans('', '', string.punctuation)
    data = data.str.translate(punctuation_table)

    # A set gives constant-time membership tests for every token.
    stop = set(stopwords.words('english'))

    def _drop_stopwords(text):
        # Lowercase BEFORE the stopword test; the stopword list is lowercase,
        # so "The" / "Is" would otherwise slip through.
        tokens = (token.lower() for token in text.split())
        return ' '.join(token for token in tokens if token not in stop)

    data = data.apply(_drop_stopwords)

    return data_stemming(data)


def csv_reader():
    """Preprocess every input CSV and save the result under OUTPUT_DIR.

    For each path in INPUT_FILES:

    - prepare the output path ``OUTPUT_DIR/processed-<file name>``.
    - clean the ``title`` column with ``data_cleaning`` (punctuation
      removal, stopwords removal and stemming for better word matching).
    - write the frame back as CSV without the index column.

    Directories listed in INPUT_FILES (e.g. ``.ipynb_checkpoints``) are
    skipped, and OUTPUT_DIR is created when it does not exist yet.
    """
    # to_csv does not create parent directories, so make sure it is there.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    for file in INPUT_FILES:
        # os.listdir also returns sub-directories such as the Jupyter
        # checkpoint folder; those are not CSV files.
        if os.path.isdir(file):
            continue

        output_path = os.path.join(
            OUTPUT_DIR, 'processed-' + os.path.basename(file))

        # read input file as df
        df = pd.read_csv(file, engine='python')

        df['title'] = data_cleaning(df['title'])

        df.to_csv(output_path, index=False)


# Run the preprocessing step: clean the 'title' column of each input CSV and save it under OUTPUT_DIR.
csv_reader()

Function that converts the columnar CSV data into the plain-text format that fastText recognizes.

In [4]:
def _tags_to_labels(raw_tags):
    """Turn the raw ``tags`` cell of one row into fastText label tokens."""
    labels = []
    for tag in raw_tags.strip().strip('[]').split(','):
        # Drop the padding and quotes left over from list-style cells such as
        # "['python', 'os']"; without this the output contained an empty
        # "__label__" token and labels wrapped in quotes.
        tag = tag.strip().strip('\'"').strip()
        if not tag:
            continue
        # fastText splits on whitespace, so a label must be a single token.
        labels.append('__label__' + '-'.join(tag.split()))
    return labels


def data_to_fasttext():
    """Convert data from input_csv format to fasttext format.

    =======
    input_csv:
    =======
    tags            title
    python, os      this problem is crashing my python shell.

    ========
    fasttext:
    ========
    __label__python __label__os this problem is crashing my python shell.

    Every path in INPUT_FILES that is not a directory is read as a CSV with
    ``tags`` and ``title`` columns; one output line is produced per row and
    all lines are written to OUTPUT_FILE. Tags are stripped of surrounding
    brackets, quotes and whitespace, empty tags are dropped, and whitespace
    inside a tag is replaced by ``-``.

    NOTE(review): this reads the raw INPUT_FILES, not the ``processed-*``
    files written by csv_reader -- confirm that this is intended.
    """
    output_lines = []

    for file in INPUT_FILES:
        # os.listdir also returns sub-directories (e.g. .ipynb_checkpoints).
        if os.path.isdir(file):
            continue

        # newline='' is what the csv module expects for files it parses.
        with open(file, newline='') as csvfile:
            for row in csv.DictReader(csvfile, delimiter=','):
                tokens = _tags_to_labels(row['tags'])
                tokens.append(row['title'])
                output_lines.append(' '.join(tokens) + '\n')

    # save resulting lines to a text file
    with open(OUTPUT_FILE, 'w') as f:
        f.writelines(output_lines)
        
# Build the fastText-format text file at OUTPUT_FILE from the input CSVs.
data_to_fasttext()

fastText wrapper class used to load the trained model, score it, and predict tags for a given piece of text.

In [5]:
class ImplementFastText:
    """Interface to fastText library.

    Takes no constructor arguments; the model and validation file locations
    come from the module-level ``MODEL_PATH`` and ``VALIDATION_FILE_PATH``.

    - load existing model
    - predict tags
    - calculate test scores

    """

    def __init__(self):
        """Init class vars."""
        # No model is available until load_model() has been called.
        self.model = None
        # Raw return values of the last score() / predict() call.
        self.model_score = None
        self.result = None
        self.precision = 0.0
        self.recall = 0.0
        self.f_score = 0.0

    def load_model(self):
        """Load pre-trained model from ``MODEL_PATH``."""
        self.model = fastText.load_model(MODEL_PATH)

    def score(self):
        """Get classification scores on ``VALIDATION_FILE_PATH``.

        returns:
            dict with ``num_samples`` plus ``precision``, ``recall`` and
            ``f_score`` rounded to three decimals. ``f_score`` is 0.0 when
            precision and recall are both zero.
        """
        self.model_score = self.model.test(VALIDATION_FILE_PATH)

        num_samples = self.model_score[0]
        self.precision = self.model_score[1]
        self.recall = self.model_score[2]

        # Harmonic mean of precision and recall; guard the 0/0 case, which
        # previously raised ZeroDivisionError.
        denominator = self.precision + self.recall
        if denominator:
            self.f_score = 2 * ((self.precision * self.recall) / denominator)
        else:
            self.f_score = 0.0

        return {
            'num_samples': num_samples,
            'precision': round(self.precision, 3),
            'recall': round(self.recall, 3),
            'f_score': round(self.f_score, 3)
        }

    def predict(self, question, num_tags=5):
        """Get predicted tags with probability scores.

        args:
            question: list of texts handed to the model; only the
                predictions for the first entry are returned.
            num_tags: number of labels requested from the model.

        returns:
            dict mapping tag name (``__label__`` prefix removed) to its
            probability rounded to three decimals. The dict can hold fewer
            than ``num_tags`` entries because an empty tag name is dropped.
        """
        self.result = self.model.predict(question, k=num_tags)

        tags = [tag.replace('__label__', '') for tag in self.result[0][0]]
        probability_score = [round(score, 3) for score in self.result[1][0]]

        predicted_tags = dict(zip(tags, probability_score))
        # A bare "__label__" yields an empty tag name; drop it when present.
        # The default avoids a KeyError when the model returned no such label.
        predicted_tags.pop('', None)

        return predicted_tags

In [6]:
def get_predictions(question, no_of_tags):
    """Return the predicted tags for a question together with model scores.

    A fresh ImplementFastText instance is created and the trained model is
    loaded on every call; the question is wrapped in a one-element list
    before it is handed to ``predict``.

    returns:
        tuple ``(predicted_tags, scores)``.
    """
    classifier = ImplementFastText()
    classifier.load_model()

    # Tuple elements are evaluated left to right: predict first, then score.
    return (
        classifier.predict([question], num_tags=no_of_tags),
        classifier.score(),
    )

In [9]:
# Ask for 7 tags for a sample question; the call returns (predicted tags, validation scores).
get_predictions('Initiating a motivational talk', 7)

({"'business'": 0.041,
  "'entertainment'": 0.023,
  "'children'": 0.012,
  "'data'": 0.009,
  "'activism'": 0.009,
  "'communication'": 0.006},
 {'num_samples': 550, 'precision': 1.0, 'recall': 0.085, 'f_score': 0.157})