# Text Classification Workflow

Here’s a high-level overview of the workflow used to solve machine learning problems:

*   Step 1: Gather Data
*   Step 2.5: Choose a Model*
*   Step 3: Prepare Your Data
*   Step 4: Build, Train, and Evaluate Your Model
*   Step 5: Tune Hyperparameters
*   Step 6: Deploy Your Model

In [None]:
import os
import numpy as np
import pandas as pd


In [None]:
def _load_and_shuffle_data(data_path,
                           file_name,
                           cols,
                           seed,
                           separator=',',
                           header=0):
    """Reads selected columns of a delimited file and shuffles its rows.

    # Arguments
        data_path: string, path to the data directory.
        file_name: string, name of the data file.
        cols: list, columns to load from the data file.
        seed: int, seed for randomizer.
        separator: string, separator to use for splitting data.
        header: int, row to use as data header.

    # Returns
        A pandas DataFrame with the requested columns whose rows are
        reordered by a seeded random permutation of the original index.
    """
    # Seeding NumPy's global generator makes the permutation below
    # reproducible for a given seed.
    np.random.seed(seed)

    csv_file = os.path.join(data_path, file_name)
    frame = pd.read_csv(
        csv_file, usecols=cols, sep=separator, header=header)

    shuffled_index = np.random.permutation(frame.index)
    return frame.reindex(shuffled_index)


def _split_training_and_validation_sets(texts, labels, validation_split):
    """Partitions texts and labels into a leading training part and a
    trailing validation part.

    # Arguments
        texts: list, text data.
        labels: list, label data.
        validation_split: float, percentage of data to use for validation.

    # Returns
        A tuple of training and validation data, in the form
        ((training texts, training labels),
         (validation texts, validation labels)).
    """
    training_fraction = 1 - validation_split
    # int() truncates, so a fractional sample count rounds toward zero.
    split_at = int(training_fraction * len(texts))

    training_set = (texts[:split_at], labels[:split_at])
    validation_set = (texts[split_at:], labels[split_at:])
    return training_set, validation_set
    
def load_tweet_weather_topic_classification_dataset(data_path,
                                                    validation_split=0.2,
                                                    seed=123):
    """Loads the tweet weather topic classification dataset.

    Reads `train.csv` from `data_path`, shuffles its rows, labels every
    tweet with the position of its highest-scoring topic column, and splits
    the result into training and validation sets.

    # Arguments
        data_path: string, path to the data directory.
        validation_split: float, percentage of data to use for validation.
        seed: int, seed for randomizer.

    # Returns
        A tuple of training and validation data, in the form
        ((training texts, training labels),
         (validation texts, validation labels)).
        Texts are lists taken from the 'tweet' column; labels are numpy
        integer arrays holding, for each tweet, the 0-based position of the
        topic column with the largest score.
    """
    # Column 1 is the tweet text; columns 13-27 (inclusive) are the topics.
    columns = [1] + list(range(13, 28))
    data = _load_and_shuffle_data(data_path, 'train.csv', columns, seed)

    # Get tweet text and the confidence scores for the weather types.
    texts = list(data['tweet'])
    # Every loaded column after the first one is treated as a topic score.
    weather_data = data.iloc[:, 1:]

    # Pick the topic with the max confidence score for every row at once.
    # A single vectorized argmax avoids a Python-level loop with one
    # DataFrame.iloc lookup per tweet.
    labels = np.argmax(weather_data.values, axis=1)

    return _split_training_and_validation_sets(
        texts, labels, validation_split)

In [None]:
# Load train.csv via an empty data_path (so the file name is used as-is) and
# split it with the loader's default validation_split and seed. Note that
# the second pair (`test_data`, `test_labels`) holds the validation split.
(train_data, train_labels), (test_data, test_labels) = (
    load_tweet_weather_topic_classification_dataset(data_path=''))