In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import random
import string
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.layers import Activation, Dense, LSTM
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Trending-video metadata; the cells below read its `title` and `category_id` columns.
youtube_data = pd.read_csv('/kaggle/input/youtube-new/USvideos.csv')
# Category lookup file; a later cell iterates its "items" entries to map ids to names.
categories = pd.read_json('/kaggle/input/youtube-new/US_category_id.json')

# Second trending dataset; note that its category column is named `categoryId`, not `category_id`.
additional_data = pd.read_csv('/kaggle/input/youtube-trending-video-dataset/US_youtube_trending_data.csv')

In [None]:
# Combine the title/category columns of both datasets into a single frame.
# NOTE: no category filtering happens in this cell; the two categories to compare
# are selected later by get_data_with_categories().
titles1 = pd.DataFrame({"title": youtube_data.title, "category_id": youtube_data.category_id })
titles2 = pd.DataFrame({"title": additional_data.title, "category_id": additional_data.categoryId})
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# When a title occurs more than once, only its last occurrence is kept.
all_data = pd.concat([titles1, titles2]).drop_duplicates(subset='title', keep="last")

all_data.groupby("category_id").size() # Number of titles in each category

In [None]:
# Build an {id: name} lookup from the "items" entries of the category JSON
category_dict = {
    int(entry['id']): entry['snippet']['title']
    for entry in categories["items"]
}

# Attach the readable category name to every row of the first dataset
youtube_data['category'] = youtube_data['category_id'].map(category_dict)

category_dict

# Classifier functions

In [None]:
def get_data_with_categories(data, id1, id2):
    """Select the rows of two categories and label them for binary classification.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``category_id`` column.
    id1, id2
        Category ids to keep. Rows of ``id1`` get label 0, rows of ``id2`` get label 1.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) holding only the rows whose
        ``category_id`` is ``id1`` or ``id2``, with two added columns:
        ``label`` (0 or 1) and ``generated`` (always 0 here).
    """
    # Take an explicit copy: assigning columns to a filtered slice triggers
    # SettingWithCopyWarning and may silently not do what is intended.
    selected = data.loc[data["category_id"].isin([id1, id2])].copy()
    selected["label"] = (selected["category_id"] == id2).astype(int)  # id1: label=0, id2: label=1
    selected["generated"] = 0
    return selected

def train_and_run_classifier(data):
    """Train a bag-of-words logistic regression on titles and return its test accuracy.

    `data` needs the columns ``title``, ``label`` and ``generated``. Rows with
    ``generated == 0`` are split 75/25 into train and test (fixed random state);
    rows with ``generated == 1`` are added to the training portion only, so the
    returned score is always measured on non-generated titles.
    """
    real_rows = data.generated == 0
    generated_rows = data.generated == 1

    # Hold out a quarter of the real titles for testing
    titles_train, titles_test, y_train, y_test = train_test_split(
        data.title[real_rows].values,
        data.label[real_rows].values,
        test_size=0.25,
        random_state=123,
    )

    # Generated titles only ever extend the training portion
    titles_train = np.append(titles_train, data.title[generated_rows].values)
    y_train = np.append(y_train, data.label[generated_rows].values)

    # Word counts, with the vocabulary learned from the training titles only
    vectorizer = CountVectorizer()
    vectorizer.fit(titles_train)
    X_train, X_test = (vectorizer.transform(batch) for batch in (titles_train, titles_test))

    # Fit the classifier and report its accuracy on the held-out titles
    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    return classifier.score(X_test, y_test)
    

# Word-level generator functions

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
import itertools
import nltk

from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Dropout, TimeDistributed
from keras.layers import LSTM
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint


class KerasBatchGenerator(object):
    """Endless generator of next-token training batches over a flat token stream.

    Each sample is a window of ``num_steps`` token indices (the input) paired
    with the same window shifted by one position, one-hot encoded over
    ``vocabulary`` classes (the target).

    Parameters
    ----------
    data : sequence of int
        Token indices of the whole corpus, concatenated.
    num_steps : int
        Window length (time steps per sample).
    batch_size : int
        Samples per yielded batch.
    vocabulary : int
        Number of classes for the one-hot targets.
    word_to_index : dict
        Token -> index mapping; used to find the index of the end-of-title token.
    skip_step : int, default 5
        How far the window start advances between consecutive samples.
    end_token : str, optional
        Token marking the end of a title. When omitted, the module-level
        ``title_end_token`` is looked up at the time ``generate()`` starts running.
    """

    def __init__(self, data, num_steps, batch_size, vocabulary, word_to_index, skip_step=5, end_token=None):
        self.data = data
        self.num_steps = num_steps
        self.batch_size = batch_size
        self.vocabulary = vocabulary
        # Position of the next window start; wraps back to zero once a full
        # window (inputs plus shifted targets) no longer fits in the data.
        self.current_idx = 0
        # Number of tokens the window start moves forward after each sample.
        self.skip_step = skip_step

        self.word_to_index = word_to_index
        # None means: fall back to the module-level title_end_token.
        self.end_token = end_token

    def generate(self):
        """Yield ``(x, y)`` batches forever.

        ``x`` has shape ``(batch_size, num_steps)`` and holds token indices;
        ``y`` has shape ``(batch_size, num_steps, vocabulary)`` and holds the
        one-hot encoded next tokens. Fresh arrays are allocated for every batch,
        so a batch that was already yielded is never overwritten by a later one.

        Raises
        ------
        ValueError
            If the data is too short to hold one window plus its shifted target.
        """
        if len(self.data) <= self.num_steps:
            raise ValueError(
                "data must contain more than num_steps (%d) tokens, got %d"
                % (self.num_steps, len(self.data))
            )
        end_token = title_end_token if self.end_token is None else self.end_token
        end_index = self.word_to_index[end_token]
        step_positions = np.arange(self.num_steps)

        while True:
            x = np.zeros((self.batch_size, self.num_steps))
            y = np.zeros((self.batch_size, self.num_steps, self.vocabulary))
            for i in range(self.batch_size):
                # A window should not start on an end-of-title token: the token
                # following it belongs to an unrelated title.
                if self.current_idx < len(self.data) and self.data[self.current_idx] == end_index:
                    self.current_idx += self.skip_step
                if self.current_idx + self.num_steps >= len(self.data):
                    # reset the index back to the start of the data set
                    self.current_idx = 0
                window = self.data[self.current_idx:self.current_idx + self.num_steps + 1]
                x[i, :] = window[:-1]
                # One-hot encode the shifted window directly into the target array
                targets = np.asarray(window[1:], dtype=int)
                y[i, step_positions, targets] = 1.0
                self.current_idx += self.skip_step
            yield x, y
            
            
# Special tokens shared by the word-level generator functions below.
# Start/end markers are wrapped around every tokenized sentence of a title.
title_start_token = "TITLE_START"
title_end_token = "TITLE_END"
# Stand-in for every word that did not make it into the vocabulary.
unknown_token = "UNKNOWN_TOKEN"

def create_and_train_generator(data, category_id, vocabulary_size=250):
    """Train a word-level LSTM language model on the titles of one category.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``title`` and ``category_id`` columns.
    category_id
        Only titles of this category are used.
    vocabulary_size : int, default 250
        Number of token classes: the start and end markers, the most frequent
        words, and the unknown token (always the last index). Must be >= 3.

    Returns
    -------
    (model, word_to_index, index_to_word)
        The trained Keras model, the token -> index dict and the index -> token list.

    Raises
    ------
    ValueError
        If ``vocabulary_size`` is below 3 or the category yields no tokens.
    """
    from collections import Counter  # local import keeps the block self-contained

    if vocabulary_size < 3:
        raise ValueError("vocabulary_size must be at least 3 (start, end and unknown tokens)")

    # --- Tokenize -----------------------------------------------------------
    titles = data.title[data["category_id"] == category_id].to_numpy()
    special_tokens = (title_start_token, title_end_token)

    sentences = itertools.chain.from_iterable(nltk.sent_tokenize(x.lower()) for x in titles)
    tokenized_titles = []
    for sentence in sentences:
        tokens = nltk.word_tokenize("%s %s %s" % (title_start_token, sentence, title_end_token))
        # Keep alphabetic words only; the markers contain "_" and need an explicit pass.
        tokenized_titles.append([tok for tok in tokens if tok.isalpha() or tok in special_tokens])

    # --- Vocabulary ---------------------------------------------------------
    # Count only tokens that survived filtering, so punctuation cannot occupy
    # vocabulary slots that would never appear in the training data.
    word_freq = Counter(tok for title in tokenized_titles for tok in title)
    frequent_words = [w for w, _ in word_freq.most_common() if w not in special_tokens]
    # The markers are always included: generation starts from and stops at them.
    index_to_word = list(special_tokens) + frequent_words[:vocabulary_size - 3]
    index_to_word.append(unknown_token)
    word_to_index = {word: index for index, word in enumerate(index_to_word)}

    # --- Training data ------------------------------------------------------
    num_steps = 1
    skip_step = 1
    batch_size = 10
    train_data_proportion = 0.90

    # One flat stream of token indices; out-of-vocabulary words map to the unknown token.
    # (Built as a plain list: np.asarray on ragged nested lists raises on NumPy >= 1.24.)
    unknown_index = word_to_index[unknown_token]
    X = [word_to_index.get(tok, unknown_index) for title in tokenized_titles for tok in title]
    if not X:
        raise ValueError("no usable title tokens for category_id=%r" % (category_id,))

    split_index = round(len(X) * train_data_proportion)
    # Split without dropping the token at split_index
    train_tokens, valid_tokens = X[:split_index], X[split_index:]

    train_data_generator = KerasBatchGenerator(train_tokens, num_steps, batch_size, vocabulary_size, word_to_index, skip_step)
    valid_data_generator = KerasBatchGenerator(valid_tokens, num_steps, batch_size, vocabulary_size, word_to_index, skip_step)

    # --- Model --------------------------------------------------------------
    hidden_size = 250

    model = Sequential()
    model.add(Embedding(vocabulary_size, hidden_size, input_length=num_steps))
    model.add(LSTM(hidden_size, return_sequences=True))
    model.add(LSTM(hidden_size, return_sequences=True))
    model.add(Dropout(rate=0.5))
    model.add(TimeDistributed(Dense(vocabulary_size)))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])

    # --- Train --------------------------------------------------------------
    num_epochs = 7
    # At least one step per phase, so tiny corpora do not request zero steps.
    steps_per_epoch = max(1, len(train_tokens) // (batch_size * num_steps))
    validation_steps = max(1, len(valid_tokens) // (batch_size * num_steps))
    # Model.fit accepts generators directly; fit_generator is deprecated and gone in Keras 3.
    model.fit(
        train_data_generator.generate(),
        steps_per_epoch=steps_per_epoch,
        epochs=num_epochs,
        validation_data=valid_data_generator.generate(),
        validation_steps=validation_steps,
    )
    return model, word_to_index, index_to_word


def generate_one_title(model, word_to_index, index_to_word, max_words=50):
    """Sample one title, word by word, from a trained generator model.

    Generation starts from the start token and feeds the most recent token back
    into ``model.predict`` until the end token is drawn.

    Parameters
    ----------
    model
        Object whose ``predict(x)`` takes a ``(1, 1)`` array holding one token
        index and returns next-token probabilities indexable as ``[0][0]``.
    word_to_index : dict
        Token -> index mapping; must contain the start, end and unknown tokens.
    index_to_word : list
        Index -> token list.
    max_words : int, default 50
        Upper bound on the title length, so that a model which never emits the
        end token cannot keep this function running forever.

    Returns
    -------
    list of str
        The sampled words, without the start/end markers. May be empty.
    """
    start_index = word_to_index[title_start_token]
    end_index = word_to_index[title_end_token]
    # Never sample the unknown token or a second start marker.
    banned = [word_to_index[unknown_token], start_index]

    words = []
    current = start_index
    while len(words) < max_words:
        x = np.zeros((1, 1))
        x[0, :] = current
        # float64 copy: avoids rounding trouble when renormalizing float32 output
        probs = np.array(model.predict(x)[0][0], dtype=np.float64)
        probs[banned] = 0.0
        # Indices without a word (vocabulary smaller than the output layer) are unusable too
        probs[len(index_to_word):] = 0.0
        total = probs.sum()
        if not np.isfinite(total) or total <= 0.0:
            # Nothing sampleable is left (all mass was on banned tokens): end the title
            # here instead of rejection-sampling forever.
            break
        # Sampling from the renormalized distribution is equivalent to redrawing
        # until a non-banned token comes up, but always terminates.
        current = int(np.random.choice(len(probs), p=probs / total))
        if current == end_index:
            break
        words.append(index_to_word[current])
    return words

    
def generate_titles(model, word_to_index, index_to_word, num_titles):
    """Generate `num_titles` titles, each between 2 and 8 words long.

    Titles are drawn with generate_one_title(); draws outside the length range
    are thrown away and repeated. A progress line is printed at every 100th
    title. Returns the titles as a list of space-joined strings.
    """
    shortest, longest = 2, 8

    def _draw_acceptable():
        # Redraw until the word count falls inside [shortest, longest]
        while True:
            candidate = generate_one_title(model, word_to_index, index_to_word)
            if shortest <= len(candidate) <= longest:
                return candidate

    collected = []
    for count in range(num_titles):
        collected.append(" ".join(_draw_acceptable()))
        if count % 100 == 0:
            print("Generated", count, "titles")

    return collected


# Main program

Run the cells below to train a classifier for two categories, generate new titles in those categories, and then train a new classifier with the generated titles added to the training data.

In [None]:
# Choose which categories to classify
# (rows of category_id1 get label 0, rows of category_id2 get label 1)
category_id1 = 1
category_id2 = 20


# Keep only the two chosen categories and add the `label` / `generated` columns
data = get_data_with_categories(all_data, category_id1, category_id2)

# Baseline: test accuracy of the classifier trained on the real titles only
score = train_and_run_classifier(data)

# Create models
# One word-level generator is trained per category.
# NOTE(review): vocabulary_size counts the unknown token too, so a value of 5 leaves
# at most four real vocabulary entries - presumably a quick-run setting; confirm.
vocabulary_size = 5
model1, word_to_index1, index_to_word1 = create_and_train_generator(data, category_id1, vocabulary_size)
model2, word_to_index2, index_to_word2 = create_and_train_generator(data, category_id2, vocabulary_size)


In [None]:
# Preview the titles generated for category_id2.
# `generated_titles2` is only created by the generation cell further down, so a bare
# reference here stops a fresh "Restart & Run All" with a NameError. Look it up
# defensively; re-run this cell after the generation cell to see the titles.
globals().get("generated_titles2", "generated_titles2 is not defined yet - run the generation cell below first")

In [None]:
# Number of titles to sample from each category's generator
num_titles = 300
print("Generating titles of category", category_id1)
generated_titles1 = generate_titles(model1, word_to_index1, index_to_word1, num_titles)
print("Generating titles of category", category_id2)
generated_titles2 = generate_titles(model2, word_to_index2, index_to_word2, num_titles)

In [None]:
# Re-train the classifier with 0, 100, 200, ... generated titles per category added
# to the training data and record the test accuracy for each amount.
append_titles = list(range(0,num_titles+1,100))
aug_scores = []
for index in append_titles:
    frames = [data]
    if index > 0:
        # Generated rows carry generated=1, so train_and_run_classifier uses them for training only.
        frames.append(pd.DataFrame({"title": generated_titles1[:index], "category_id": category_id1, "label": 0, "generated": 1}))
        frames.append(pd.DataFrame({"title": generated_titles2[:index], "category_id": category_id2, "label": 1, "generated": 1}))
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    # Empty frames (index == 0) are left out so they cannot influence the result dtypes.
    aug_data = pd.concat(frames)
    aug_scores.append(train_and_run_classifier(aug_data))

print("Accuracy before augmentation:", score)
best_score_index = np.argmax(aug_scores)
print("Best accuracy after augmenting", append_titles[best_score_index], "titles:", aug_scores[best_score_index])

fig = plt.figure(figsize=(15,10))
ax = fig.add_axes([0,0,1,1])
ax.bar([str(elem) for elem in append_titles], aug_scores)

ax.set_xlabel('Generated titles added per category')
ax.set_ylabel('Test accuracy')
ax.set_title('Accuracy vs. # of augmented titles (Vocabulary size =' + str(vocabulary_size) + ')')
# NOTE: fixed y-range; bars for accuracies outside 0.8-0.9 are clipped.
ax.set_ylim(0.8,0.9)
plt.show()



In [None]:
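# Example of one tokenized title, "Rick and Morty: Why Morty Matters":
# punctuation is dropped, start/end markers are added, and a word outside the vocabulary becomes UNKNOWN_TOKEN.
#['TITLE_START','rick','and','morty','why','morty','UNKNOWN_TOKEN','TITLE_END']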
