### Import Packages

In [1]:
#@title Imports

!pip install pydot --quiet
!pip install gensim==3.8.3 --quiet
!pip install tensorflow-datasets --quiet
!pip install -U tensorflow-text==2.8.2 --quiet
!pip install transformers --quiet
!pip install pydot --quiet
!pip install tensorflow_addons --quiet

[K     |████████████████████████████████| 24.2 MB 19.4 MB/s 
[K     |████████████████████████████████| 4.9 MB 5.0 MB/s 
[K     |████████████████████████████████| 4.4 MB 5.0 MB/s 
[K     |████████████████████████████████| 6.6 MB 34.2 MB/s 
[K     |████████████████████████████████| 596 kB 36.6 MB/s 
[K     |████████████████████████████████| 101 kB 9.1 MB/s 
[K     |████████████████████████████████| 1.1 MB 4.9 MB/s 
[?25h

In [2]:
# Import packages
import pandas as pd
import numpy as np
from tensorflow import keras

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import tensorflow_datasets as tfds
import tensorflow_text as tf_text
import tensorflow_addons as tfa

from google.colab import drive

from collections import Counter
import matplotlib.pyplot as plt
from nltk.util import ngrams

from transformers import BertTokenizer, TFBertModel
from tqdm.notebook import tqdm
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
import tensorflow as tf


import time
from transformers import create_optimizer

import sklearn as sk
import os
import nltk
from nltk.corpus import reuters
from nltk.data import find

import re

import gensim
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertModel


In [3]:
# Notebook-wide pandas display settings, one option per call.
pd.set_option("display.max_rows", None)            # display all rows
pd.set_option("display.max_columns", None)         # display all columns
pd.set_option("display.max_colwidth", None)        # expand column width
pd.set_option("display.html.use_mathjax", False)   # disable Latex style mathjax rendering

In [4]:
# Mount Google Drive at /drive so the project files are reachable from this Colab runtime
drive.mount('/drive') 
# Switch the working directory to the notebook folder; the relative '../../data/...' paths used below resolve from here
%cd /drive/MyDrive/W266 Project/Colab Notebooks/Exploration
# Echo the working directory to confirm the change took effect
!pwd

Mounted at /drive
/drive/MyDrive/W266 Project/Colab Notebooks/Exploration
/drive/MyDrive/W266 Project/Colab Notebooks/Exploration


In [5]:
# Read the test CSV and shuffle its rows with a fixed seed.
t = (
    pd.read_csv('../../data/transformed/final/test.csv')
    .sample(frac=1, random_state=2)
)

In [6]:
t.shape[0]  # number of rows in the shuffled test frame

63978

In [7]:
# Read the train CSV and the test CSV (the pool that val and test are carved
# from below), shuffling each with a fixed seed so the row order is reproducible.
train = (
    pd.read_csv('../../data/transformed/final/train.csv')
    .sample(frac=1, random_state=2)
)
val_test = (
    pd.read_csv('../../data/transformed/final/test.csv')
    .sample(frac=1, random_state=2)
)

In [8]:
# Every earlier experiment used at most 100k rows with an 80/20 train/val split,
# so the 20k rows that served as validation there have to stay in validation here.
# Sampling again with the same seed is relied on to reproduce that same selection.
min_val = val_test.sample(n=int(100000 * .2), random_state=2)

In [10]:
# Whatever is left over (~43k rows) can go to either val or test without any
# risk of contamination, so collect the index labels of every row that was
# not drawn into min_val.
total_index = val_test.index
val_index = min_val.index
remain_index = total_index.difference(val_index, sort=False)

In [11]:
# quick check on totals: sizes of the val, full and remaining indices, in that order
print(*map(len, (val_index, total_index, remain_index)))

20000 63978 43978


In [12]:
# Rows that are still free to be assigned to either val or test
remain_val_test = val_test.loc[remain_index]
rvt_index = remain_val_test.index

# Confirm that none of those labels also appear in val, looking in both directions
a = rvt_index.intersection(val_index, sort=False)
b = val_index.intersection(rvt_index, sort=False)
print(f'intersection between val and our isolated set is {len(a)+len(b)} indices long')

intersection between val and our isolated set is 0 indices long


In [21]:
# The held-back test/val data now has to be divided so that val and test each
# come as close as possible to an 80/20 split against train.
# The lines below report: how many held-back rows exist, how many an exact
# 80/20 split would need, how far short we are, and how many rows val still
# needs once the pool is simply halved between val and test.

print(f'We have {len(total_index)} test/val observations, and {len(val_index)} of them are already in val')
print(f'With a train size of {len(train)}, we need {int((len(train)/.8) - len(train))} in both val and test to achieve 80/20 splits')
print(f'With only {len(total_index)} total observations in test/val, we are {int((len(train)/.8) - len(train) - len(total_index)/2)} observations short in both val and test to achieve 80/20 splits')
# The pool is halved between val and test (train is untouched), so the message names those two sets.
print(f'We will do the best we can here, and hold {int(len(total_index)/2)} for val and test each.')
print(f'That means that since val already has {len(val_index)} obs, we need to add {int(len(total_index)/2 - len(val_index))} more')

We have 63978 test/val observations, and 20000 of them are already in val
With a train size of 159571, we need 39892 in both val and test to achieve 80/20 splits
With only 63978 total observations in test/val, we are 7903 observations short in both val and test to achieve 80/20 splits
We will do the best we can here, and hold 31989 for train and val each.
That means that since val already has 20000 obs, we need to add 11989 more


In [28]:
# Number of extra validation rows to draw: half of all test/val rows, minus
# the rows val already holds. Derived from the data instead of hard-coded so
# it stays correct if the input files change.
n_val_plus = len(total_index) // 2 - len(val_index)

# Draw the extra validation rows from the unassigned pool; everything that is
# not drawn becomes the test set.
val_plus = remain_val_test.sample(n = n_val_plus,
                                  random_state = 2)
val_plus_index = val_plus.index
test_index = rvt_index.difference(val_plus_index, sort = False)

# Fail loudly if test shares any label with either part of val. Bare
# expressions in the middle of a cell are evaluated and discarded, so only
# assertions actually guard against an overlap here.
assert test_index.intersection(val_plus_index, sort = False).empty, 'test overlaps val_plus'
assert test_index.intersection(val_index, sort = False).empty, 'test overlaps min_val'

# Shown as the cell output: expected to be an empty index
val_index.intersection(test_index, sort = False)

Int64Index([], dtype='int64')

In [34]:
# Isolate test: every unassigned row that was not drawn into val_plus
test = remain_val_test.loc[test_index]

# Final validation set: the original val rows (min_val) plus the extra draw (val_plus)
val = pd.concat([min_val, val_plus])

# Sanity checks before anything is written: val and test must not share any
# index label, and together they must account for every row of val_test.
assert val.index.intersection(test.index).empty, 'val and test overlap'
assert len(val) + len(test) == len(val_test), 'val + test does not cover all of val_test'

# Now, we can write each of these back to CSV for final modeling
out_dir = '../../data/transformed/final/full_data'
os.makedirs(out_dir, exist_ok=True)  # to_csv does not create a missing directory
train.to_csv(f'{out_dir}/train_final.csv', index=False)
test.to_csv(f'{out_dir}/test_final.csv', index=False)
val.to_csv(f'{out_dir}/val_final.csv', index=False)