In [None]:
# Mount Google Drive at /content/drive; the cells below read their pickles
# from, and write their CSVs to, /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Create Training Set

In [None]:
"""
Credit: John Ketterer
"""
import pandas as pd
import pickle

def _load_pickle(path):
    '''Return the object stored in the pickle file at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the object has been read.
    '''
    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # that you created yourself or otherwise trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# chunks were taken from regex of POS tags located on google colab
chunks1 = _load_pickle('/content/drive/MyDrive/chunks_1.pickle')
chunks2 = _load_pickle('/content/drive/MyDrive/chunks_2.pickle')
chunks3 = _load_pickle('/content/drive/MyDrive/chunks_3.pickle')
chunks4 = _load_pickle('/content/drive/MyDrive/chunks_4.pickle')

# Sample size is 10% and will be labeled accordingly
# perhaps a sample of a sample can be used, depending on the NN model
print('Length:', len(chunks1))
print('Length:', len(chunks2))
print('Length:', len(chunks3))
print('Length:', len(chunks4))

def training_set(chunks):
    '''Build one space-joined phrase per chunk.

    Parameters
    ----------
    chunks : sequence of sequences
        Each inner sequence is one chunk; the first element of every item
        (``item[0]``) is used as the word. Presumably the items are
        ``(word, POS-tag)`` tuples -- confirm against the pickled data.

    Returns
    -------
    pandas.Series
        Series named ``'phrase'`` holding one string per chunk, indexed like
        ``pd.DataFrame(chunks)``.
    '''
    # Building a DataFrame pads shorter chunks with missing values so every
    # row has the same width.
    df = pd.DataFrame(chunks)

    phrases = []
    for row in df.values:
        # Skip the padding cells (None/NaN) instead of filling them with a
        # sentinel letter and deleting that letter afterwards: deleting 'X'
        # from the finished phrase also removed it from real words such as
        # 'UX' or 'XML'.
        words = [
            cell[0]
            for cell in row
            if not (cell is None or (isinstance(cell, float) and cell != cell))
        ]
        # A single space separates the words; padding tokens are left to the
        # tokenizer/encoder step.
        phrases.append(' '.join(words).strip())

    return pd.Series(phrases, index=df.index, name='phrase')

def strip_commas(df):
    '''Split comma-separated phrases into a new series of individual n-grams.

    Parameters
    ----------
    df : iterable of str
        Phrases to split (despite the name, the caller passes a Series).

    Returns
    -------
    pandas.Series
        One entry per non-empty fragment, in input order, with surrounding
        whitespace removed.
    '''
    grams = []
    for sen in df:
        for piece in sen.split(','):
            # Phrases are space-joined, so 'a , b' splits into 'a ' and ' b';
            # trim the fragment so it matches the same n-gram seen elsewhere.
            gram = piece.strip()
            # Leading, trailing or doubled commas yield empty fragments,
            # which are not n-grams.
            if gram:
                grams.append(gram)
    return pd.Series(grams)

# chunks4 phrases are additionally split on commas, so each comma-separated
# piece becomes its own training example.
c = training_set(chunks4)
separated_chunks4 = strip_commas(c)

# one training corpus with 10% of each POS regex identification
training = pd.concat([training_set(chunks1),
                      training_set(chunks2),
                      training_set(chunks3),
                      separated_chunks4],
                     ignore_index=True)

# The index is written along with the phrases, which is why the reloaded file
# shows an extra 'Unnamed: 0' column.
training.to_csv('/content/drive/MyDrive/training_set.csv')
# Report the file that was actually written (the old message named a
# different CSV).
print("'training_set.csv' has been created")

Length: 1418223
Length: 1453992
Length: 707018
Length: 60326
'np_train_skills_no_commas.csv' has been created


## Drop Duplicates

In [None]:
# Reload the corpus saved by the previous section; the index that was written
# with it comes back as an 'Unnamed: 0' column (see data.info() below).
data = pd.read_csv('/content/drive/MyDrive/training_set.csv')

In [None]:
# Peek at the first rows to confirm the file loaded as expected.
data.head()

Unnamed: 0.1,Unnamed: 0,0
0,0,background
1,1,a career
2,2,data science
3,3,a positive impact
4,4,others


In [None]:
# Show column names, dtypes, row count and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3760212 entries, 0 to 3760211
Data columns (total 2 columns):
 #   Column      Dtype 
---  ------      ----- 
 0   Unnamed: 0  int64 
 1   0           object
dtypes: int64(1), object(1)
memory usage: 57.4+ MB


In [None]:
# Count rows whose phrase (column '0') repeats an earlier row.
data['0'].duplicated().sum()

3289237

In [None]:
# Give the phrase column a meaningful name and discard the index column that
# the CSV round-trip introduced.
data.rename(columns={'0': 'skills'}, inplace=True)
# errors='ignore' keeps this cell re-runnable: a second run would otherwise
# raise KeyError because 'Unnamed: 0' has already been dropped.
data.drop(columns='Unnamed: 0', errors='ignore', inplace=True)

In [None]:
# Confirm that only the renamed 'skills' column remains.
data.head()

Unnamed: 0,skills
0,background
1,a career
2,data science
3,a positive impact
4,others


In [None]:
# Remove repeated phrases in place, keeping the first occurrence; surviving
# rows keep their original index labels, so the index has gaps afterwards.
data.drop_duplicates(inplace=True)

In [None]:
# Display the deduplicated frame (pandas truncates the middle rows).
data

Unnamed: 0,skills
0,background
1,a career
2,data science
3,a positive impact
4,others
...,...
3760203,pharmacy channels
3760204,grocery retailers
3760205,variety discount stores
3760207,speciality beauty retailers


In [None]:
# Persist the deduplicated phrases.
# NOTE(review): this overwrites the same file that this section loaded, and it
# writes the index again. Re-running only this section would then load a file
# whose phrase column is 'skills' rather than '0' -- confirm that overwriting
# the input (instead of saving under a new name) is intended.
data.to_csv('/content/drive/MyDrive/training_set.csv')