<a href="https://colab.research.google.com/github/siting1206/NLP_HW1/blob/main/Word2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [131]:
%matplotlib inline
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from collections import namedtuple
from tensorflow.keras.preprocessing.sequence import pad_sequences

import regex as re
import os, string, sys

from gensim.models.word2vec import Word2Vec

In [132]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Data Preprocessing

In [133]:
class RegexFeatures(object):
    PATTERNS = {
        "repeatedPunctuation": re.compile(r'^[\.\,!\?"\':;_\-]{2,}$'),
        "isNumber": re.compile(r'^((\p{N}{,2}([,]?\p{N}{3})+)(\.\p{N}+)?)$'),
        "isURL": re.compile(r'^http[s]?://'),
        "isMention": re.compile(r'^(RT)?@[\p{Alnum}_]+$'),
        "isHashtag": re.compile(r'^#\p{Alnum}+$'),
        "isMoney": re.compile(r'^\$((\p{N}{,2}([,]?\p{N}{3})+)(\.\p{N}+)?)$'),
    }
    def __init__(self):
        print("Initialized RegexFeature")
    def process(word):
        features = dict()
        for k, p in RegexFeatures.PATTERNS.iteritems():
            if p.match(word):
                features[k] = True
        return features


Tag = namedtuple("Tag", ["token", "tag"])

def load_sequences(filename, sep="\t", notypes=False, test_data=False):
    sequences = []
    with open(filename) as fp:
        seq = []
        for line in fp:
            line = line.strip()
            if line:
                line = line.split(sep)
                seq.append(Tag(*line))
            else:
                sequences.append(seq)
                seq = []
        if seq:
            sequences.append(seq)
    return sequences

def load_test_sequences(filename, sep="\t"):
    sequences = []
    with open(filename) as fp:
        seq = []
        for line in fp:
          line = line.strip()
          if line != ".":
            seq.append(line)
          else:
            sequences.append(seq)
            seq = []
        if seq:
          sequences.append(seq)
    return sequences

### Load Dataset

In [134]:
train_sequences = load_sequences("drive/MyDrive/NLP_assignment1/data/train.txt", sep="\t", notypes=True)
dev_sequences = load_sequences("drive/MyDrive/NLP_assignment1/data/dev.txt", sep="\t", notypes=False)

test_sequences = load_test_sequences("drive/MyDrive/NLP_assignment1/data/test-submit.txt")

In [135]:
train_sentences = [[t[0] for t in seq] for seq in (train_sequences)]
train_tags = [[t[1] for t in seq] for seq in (train_sequences)]

valid_sentences = [[t[0] for t in seq] for seq in (dev_sequences)]
valid_tags = [[t[1] for t in seq] for seq in (dev_sequences)]

# print(train_sentences)

In [136]:
other_entities = {
    "isHashtag": [],
    "isMention": [],
    "isURL": [],
    "isMoney": [],
    "isNumber": [],
    "repeatedPunctuation": []
}
for seq in train_sentences:
    for t in seq:
        for k in other_entities.keys():
            if RegexFeatures.PATTERNS[k].match(t):
                other_entities[k].append(t)
for k, v in other_entities.items():
    print(k, len(v))

isHashtag 440
isMention 1292
isURL 448
isMoney 5
isNumber 120
repeatedPunctuation 1059


In [137]:
ENTITY_MAPPINGS={k: "__%s__" % k for k in other_entities.keys()}
ENTITY_MAPPINGS

{'isHashtag': '__isHashtag__',
 'isMention': '__isMention__',
 'isURL': '__isURL__',
 'isMoney': '__isMoney__',
 'isNumber': '__isNumber__',
 'repeatedPunctuation': '__repeatedPunctuation__'}

In [138]:
def preprocess_token(x, to_lower=False):
    for k in ENTITY_MAPPINGS.keys():
        if RegexFeatures.PATTERNS[k].match(x):
            return ENTITY_MAPPINGS[k]
    if to_lower:
        x = x.lower()
    return x

#### For the input of LSTM model all the sentences must be padded to same length,for that we must know the maximum length of the sequence in the list of sentences.

In [139]:
train_pre_seq = [[preprocess_token(t[0], to_lower=False) for t in seq] for seq in train_sequences]
test_pre_seq = [[preprocess_token(t, to_lower=False) for t in seq] for seq in test_sequences]
# print(train_pre_seq)

In [168]:
word2vec_sentences = [preprocess_token(t[0], to_lower=False) for seq in train_sequences for t in seq]
tag2vec_sentences = [t[1] for seq in train_sequences for t in seq]
words=list(set(word2vec_sentences))
# print(word2vec_sentences)
tags=list(set(tag2vec_sentences))
# print(tags)
w_index={t:j for j,t in enumerate(words)}
t_index={t:j for j,t in enumerate(tags)}
n_words = len(w_index)
n_tags = len(t_index)
y_train = [[t_index[w[1]] for w in s] for s in train_sequences]
print(len(y_train))

2394


In [141]:
maxl = max([len(s) for s in word2vec_sentences])

print ('Maximum sequence length in the list of sentences:', maxl)

Maximum sequence length in the list of sentences: 36


In [142]:
word2vec_sentences[0]

'__isMention__'

In [143]:
preprocess_token("@guild_gamer")

'__isMention__'

### Word2Vec Model

In [144]:
word2vec = Word2Vec(train_pre_seq, size=50, window=10, sg=1, hs=0, min_count=1, negative=5, workers=1, iter=5)
print(word2vec)



Word2Vec(vocab=8534, size=50, alpha=0.025)


In [160]:
def word_to_vec(word):
  try:
    wordvec = word2vec[word]
  except KeyError as e:
    print(word, "不存在")
    wordvec = np.array([0], * 100)
  return wordvec

X_train = [[[word_to_vec(s[0])] for w in s ]for s in train_pre_seq]
# X_test = [word_to_vec(s[0]) for s in test_pre_seq]
print(X_train[0])
print(y_train[0])
print(len(X_train), len(y_train))

[[array([ 0.17256497, -0.01356004, -0.20986395, -0.3915097 ,  0.10973134,
        0.07411791,  0.35816133,  0.1925026 , -0.17100565,  0.16221847,
       -0.39599484,  0.04705683, -0.5039497 , -0.45943686, -0.38074505,
        0.6735749 ,  0.39461654,  0.6154905 ,  0.04485179,  0.00664062,
        0.45993373, -0.09809875,  0.03269076, -0.06051165, -0.3336815 ,
        0.2445974 , -0.117051  ,  0.5611056 ,  0.30321714, -0.26661155,
        0.25937155, -0.37882864,  0.03031269, -0.09324232, -0.5739206 ,
       -0.38635823,  0.17158705,  0.35059738,  0.25462598,  0.21613239,
        0.44169885,  0.28321284, -0.48630974, -0.11703314, -0.33003965,
       -0.36479118, -0.35736316, -0.29280388,  0.4521079 ,  0.2238732 ],
      dtype=float32)], [array([ 0.17256497, -0.01356004, -0.20986395, -0.3915097 ,  0.10973134,
        0.07411791,  0.35816133,  0.1925026 , -0.17100565,  0.16221847,
       -0.39599484,  0.04705683, -0.5039497 , -0.45943686, -0.38074505,
        0.6735749 ,  0.39461654,  0.6

  This is separate from the ipykernel package so we can avoid doing imports until


In [146]:
print(f"總共收錄了 {len(word2vec.wv.vocab)} 個詞彙")
print("印出 20 個收錄詞彙:")
print(list(word2vec.wv.vocab.keys())[:20])

總共收錄了 8534 個詞彙
印出 20 個收錄詞彙:
['__isMention__', 'they', 'will', 'be', 'all', 'done', 'by', 'Sunday', 'trust', 'me', '*wink*', 'Made', 'it', 'back', 'home', 'to', 'GA', '.', 'It', 'sucks']


In [147]:
word2vec.wv.most_similar("Sunday")

[('please', 0.9987372756004333),
 ('more', 0.9983861446380615),
 ('new', 0.99811851978302),
 ('later', 0.998092532157898),
 ('weekend', 0.9980342984199524),
 ('done', 0.99791020154953),
 ('our', 0.9977797865867615),
 ('awesome', 0.9976701736450195),
 ('meet', 0.9975164532661438),
 ('start', 0.9973775148391724)]

In [148]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [167]:
X_train = pad_sequences(maxlen=maxl, padding='post', sequences=X_train, dtype='float32')
y_train = pad_sequences(maxlen=maxl, padding='post', sequences=y_train)
print("X_train:\n\n", X_train[0])
print("\n\n\n")
print("y_train:\n\n", y_train[0])

X_train:

 [[[ 0.17256497 -0.01356004 -0.20986395 ... -0.29280388  0.4521079
    0.2238732 ]]

 [[ 0.17256497 -0.01356004 -0.20986395 ... -0.29280388  0.4521079
    0.2238732 ]]

 [[ 0.17256497 -0.01356004 -0.20986395 ... -0.29280388  0.4521079
    0.2238732 ]]

 ...

 [[ 0.          0.          0.         ...  0.          0.
    0.        ]]

 [[ 0.          0.          0.         ...  0.          0.
    0.        ]]

 [[ 0.          0.          0.         ...  0.          0.
    0.        ]]]




y_train:

 [13 13 13 13 13 13 13 13 13 13 13 13  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0]
