In [1]:
import pandas as pd
import numpy as np
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from bert_embedding import BertEmbedding
from nltk.corpus import wordnet
import sqlite3
from sqlite3 import Error
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.feature_selection import SelectKBest 
from sklearn.feature_selection import chi2 
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import pickle

Using TensorFlow backend.


In [2]:
# reading the .csv file
#df = pd.read_csv("mbti.csv")
#posts = df.posts

In [3]:
# Split words
def split_words(posts):
    """Tokenize every post into a list of word tokens.

    Args:
        posts: Iterable of strings, one per post.

    Returns:
        list: One entry per input post, in input order. Each entry is
        whatever ``word_tokenize`` returns for that post (including an empty
        result), so the output always has as many entries as ``posts``.
    """
    # The earlier `token != ''` guard compared a token list with a string,
    # which is always true; every post was (and still is) kept.
    return [word_tokenize(row) for row in posts]

In [4]:
def get_word_num(posts):
    """Return the number of word tokens in a text.

    Args:
        posts: Text to count words in, passed unchanged to ``word_tokenize``.

    Returns:
        int: Length of the token sequence produced by ``word_tokenize``.
    """
    # `tokenize` in this notebook requires a tokenizer and a sequence length
    # and returns padded sequences, so it cannot serve as a word counter;
    # count the NLTK word tokens instead.
    return len(word_tokenize(posts))

In [None]:
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank-style POS tag into the matching WordNet POS constant.

    Tags starting with ``J``, ``V``, ``N`` or ``R`` map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively; any
    other tag yields ``None``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    return None

In [5]:
def lemmatize(tokenized_posts):
    """Lemmatize tokenized posts and join each one back into a string.

    Args:
        tokenized_posts: Iterable of token sequences, one per post.

    Returns:
        list[str]: One space-joined string per post. Tokens whose POS tag maps
        to a WordNet POS (via ``get_wordnet_pos``) are lemmatized with that
        POS; all other tokens are kept unchanged.
    """
    wnl = WordNetLemmatizer()

    def _lemma(word, tag):
        # Words without a WordNet POS equivalent are passed through untouched.
        pos = get_wordnet_pos(tag)
        return word if pos is None else wnl.lemmatize(word, pos=pos)

    return [
        " ".join(_lemma(word, tag) for word, tag in pos_tag(tokens))
        for tokens in tokenized_posts
    ]

In [6]:
def create_tokenizer(data):
    """Create a Keras ``Tokenizer`` sized to the vocabulary and pickle it.

    Args:
        data: Mapping/DataFrame whose ``"tokens"`` entry is an iterable of
            token lists.

    Returns:
        The newly created ``Tokenizer`` (``num_words`` = number of distinct
        tokens, ``lower=True``, ``char_level=False``).

    Side effects:
        Writes the tokenizer to ``tokenizer.pickle`` in the current working
        directory.
    """
    distinct_words = {word for tokens in data["tokens"] for word in tokens}
    tokenizer = Tokenizer(num_words=len(distinct_words), lower=True, char_level=False)

    # NOTE(review): the tokenizer is pickled here before any fit_on_texts call
    # (fitting happens in `tokenize`), so the saved file holds an unfitted
    # tokenizer -- confirm this is intended.
    with open('tokenizer.pickle', 'wb') as out_file:
        pickle.dump(tokenizer, out_file, protocol=pickle.HIGHEST_PROTOCOL)

    return tokenizer

In [None]:
def tokenize(data, tokenizer, sequence_length):
    """Fit the tokenizer on the lemmatized posts and return padded sequences.

    Args:
        data: Mapping/DataFrame with a ``"lemmatized_posts"`` entry of texts.
        tokenizer: Object providing ``fit_on_texts`` and ``texts_to_sequences``.
        sequence_length: Passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` applied to the integer sequences.
    """
    texts = data["lemmatized_posts"]
    # NOTE(review): the tokenizer is (re)fitted on every call, so calling this
    # on a held-out split also fits the vocabulary on that split -- confirm
    # this is intended.
    tokenizer.fit_on_texts(texts)
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=sequence_length)

In [7]:
def get_vocab_size(tokenizer):
    """Return the number of entries in ``tokenizer.word_index`` plus one.

    The extra slot presumably accounts for index 0, which is not assigned to
    any word -- verify against the tokenizer in use.
    """
    return 1 + len(tokenizer.word_index)

In [8]:
#glove_file = "glove.6B.50d.txt"
#vocab_size = get_vocab_size()
#dim = 50

def get_embedding_matrix(dim, vocab_size, glove_file, word_index):
    """Build an embedding matrix from a whitespace-separated vector file.

    Args:
        dim: Number of leading vector components to keep per word.
        vocab_size: Number of rows of the returned matrix.
        glove_file: Path to a UTF-8 text file with one ``word v1 v2 ...``
            entry per line.
        word_index: Mapping from word to its row index in the matrix.

    Returns:
        numpy.ndarray: Array of shape ``(vocab_size, dim)``. Rows of words
        that do not occur in the file stay all-zero.

    Raises:
        ValueError: If a word present in ``word_index`` has fewer than ``dim``
            components in the file.
    """
    embedding_matrix = np.zeros((vocab_size, dim))
    with open(glove_file, encoding="utf8") as file:
        for line in file:
            parts = line.split()
            if not parts:
                # Blank lines (e.g. a trailing newline) carry no entry.
                continue
            word, vector = parts[0], parts[1:]
            if word not in word_index:
                continue
            idx = word_index[word]
            if not 0 <= idx < vocab_size:
                # word_index may hold more words than the matrix has rows
                # (e.g. when the vocabulary is capped); skip those words.
                continue
            if len(vector) < dim:
                raise ValueError(
                    "vector for %r has %d components, expected at least %d"
                    % (word, len(vector), dim))
            # Slice before converting so only the needed components are parsed.
            embedding_matrix[idx] = np.array(vector[:dim], dtype=np.float32)

    return embedding_matrix

In [None]:
def split_target_variable(data, character):
    """Add a ``character`` column holding one position of each ``type`` value.

    Args:
        data: DataFrame with a ``type`` column of indexable values.
        character: Index applied to every ``type`` value.

    Returns:
        The same ``data`` object, modified in place with the new column.
    """
    data['character'] = list(map(lambda label: label[character], data.type))
    return data

In [None]:
def binarize_target_variable(data):
    """Add a ``binarized_target`` column with category codes of ``character``.

    Args:
        data: DataFrame with a ``character`` column.

    Returns:
        The same ``data`` object, modified in place with the new column.
    """
    as_category = data['character'].astype('category')
    data['binarized_target'] = as_category.cat.codes
    return data

In [None]:
def undersample(data, random_state=None):
    """Balance a binary data set by randomly dropping majority-class rows.

    Args:
        data: DataFrame with a ``binarized_target`` column containing 0/1.
        random_state: Optional seed for a reproducible draw. When ``None``
            (the default) NumPy's global random state is used.

    Returns:
        DataFrame with every minority-class row followed by an equally sized
        random sample (without replacement) of majority-class rows, selected
        from ``data`` by index label.
    """
    labels = data['binarized_target']
    # Decide the majority class from row counts of the target itself, so the
    # result does not depend on non-null counts of an unrelated column.
    major = 1 if int((labels == 1).sum()) > int((labels == 0).sum()) else 0
    minor = 1 - major

    minor_indices = data.index[labels == minor]
    major_indices = data.index[labels == major]
    no_minor = len(minor_indices)

    if random_state is None:
        choose = np.random.choice
    else:
        choose = np.random.RandomState(random_state).choice
    random_indices = choose(major_indices, no_minor, replace=False)

    under_sample_indices = np.concatenate([minor_indices, random_indices])
    return data.loc[under_sample_indices]

In [None]:
def tabular_features(tabular_data, tabular_train, tabular_test,
                     n_components=500, max_features=100000):
    """Build SVD-reduced TF-IDF features for 1-, 2- and 3-grams.

    For each n-gram size a ``TfidfVectorizer`` and a ``TruncatedSVD`` are
    fitted on the training posts only and then applied to the test posts.

    Args:
        tabular_data: Unused; kept so existing callers keep working. The
            vectorizer was always refitted on the training split, so fitting
            it on the full data first had no effect on the result.
        tabular_train: DataFrame with a ``posts`` column (training split).
        tabular_test: DataFrame with a ``posts`` column (test split).
        n_components: Number of SVD components per n-gram size.
        max_features: Vocabulary cap per TF-IDF vectorizer.

    Returns:
        tuple: ``(train_features, test_features)`` DataFrames with a fresh
        positional index and columns named ``"<n>_<component>"`` (both
        1-based), the n-gram sizes concatenated side by side.
    """
    train_parts = []
    test_parts = []
    for n in range(1, 4):
        tfidf = TfidfVectorizer(stop_words='english', ngram_range=(n, n),
                                decode_error='replace', max_features=max_features)
        train_tfidf = tfidf.fit_transform(tabular_train['posts'].values.astype('U'))
        test_tfidf = tfidf.transform(tabular_test['posts'].values.astype('U'))

        tsvd = TruncatedSVD(n_components=n_components, algorithm='arpack', random_state=500)
        train_svd = tsvd.fit_transform(train_tfidf)
        test_svd = tsvd.transform(test_tfidf)

        columns = ['{}_{}'.format(n, b) for b in range(1, train_svd.shape[1] + 1)]
        train_parts.append(pd.DataFrame(train_svd, columns=columns))
        test_parts.append(pd.DataFrame(test_svd, columns=columns))

    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(train_parts, axis=1), pd.concat(test_parts, axis=1)

In [1]:
def tabular_scaler(tabular_train_data, tabular_test_data):
    """Scale train and test features with a ``MinMaxScaler`` fitted on train.

    Args:
        tabular_train_data: DataFrame used to fit the scaler.
        tabular_test_data: DataFrame transformed with the fitted scaler.

    Returns:
        tuple: ``(scaled_train, scaled_test)`` DataFrames that keep the column
        labels and index of their respective inputs.
    """
    scaler = MinMaxScaler()

    def _as_frame(values, like):
        # Re-wrap the scaler's output with the labels of the source frame.
        return pd.DataFrame(values, columns=like.columns, index=like.index)

    scaled_train = _as_frame(scaler.fit_transform(tabular_train_data), tabular_train_data)
    scaled_test = _as_frame(scaler.transform(tabular_test_data), tabular_test_data)
    return scaled_train, scaled_test

In [None]:
def chi2_features(tabular_train_data, tabular_test_data, tabular_y_train):
    """Keep the 100 best features according to ``SelectKBest(chi2)``.

    The selector is fitted on the training features and labels, then applied
    to the test features.

    Args:
        tabular_train_data: Training feature matrix.
        tabular_test_data: Test feature matrix.
        tabular_y_train: Training labels used for scoring.

    Returns:
        tuple: ``(train_selected, test_selected)`` DataFrames built from the
        selector's output, with default integer column labels and index.
    """
    selector = SelectKBest(chi2, k=100)
    train_selected = selector.fit_transform(tabular_train_data, tabular_y_train)
    test_selected = selector.transform(tabular_test_data)
    return pd.DataFrame(train_selected), pd.DataFrame(test_selected)

In [9]:
# Check the fraction of the vocabulary covered by the pretrained embeddings
#nonzero_elements = np.count_nonzero(np.count_nonzero(get_embedding_matrix(50, get_vocab_size(posts),"glove.6B.50d.txt" ), axis=1))
#nonzero_elements / get_vocab_size()