In [1]:
import pandas as pd
import numpy as np
import gensim
import os
import collections
import smart_open
import random

In [2]:
def read_corpus(fname, tokens_only=False):
    with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            if tokens_only:
                yield gensim.utils.simple_preprocess(line)
            else:
                # For training data, add tags
                if i==0:
                    pass
                else:
                    yield gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(line)[1:], 
                                                            gensim.utils.simple_preprocess(line)[0]) 

In [6]:
all_data = list(read_corpus("C:/Users/82104/Desktop/객체 팀플/dataset/mbti_1.csv"))
total_num_obs = len(all_data)

In [7]:
from math import floor, ceil
train_corpus = all_data[0:floor(3*total_num_obs/4)]
test_corpus = all_data[floor(3*total_num_obs/4):]

In [9]:
model = gensim.models.doc2vec.Doc2Vec(size=100, min_count=2, epochs=55)

In [10]:
model.build_vocab(train_corpus)

In [12]:
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

Wall time: 8min 3s


In [13]:
model.infer_vector(['I', 'feel', 'sad'])

array([-1.23292856e-01, -1.33478984e-01, -2.82753050e-01, -3.36730331e-01,
       -3.03849041e-01,  3.03270549e-01,  2.20063701e-01, -7.24150166e-02,
       -1.67995900e-01,  2.92025715e-01,  2.96790242e-01, -1.60492107e-01,
        4.36984934e-02,  3.05536151e-01,  2.10708156e-01,  1.51197715e-02,
        9.90348123e-03,  1.91438198e-01,  2.35202149e-01, -1.36056811e-01,
        1.95286632e-01, -1.72273919e-01,  3.63269821e-02,  1.67890191e-02,
        2.30257567e-02, -1.22770682e-01,  9.29395258e-02, -1.29292682e-01,
        9.13629979e-02,  5.38202412e-02, -1.96423605e-01,  5.09742856e-01,
        1.62476636e-02, -1.95929945e-01, -3.92628998e-01,  5.09191871e-01,
        4.80559707e-01,  2.83546537e-01, -4.19738889e-01, -9.50777158e-02,
        3.27560514e-01, -2.16804355e-01, -7.84175247e-02, -9.63010043e-02,
       -2.66758859e-01, -2.77889073e-01, -3.91735822e-01, -1.85599282e-01,
        4.70405109e-02,  2.15466484e-01,  1.54191211e-01, -1.60498515e-01,
        1.29620120e-01, -

In [14]:
train_targets, train_regressors = zip(*[(doc.words, doc.tags[0]) for doc in train_corpus])
test_targets, test_regressors = zip(*[(doc.words, doc.tags[0]) for doc in test_corpus])

In [15]:
X = []
for i in range(len(train_targets)):
    X.append(model.infer_vector(train_targets[i]))
train_x = np.asarray(X)

In [16]:
train_x.shape

(6506, 100)

In [17]:
Y = np.asarray(train_regressors)
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(Y)
train_y = le.transform(Y)
np.mean(train_y)

0.7682139563479865

In [18]:
unique, counts = np.unique(Y, return_counts=True)

print(np.asarray((unique, counts)).T)

[['e' '1508']
 ['i' '4998']]


In [19]:
from sklearn import linear_model
logreg = linear_model.LogisticRegression()
logreg.fit(train_x, train_y)

LogisticRegression()

In [20]:
test_list = []
for i in range(len(test_targets)):
    test_list.append(model.infer_vector(test_targets[i]))
test_x = np.asarray(test_list)

In [21]:
test_Y = np.asarray(test_regressors)
test_y = le.transform(test_Y)

In [22]:
preds = logreg.predict(test_x)

In [23]:
np.mean(test_y)

0.7736284001844168

In [24]:
sum(preds == test_y) / len(test_y)

0.8206546795758414