## Тема “Создание признакового пространства”

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
from typing import Optional

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

PREP_DATA = '../data/prep_tweets.pkl'

#### Загрузим подготовленный датасет твитов

In [3]:
# Load the preprocessed tweets dataset from PREP_DATA.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
df_prep = pd.read_pickle(PREP_DATA)
df_prep.head()

Unnamed: 0,id,label,tweet,clean_tweet,tweet_token,tweet_token_filtered,tweet_stemmed,tweet_lemmatized
0,1,0.0,@user when a father is dysfunctional and is s...,when father is dysfunctional and is so selfish...,"[when, father, is, dysfunctional, and, is, so,...","[father, dysfunctional, selfish, drags, kids, ...","[father, dysfunct, selfish, drag, kid, dysfunc...","[father, dysfunctional, selfish, drag, kid, dy..."
1,2,0.0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit cannot use cause they d...,"[thanks, for, lyft, credit, can, not, use, cau...","[thanks, lyft, credit, use, cause, offer, whee...","[thank, lyft, credit, use, caus, offer, wheelc...","[thank, lyft, credit, use, cause, offer, wheel..."
2,3,0.0,bihday your majesty,bihday your majesty,"[bihday, your, majesty]","[bihday, majesty]","[bihday, majesti]","[bihday, majesty]"
3,4,0.0,#model i love u take with u all the time in ...,model love you take with you all the time in ur,"[model, love, you, take, with, you, all, the, ...","[model, love, take, time, ur]","[model, love, take, time, ur]","[model, love, take, time, ur]"
4,5,0.0,factsguide: society now #motivation,factsguide society now motivation,"[factsguide, society, now, motivation]","[factsguide, society, motivation]","[factsguid, societi, motiv]","[factsguide, society, motivation]"


### 1. Создайте мешок слов с помощью CountVectorizer

In [4]:
# The vectorizers used below are fed plain strings, so glue every token list
# back into a single space-separated document (one string per tweet).
stemmed_docs = df_prep['tweet_stemmed'].apply(' '.join)

lemmatized_docs = df_prep['tweet_lemmatized'].apply(' '.join)

In [5]:
def _make_bagbag_of_words(vectorizer: object, docs: list) -> object:
    
    bag_of_words = vectorizer.fit_transform(docs)
    
    # Отобразим Bag-of-Words модель как DataFrame
    feature_names = vectorizer.get_feature_names()
    return pd.DataFrame(bag_of_words.toarray(), columns = feature_names).copy()


def make_bagbag_of_words_simple(docs: list) -> object:
    """Build a raw-count bag-of-words table for ``docs``.

    A ``CountVectorizer`` is configured with ``max_df=0.9``,
    ``max_features=1000`` and the built-in English stop-word list, then handed
    to ``_make_bagbag_of_words`` together with ``docs``; that helper's result
    is returned unchanged.
    """
    vectorizer_options = {
        'max_df': 0.9,
        'max_features': 1000,
        'stop_words': 'english',
    }
    return _make_bagbag_of_words(CountVectorizer(**vectorizer_options), docs)


In [6]:
#stemmed
stemmed_bow = make_bagbag_of_words_simple(stemmed_docs)
stemmed_bow.head(3)

Unnamed: 0,abl,absolut,accept,account,act,action,activ,actor,actual,ad,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
stemmed_bow.sum().sum()

216726

In [8]:
#lemmatized
lemmatized_bow = make_bagbag_of_words_simple(lemmatized_docs)
lemmatized_bow.head(3)

Unnamed: 0,able,absolutely,accept,account,act,action,actor,actually,adapt,add,...,yesterday,yo,yoga,york,young,youth,youtube,yr,yrs,yummy
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
lemmatized_bow.sum().sum()

204283

### 2. Создайте мешок слов с помощью TfidfVectorizer

In [10]:
def make_bagbag_of_words_tfidf(docs: list) -> object:
    """Build a TF-IDF weighted bag-of-words table for ``docs``.

    A ``TfidfVectorizer`` is configured with ``max_df=0.9``,
    ``max_features=1000`` and the built-in English stop-word list, then handed
    to ``_make_bagbag_of_words`` together with ``docs``; that helper's result
    is returned unchanged.
    """
    vectorizer_options = {
        'max_df': 0.9,
        'max_features': 1000,
        'stop_words': 'english',
    }
    return _make_bagbag_of_words(TfidfVectorizer(**vectorizer_options), docs)


In [11]:
#stemmed
stemmed_tfidf = make_bagbag_of_words_tfidf(stemmed_docs)
stemmed_tfidf.head(3)

Unnamed: 0,abl,absolut,accept,account,act,action,activ,actor,actual,ad,...,yeah,year,yesterday,yo,yoga,york,young,youtub,yr,yummi
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
stemmed_tfidf.sum().sum()

93434.75975058634

In [13]:
#lemmatized
lemmatized_tfidf = make_bagbag_of_words_tfidf(lemmatized_docs)
lemmatized_tfidf.head(3)

Unnamed: 0,able,absolutely,accept,account,act,action,actor,actually,adapt,add,...,yesterday,yo,yoga,york,young,youth,youtube,yr,yrs,yummy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
lemmatized_tfidf.sum().sum()

90534.85481794423

### 3. Натренируем gensim.models.Word2Vec модель на наших данных

In [15]:
# !pip install gensim

In [16]:
from gensim.models import Word2Vec

In [17]:
tokenized_docs = df_prep['tweet_token']
tokenized_docs.head()

0    [when, father, is, dysfunctional, and, is, so,...
1    [thanks, for, lyft, credit, can, not, use, cau...
2                              [bihday, your, majesty]
3    [model, love, you, take, with, you, all, the, ...
4               [factsguide, society, now, motivation]
Name: tweet_token, dtype: object

In [18]:
%%time
# Build a Word2Vec model over the tokenised tweets with 200-dimensional vectors.
# Per the gensim documentation: sg=1 selects skip-gram, and hs=0 together with
# negative=10 selects negative sampling with 10 noise words.
# NOTE(review): `size` is the gensim 3.x keyword (gensim 4 renamed it to
# `vector_size`) - confirm the installed version before upgrading.
# NOTE(review): per the gensim docs, passing the corpus to the constructor
# already trains the model, so the explicit train() call in the following cell
# adds further epochs on top - confirm this is intended.
# NOTE(review): with more than one worker thread a fixed seed presumably does
# not guarantee bit-identical vectors between runs - verify if that matters.
model_w2v = Word2Vec(tokenized_docs, 
              size=200, 
              window=5, 
              min_count=2, 
              sg = 1, 
              hs = 0, 
              negative = 10, 
              workers= 32, 
              seed = 34)

CPU times: user 1min 10s, sys: 92.6 ms, total: 1min 10s
Wall time: 25.7 s


In [19]:
%%time
# Run 20 training epochs over the tokenised tweets; total_examples is the
# number of entries (tweets) in the Series.
model_w2v.train(tokenized_docs, total_examples=tokenized_docs.size, epochs=20)

CPU times: user 3min 55s, sys: 432 ms, total: 3min 55s
Wall time: 1min


(9140310, 11726520)

### Потестируем нашу модель Word2Vec

In [20]:
# Print the word closest to 'dinner'.
# Queries go through model_w2v.wv: the similarity helpers on the Word2Vec
# object itself are deprecated in gensim 3.x and removed in gensim 4, while the
# .wv accessor works in both.
result = model_w2v.wv.most_similar(positive=['dinner'])
print("{}: {:.4f}".format(*result[0]))
print(model_w2v.wv.similar_by_word("dinner", topn=3))

bihdaydinner: 0.5573
[('bihdaydinner', 0.5572851896286011), ('spaghetti', 0.555306077003479), ('bolognese', 0.550733745098114)]


In [21]:
# Print the word closest to 'trump'.
# Queries go through model_w2v.wv: the similarity helpers on the Word2Vec
# object itself are deprecated in gensim 3.x and removed in gensim 4, while the
# .wv accessor works in both.
result = model_w2v.wv.most_similar(positive=['trump'])
print("{}: {:.4f}".format(*result[0]))
print(model_w2v.wv.similar_by_word("trump", topn=3))

donald: 0.5575
[('donald', 0.5575395226478577), ('suppoer', 0.5285787582397461), ('crony', 0.5265582203865051)]


### Проверим векторное представление

In [22]:
# Look the vector up through .wv: indexing the Word2Vec object directly is
# deprecated in gensim 3.x and removed in gensim 4.
model_w2v.wv['food']

array([-0.09265701, -0.07108294, -0.39404234,  0.3836512 ,  0.28662294,
       -0.05926029,  0.28960693,  0.66081566,  0.05989827,  0.15868618,
       -0.31925347, -0.34518752, -0.04214924, -0.21273781,  0.1769759 ,
       -0.40478966, -0.44924566,  0.34910938, -0.15762265,  0.6244696 ,
        0.32853493, -0.1739553 ,  0.00730438,  0.10333357, -0.08920829,
        0.31456622, -0.43091068, -0.70993495, -0.7917362 ,  0.13339871,
       -0.3066326 , -0.16108833, -0.5332007 , -0.64329714, -0.5787214 ,
        0.285422  ,  0.16875497,  0.01981907, -0.389986  , -0.08095073,
        0.2067487 , -0.15445949,  0.19391172, -0.27844563,  0.23882854,
        0.26701686, -0.84153545,  0.40209776,  0.23610052,  0.08376999,
        0.45181605, -0.41420588,  0.5227954 , -0.19663644, -0.10757492,
        0.04713054,  0.30724207, -0.3553161 ,  0.51247734, -0.3945258 ,
        0.41556054, -0.31315154,  0.62057364,  0.8672947 ,  0.32127017,
       -0.7993134 ,  0.05582834,  0.09274858,  0.22257322,  0.20

In [23]:
# Count the tweets whose token list contains the misspelling 'disapointed'.
df_prep['tweet_token'].apply(lambda x: 'disapointed' in x).sum()

1

In [24]:
# Number of components in a word vector. Accessed through .wv because indexing
# the Word2Vec object directly is deprecated in gensim 3.x and removed in 4.
model_w2v.wv['food'].size

200

### Необходимо создать вектор для каждого твита

In [25]:
# Look up a vector for every word of a tweet and then average them.
def vectorize_tweet(words: list, model=None) -> Optional[np.ndarray]:
    """Return the mean word vector of one tokenised tweet.

    Parameters
    ----------
    words : list
        Tokens of a single tweet.
    model : optional
        Source of word vectors: either an object exposing them through a
        ``wv`` attribute (e.g. a trained Word2Vec model) or any mapping that
        raises ``KeyError`` for unknown words. When omitted, the notebook-level
        ``model_w2v`` is used.

    Returns
    -------
    numpy.ndarray or None
        Element-wise average of the vectors of all known tokens, or ``None``
        when no token has a vector (this includes an empty ``words``).
    """
    if model is None:
        # Resolved at call time so the function keeps working with the
        # notebook's global model, exactly as before.
        model = model_w2v

    # Indexing a Word2Vec object directly is deprecated in gensim 3.x and
    # removed in gensim 4; the .wv accessor works in both. Objects without
    # .wv are treated as the vector mapping itself.
    keyed_vectors = getattr(model, 'wv', model)

    vectors = []
    for word in words:
        try:
            vectors.append(keyed_vectors[word])
        except KeyError:
            # Out-of-vocabulary token: deliberately skipped.
            continue

    if not vectors:
        return None
    return np.average(vectors, axis=0)

In [26]:
%%time
v2w_tweets = tokenized_docs.apply(vectorize_tweet)


CPU times: user 11.4 s, sys: 79.9 ms, total: 11.5 s
Wall time: 11.4 s


In [27]:
v2w_tweets.shape

(49159,)

In [28]:
# Count the tweets for which vectorize_tweet returned None, i.e. none of their
# words has a vector in the model.
print(v2w_tweets.isna().sum())
# ...and drop them.
# NOTE(review): dropna(inplace=True) mutates v2w_tweets, so re-running this
# cell prints 0 instead of the original count.
v2w_tweets.dropna(inplace=True)
v2w_tweets.shape

20


(49139,)

In [29]:
# Expand the per-tweet vectors into a table with one column per vector component.
# NOTE(review): the resulting frame gets a fresh RangeIndex, so after the
# dropna above row i no longer corresponds to row i of df_prep - realign
# (e.g. via v2w_tweets.index) before joining with labels.
wordvec_df = pd.DataFrame.from_records(v2w_tweets)
wordvec_df.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.026038,0.075128,0.107611,0.101912,-0.176435,-0.061256,0.125712,-0.076131,-0.050619,0.19105,...,0.32777,0.055676,-0.259871,0.032576,0.158218,-0.163714,-0.022097,-0.150173,0.391357,0.095538
1,0.106415,0.029637,0.132153,0.044252,-0.134078,-0.09123,0.036257,-0.055944,-0.130872,0.211153,...,0.280244,0.120028,-0.348166,-0.036761,0.189935,-0.132732,0.066449,0.02697,0.317535,-0.074957
2,0.435508,0.037012,0.295512,-0.053668,0.354483,0.245542,0.392723,-0.513703,-0.088398,-0.084223,...,-0.363724,0.20338,-0.679096,0.175086,0.105418,-0.193322,0.073542,0.109717,0.115645,0.183119


In [30]:
wordvec_df.shape

(49139, 200)

In [31]:
wordvec_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49139 entries, 0 to 49138
Columns: 200 entries, 0 to 199
dtypes: float64(200)
memory usage: 75.0 MB
