In [1]:
import os
import sys
# Walk up from the current working directory until its path ends with 'ml'
# (the project root this notebook expects), then put that directory first on
# sys.path so project-local packages such as `helpers` can be imported.
while not os.getcwd().endswith('ml'):
    current_dir = os.getcwd()
    # At the filesystem root the parent equals the directory itself, so
    # os.chdir('..') would never make progress; fail loudly instead of looping
    # forever.
    if os.path.dirname(current_dir) == current_dir:
        raise RuntimeError(
            "Could not locate a parent directory whose path ends with 'ml'")
    os.chdir('..')
sys.path.insert(0, os.getcwd())

In [2]:
import pandas as pd
import numpy as np
import copy
import matplotlib.pyplot as plt
import pickle
from collections import defaultdict

from helpers.word2vec.converter import *

In [3]:
# Number of word slots per sentence in the padded array built by preprocessing().
MAX_SENTENCE_LEN = 50
# Length of the zero vector used for padding and missing words in preprocessing();
# presumably the word2vec embedding size -- TODO confirm against the Converter model.
WORD_REPRESENTATION_LEN = 300
# Number of rows converted and pickled together as one batch.
DUMP_BATCH_SIZE = 5000

#### Считывание данных

In [4]:
# Read the train and test CSV files into DataFrames (paths are relative to the
# current working directory).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "kaggle_problems/tweet_sentiment_extraction/train.csv",
        "kaggle_problems/tweet_sentiment_extraction/test.csv",
    )
)

#### Описание данных

In [5]:
train.sample()

Unnamed: 0,textID,text,selected_text,sentiment
2851,eaca6e62af,yep! jimmy buffett FTW!!!... jimmy needs to ...,yep! jimmy buffett FTW!!!... jimmy needs to tw...,neutral


In [6]:
test.sample()

Unnamed: 0,textID,text,sentiment
1659,b8c518e02d,Good prices for bulk SMS and Premium SMS too ...,positive


In [7]:
print(len(test), len(train))

3534 27481


### Word2Vec conversion + save to disk

In [8]:
# Keep only the rows that actually have a tweet text.
train = train[train['text'].notnull()]
test = test[test['text'].notnull()]

In [9]:
# Shared converter instance used by preprocessing(); configured to use the
# tweet tokenizer. Its unknown_words / known_words statistics are read later
# in this notebook.
sentence_converter = Converter(tokenizer_type=TokenizerType.tweet_tokenizer)

In [10]:
def preprocessing(data):
    """Convert sentences into a padded word-vector array and collect word statistics.

    Resets the statistics of the module-level ``sentence_converter``, runs
    ``convert_sentences`` on ``data`` and prints the totals of unknown and
    known words together with the share of unknown words.

    Parameters
    ----------
    data
        Sentences handed straight to ``sentence_converter.convert_sentences``
        (the notebook passes a slice of a DataFrame text column).

    Returns
    -------
    tuple
        ``(vectors, cleared_sentences, unknown_words, known_words)``.
        ``vectors`` is a ``float16`` array of shape
        ``(n_sentences, MAX_SENTENCE_LEN, WORD_REPRESENTATION_LEN)``: sentences
        longer than ``MAX_SENTENCE_LEN`` are truncated, shorter ones and words
        without a vector (``None``) are zero-filled. For empty input the array
        keeps this 3-D shape with ``n_sentences == 0``.
        ``cleared_sentences`` is the second value returned by
        ``convert_sentences``. ``unknown_words`` / ``known_words`` are the
        converter's own statistic objects (not copies).
    """
    sentence_converter.clear_statistic()
    vectors, cleared_sentences = sentence_converter.convert_sentences(data)

    unknown_words = np.sum([i for i in sentence_converter.unknown_words.values() if i is not None])
    known_words = np.sum([i for i in sentence_converter.known_words.values() if i is not None])

    # Guard the ratio: with no counted words the division would be 0/0.
    total_words = unknown_words + known_words
    unknown_share = unknown_words / total_words if total_words else 0.0

    print("unknown_words: {}, known_words: {}, persent unknown words: {}".format( 
          unknown_words, known_words, unknown_share))

    # Fill a preallocated float16 array instead of first materialising a
    # nested Python list of every float, which is far slower and needs many
    # times more memory than the final array.
    sentences = list(vectors)
    padded = np.zeros(
        (len(sentences), MAX_SENTENCE_LEN, WORD_REPRESENTATION_LEN),
        dtype=np.float16)
    for sentence_nmb, sentence in enumerate(sentences):
        for word_nmb in range(min(len(sentence), MAX_SENTENCE_LEN)):
            word_vector = sentence[word_nmb]
            # Words without a vector keep their zero row.
            if word_vector is not None:
                padded[sentence_nmb, word_nmb] = word_vector

    return padded, cleared_sentences, sentence_converter.unknown_words, sentence_converter.known_words


In [11]:
# Convert the test tweets batch by batch (DUMP_BATCH_SIZE rows at a time) and
# pickle each batch's vectors, cleared sentences and word statistics.
# Every file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump is written.
for i in range(0, (test.shape[0] + DUMP_BATCH_SIZE - 1) // DUMP_BATCH_SIZE):
    vectors, test_cleared_sentences, test_unknown_words, test_known_words = \
        preprocessing(test['text'].iloc[i * DUMP_BATCH_SIZE: (i + 1) * DUMP_BATCH_SIZE])

    with open('kaggle_problems/tweet_sentiment_extraction/pickle_dump/test_{}.pkl'.format(i), 'wb') as dump_file:
        pickle.dump(vectors, dump_file)
    with open('kaggle_problems/tweet_sentiment_extraction/pickle_dump/test_cleared_sentences_{}.pkl'.format(i), 'wb') as dump_file:
        pickle.dump(test_cleared_sentences, dump_file)

    with open('kaggle_problems/tweet_sentiment_extraction/pickle_dump/test_unknown_words_{}.pkl'.format(i), 'wb') as dump_file:
        pickle.dump(test_unknown_words, dump_file)
    with open('kaggle_problems/tweet_sentiment_extraction/pickle_dump/test_known_words_{}.pkl'.format(i), 'wb') as dump_file:
        pickle.dump(test_known_words, dump_file)



unknown_words: 12, known_words: 542, persent unknown words: 0.021660649819494584
unknown_words: 12, known_words: 568, persent unknown words: 0.020689655172413793
unknown_words: 9, known_words: 613, persent unknown words: 0.014469453376205787
unknown_words: 11, known_words: 646, persent unknown words: 0.0167427701674277
unknown_words: 11, known_words: 630, persent unknown words: 0.0171606864274571
unknown_words: 18, known_words: 648, persent unknown words: 0.02702702702702703
unknown_words: 15, known_words: 541, persent unknown words: 0.02697841726618705
unknown_words: 9, known_words: 550, persent unknown words: 0.016100178890876567
unknown_words: 9, known_words: 670, persent unknown words: 0.013254786450662739
unknown_words: 7, known_words: 644, persent unknown words: 0.010752688172043012
unknown_words: 14, known_words: 663, persent unknown words: 0.0206794682422452
unknown_words: 11, known_words: 596, persent unknown words: 0.018121911037891267
unknown_words: 11, known_words: 683, per

KeyboardInterrupt: 

In [None]:
# Convert the train tweets ('text' column) batch by batch (DUMP_BATCH_SIZE rows
# at a time) and pickle each batch's vectors, cleared sentences and word
# statistics.
# Every file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump is written.
for i in range(0, (train.shape[0] + DUMP_BATCH_SIZE - 1) // DUMP_BATCH_SIZE):
    vectors, train_cleared_sentences, train_unknown_words, train_known_words = \
        preprocessing(train['text'].iloc[i * DUMP_BATCH_SIZE: (i + 1) * DUMP_BATCH_SIZE])

    with open('kaggle_problems/tweet_sentiment_extraction/pickle_dump/train_{}.pkl'.format(i), 'wb') as dump_file:
        pickle.dump(vectors, dump_file)
    with open('kaggle_problems/tweet_sentiment_extraction/pickle_dump/train_cleared_sentences_{}.pkl'.format(i), 'wb') as dump_file:
        pickle.dump(train_cleared_sentences, dump_file)

    with open('kaggle_problems/tweet_sentiment_extraction/pickle_dump/train_unknown_words_{}.pkl'.format(i), 'wb') as dump_file:
        pickle.dump(train_unknown_words, dump_file)
    with open('kaggle_problems/tweet_sentiment_extraction/pickle_dump/train_known_words_{}.pkl'.format(i), 'wb') as dump_file:
        pickle.dump(train_known_words, dump_file)


In [None]:
# Convert the train 'selected_text' column batch by batch (DUMP_BATCH_SIZE rows
# at a time) and pickle each batch's vectors, cleared sentences and word
# statistics under the 'selected_train_*' file names.
# Every file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump is written.
for i in range(0, (train.shape[0] + DUMP_BATCH_SIZE - 1) // DUMP_BATCH_SIZE):
    vectors, train_cleared_sentences, train_unknown_words, train_known_words = \
        preprocessing(train['selected_text'].iloc[i * DUMP_BATCH_SIZE: (i + 1) * DUMP_BATCH_SIZE])

    with open('kaggle_problems/tweet_sentiment_extraction/pickle_dump/selected_train_{}.pkl'.format(i), 'wb') as dump_file:
        pickle.dump(vectors, dump_file)
    with open('kaggle_problems/tweet_sentiment_extraction/pickle_dump/selected_train_cleared_sentences_{}.pkl'.format(i), 'wb') as dump_file:
        pickle.dump(train_cleared_sentences, dump_file)

    with open('kaggle_problems/tweet_sentiment_extraction/pickle_dump/selected_train_unknown_words_{}.pkl'.format(i), 'wb') as dump_file:
        pickle.dump(train_unknown_words, dump_file)
    with open('kaggle_problems/tweet_sentiment_extraction/pickle_dump/selected_train_known_words_{}.pkl'.format(i), 'wb') as dump_file:
        pickle.dump(train_known_words, dump_file)


In [None]:
# Reload the known-word statistics of the train 'text' column.
# The dump cell writes one file per batch ('train_known_words_{i}.pkl'); no
# un-suffixed 'train_known_words' file is ever produced, so load every batch
# file and add the per-word counts together.
# NOTE(review): assumes each dump is a mapping word -> numeric count (entries
# whose count is None are skipped, mirroring preprocessing()) -- confirm
# against Converter.
# NOTE: pickle.load executes arbitrary code from the file; only load dumps
# produced by this notebook.
train_known_words = {}
for i in range(0, (train.shape[0] + DUMP_BATCH_SIZE - 1) // DUMP_BATCH_SIZE):
    with open('kaggle_problems/tweet_sentiment_extraction/pickle_dump/train_known_words_{}.pkl'.format(i), 'rb') as dump_file:
        batch_known_words = pickle.load(dump_file)
    for word, count in batch_known_words.items():
        if count is not None:
            train_known_words[word] = train_known_words.get(word, 0) + count


In [None]:
# (word, count) pairs from train_unknown_words, highest count first.
sorted(
    train_unknown_words.items(),
    key=lambda word_and_count: word_and_count[1],
    reverse=True,
)

In [None]:
# (word, count) pairs currently held by the converter's unknown-word
# statistics, highest count first.
sorted(
    sentence_converter.unknown_words.items(),
    key=lambda word_and_count: word_and_count[1],
    reverse=True,
)

In [None]:
# Share of unknown words in the converter's current statistics.
# None counts are skipped, exactly as preprocessing() does; summing them
# would raise a TypeError.
unknown_words = np.sum([i for i in sentence_converter.unknown_words.values() if i is not None])
known_words = np.sum([i for i in sentence_converter.known_words.values() if i is not None])
# With empty statistics the ratio would be 0/0; report 0.0 instead.
total_words = unknown_words + known_words
print(unknown_words / total_words if total_words else 0.0)

#### Проверка гипотезы

In [None]:
#
# Hypothesis: the words of selected_text form a contiguous sub-span of text
#
# Case-insensitive substring check per row; assumes both columns hold strings.
is_sub_span = [
    selected.lower() in full_text.lower()
    for selected, full_text in zip(train['selected_text'], train['text'])
]
cnt_true = sum(is_sub_span)
cnt_false = len(is_sub_span) - cnt_true
print(cnt_true, cnt_false)

In [12]:
!jupyter nbconvert --to script kaggle_problems/tweet_sentiment_extraction/benchmark.ipynb


[NbConvertApp] Converting notebook kaggle_problems/tweet_sentiment_extraction/benchmark.ipynb to script
[NbConvertApp] Writing 5993 bytes to kaggle_problems/tweet_sentiment_extraction/benchmark.py


In [None]:
# MAX_WORDS = 35

# def selected_text_start(x):
#     start_char = x['text'].find(x['selected_text'])
#     start_word = len(x['text'][:start_char].split())
#     borders = np.zeros(MAX_WORDS, dtype=int)
#     borders[start_word] = 1
#     return borders

# def selected_text_end(x):
#     end_word = np.where(x['start_word'] == 1)[0][0] + len(x['selected_text'].split()) - 1
#     borders = np.zeros(MAX_WORDS, dtype=int)
#     borders[end_word] = 1
#     return borders

# train['start_word'] = train.apply(lambda x: selected_text_start(x), axis=1)
# train['end_word'] = train.apply(lambda x: selected_text_end(x), axis=1)