In [1]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import random
import re
import bs4
import gensim
import nltk
import numpy as np
import pandas as pd
import keras.preprocessing.sequence

Using TensorFlow backend.


In [2]:
def load_data(train_file, test_file, extra_file):
    train_set = pd.read_csv(train_file, header=0, sep='\t')
    test_set = pd.read_csv(test_file, header=0, sep='\t')
    extra_set = pd.read_csv(extra_file, header=0, sep='\t', error_bad_lines=False, warn_bad_lines=True)
    print('Train set info:')
    train_set.info()
    print('Test set info:')
    test_set.info()
    print('Extra set info:')
    extra_set.info()

    return train_set, test_set, extra_set


def raw_to_words(df, column, remove_stopwords=False):
    wordnet = nltk.stem.WordNetLemmatizer()
    stopwords = set(nltk.corpus.stopwords.words('english'))

    df[column] = df[column].map(lambda x: bs4.BeautifulSoup(x, 'html.parser').get_text())
    df[column] = df[column].map(lambda x: re.sub(r'[^a-zA-Z,!?\'\`]', ' ', x))
    df[column] = df[column].map(lambda x: x.lower().split())
    df[column] = df[column].map(lambda x: [wordnet.lemmatize(y) for y in x])
    if remove_stopwords:
        df[column] = df[column].map(lambda x: [y for y in x if y not in stopwords])

    df['num_words'] = df[column].apply(lambda x: len(x))

    return df


def raw_to_texts(df, column, remove_stopwords=False):
    df = raw_to_words(df, column, remove_stopwords)
    print(df.describe())
    df[column] = df[column].map(lambda x: ' '.join(x))
    return df

In [3]:
train_set, test_set, extra_set = load_data(r'E:\OpenSourceDatasetCode\Dataset\Bag of Words Meets Bags of Popcorn\labeledTrainData.tsv',
                                           r'E:\OpenSourceDatasetCode\Dataset\Bag of Words Meets Bags of Popcorn\testData.tsv',
                                           r'E:\OpenSourceDatasetCode\Dataset\Bag of Words Meets Bags of Popcorn\unlabeledTrainData.tsv')

train_df = raw_to_texts(train_set, 'review', remove_stopwords=True)
test_df = raw_to_texts(test_set, 'review', remove_stopwords=True)
extra_df = raw_to_texts(extra_set, 'review', remove_stopwords=True)

b'Skipping line 43043: expected 2 fields, saw 3\n'


Train set info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
id           25000 non-null object
sentiment    25000 non-null int64
review       25000 non-null object
dtypes: int64(1), object(2)
memory usage: 586.0+ KB
Test set info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
id        25000 non-null object
review    25000 non-null object
dtypes: object(2)
memory usage: 390.7+ KB
Extra set info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49998 entries, 0 to 49997
Data columns (total 2 columns):
id        49998 non-null object
review    49998 non-null object
dtypes: object(2)
memory usage: 781.3+ KB


         sentiment     num_words
count  25000.00000  25000.000000
mean       0.50000    127.947480
std        0.50001     96.277805
min        0.00000      5.000000
25%        0.00000     68.000000
50%        0.50000     95.000000
75%        1.00000    156.000000
max        1.00000   1463.000000


          num_words
count  25000.000000
mean     124.983120
std       93.685699
min        3.000000
25%       68.000000
50%       93.000000
75%      152.000000
max     1248.000000


          num_words
count  49998.000000
mean     128.484659
std       96.426765
min        3.000000
25%       69.000000
50%       95.000000
75%      157.000000
max     1455.000000


In [4]:
train_Y = train_df.sentiment
del train_df['sentiment']

In [5]:
all_reviews = pd.concat([train_df, test_df, extra_df], ignore_index=True)

In [6]:
from keras.preprocessing import text
sequence_tokenizer = text.Tokenizer()
sequence_tokenizer.fit_on_texts(line for line in all_reviews['review'].values)

In [25]:
dic_len = len(sequence_tokenizer.word_index)

In [26]:
dic_len

168114

In [27]:
from keras.preprocessing import sequence
def texts_to_sequences(df, column, tokenizer, maxlen=300):
    seq = tokenizer.texts_to_sequences(line for line in df[column].values)
    print(type(seq[0]))
    print('mean:', np.mean([len(x) for x in seq]))
    print('std:', np.std([len(x) for x in seq]))
    print('median:', np.median([len(x) for x in seq]))
    print('max:', np.max([len(x) for x in seq]))
    seq = sequence.pad_sequences(seq, maxlen=maxlen, padding='post', truncating='post')
    return seq

In [31]:
train_X = texts_to_sequences(train_df, 'review', sequence_tokenizer, maxlen=1500)
del train_df

<class 'list'>
mean: 127.5788
std: 95.8301327901
median: 95.0
max: 1462
