In [4]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import pandas as pd

In [5]:
data = pd.read_csv('data/Tweets.csv')
data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [6]:
data = data[['airline_sentiment','text']]
data.head()

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [7]:
data.airline_sentiment.unique()

array(['neutral', 'positive', 'negative'], dtype=object)

In [8]:
data.airline_sentiment.value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

In [9]:
data_p = data[data.airline_sentiment == 'positive']
data_n = data[data.airline_sentiment == 'negative']
data_n = data_n.iloc[:len(data_p)]


In [10]:
data = pd.concat([data_n,data_p])
data = data.sample(len(data))
data

Unnamed: 0,airline_sentiment,text
2036,negative,"@united hey, it's 4am, guess what I'm doing? N..."
14360,positive,@AmericanAir I would like to thank the custome...
111,positive,"@VirginAmerica has getaway deals through May, ..."
863,negative,@united I send an email about my bad experienc...
1605,negative,"@united flight 3870 to Newark, stuck in the ru..."
...,...,...
5829,positive,"@SouthwestAir Sorry to spam, it would just be ..."
12833,positive,@AmericanAir no kidding! Gonna take some beati...
16,positive,@VirginAmerica So excited for my first cross c...
349,negative,@VirginAmerica It's fine. Already done with my...


In [11]:
data['review']=(data.airline_sentiment == "positive").astype('int')
del data['airline_sentiment']
data


Unnamed: 0,text,review
2036,"@united hey, it's 4am, guess what I'm doing? N...",0
14360,@AmericanAir I would like to thank the custome...,1
111,"@VirginAmerica has getaway deals through May, ...",1
863,@united I send an email about my bad experienc...,0
1605,"@united flight 3870 to Newark, stuck in the ru...",0
...,...,...
5829,"@SouthwestAir Sorry to spam, it would just be ...",1
12833,@AmericanAir no kidding! Gonna take some beati...,1
16,@VirginAmerica So excited for my first cross c...,1
349,@VirginAmerica It's fine. Already done with my...,0


tf.keras.layers.Embedding 把文本向量化

In [12]:
import re

token = re.compile('[A-Za-z]+|[!?,.()]')


In [13]:
def reg_text(text):
    new_text = token.findall(text)
    new_text = [word.lower() for word in new_text]
    return new_text

In [14]:
data['text']=data.text.apply(reg_text)
data

Unnamed: 0,text,review
2036,"[united, hey, ,, it, s, am, ,, guess, what, i,...",0
14360,"[americanair, i, would, like, to, thank, the, ...",1
111,"[virginamerica, has, getaway, deals, through, ...",1
863,"[united, i, send, an, email, about, my, bad, e...",0
1605,"[united, flight, to, newark, ,, stuck, in, the...",0
...,...,...
5829,"[southwestair, sorry, to, spam, ,, it, would, ...",1
12833,"[americanair, no, kidding, !, gonna, take, som...",1
16,"[virginamerica, so, excited, for, my, first, c...",1
349,"[virginamerica, it, s, fine, ., already, done,...",0


In [15]:
word_set = set()
for text in data.text:
    for word in text:
        word_set.add(word)

In [16]:
max_word = len(word_set) + 1

In [17]:
word_list = list(word_set)
word_list

['autoresponse',
 'turning',
 'dfpietra',
 'yeniettelswood',
 'vahdekvoke',
 'careers',
 'robert',
 'apron',
 'hey',
 'screaming',
 'eco',
 'flightattendant',
 'refundprocedurenottoopainful',
 'literally',
 'dunno',
 'requesting',
 'swapped',
 'newlifetimecustomer',
 'customersfirst',
 'silver',
 'worn',
 'osjz',
 'notification',
 'scl',
 'alynewton',
 'order',
 'earned',
 'sitting',
 'clockwork',
 'reading',
 'refusing',
 'logged',
 'serve',
 'sorted',
 'recommend',
 '?',
 'dismissed',
 'volkswagen',
 'replace',
 'europe',
 'delta',
 'dandy',
 'limits',
 'pattern',
 'pressurecooker',
 'apologies',
 'swa',
 'push',
 'aquadilla',
 'rapidly',
 'define',
 'aired',
 'ksgcq',
 'september',
 'umosaicmecrazy',
 'notcool',
 'affect',
 'miaa',
 'mails',
 'peer',
 'selection',
 'comfortably',
 'nocharge',
 'alwaysdelayedonunited',
 'communication',
 'dnstitrzwy',
 'releasing',
 'accommodating',
 'definitive',
 'cave',
 'gma',
 'xcvqxykg',
 'downgrade',
 'supervisors',
 'assistance',
 'sacintlair

In [18]:
word_index = dict((word,word_list.index(word)+1) for word in word_list)
word_index

{'autoresponse': 1,
 'turning': 2,
 'dfpietra': 3,
 'yeniettelswood': 4,
 'vahdekvoke': 5,
 'careers': 6,
 'robert': 7,
 'apron': 8,
 'hey': 9,
 'screaming': 10,
 'eco': 11,
 'flightattendant': 12,
 'refundprocedurenottoopainful': 13,
 'literally': 14,
 'dunno': 15,
 'requesting': 16,
 'swapped': 17,
 'newlifetimecustomer': 18,
 'customersfirst': 19,
 'silver': 20,
 'worn': 21,
 'osjz': 22,
 'notification': 23,
 'scl': 24,
 'alynewton': 25,
 'order': 26,
 'earned': 27,
 'sitting': 28,
 'clockwork': 29,
 'reading': 30,
 'refusing': 31,
 'logged': 32,
 'serve': 33,
 'sorted': 34,
 'recommend': 35,
 '?': 36,
 'dismissed': 37,
 'volkswagen': 38,
 'replace': 39,
 'europe': 40,
 'delta': 41,
 'dandy': 42,
 'limits': 43,
 'pattern': 44,
 'pressurecooker': 45,
 'apologies': 46,
 'swa': 47,
 'push': 48,
 'aquadilla': 49,
 'rapidly': 50,
 'define': 51,
 'aired': 52,
 'ksgcq': 53,
 'september': 54,
 'umosaicmecrazy': 55,
 'notcool': 56,
 'affect': 57,
 'miaa': 58,
 'mails': 59,
 'peer': 60,
 'sel

In [19]:
data_ok = data.text.apply(lambda x: [word_index.get(word, 0) for word in x])

In [20]:
maxlen = max(len(x) for x in data_ok)

In [21]:
data_ok = keras.preprocessing.sequence.pad_sequences(data_ok.values,maxlen)
data_ok.shape

(4726, 40)

In [22]:
model = keras.Sequential()

Embeding: 把文本映射为一个密集向量


In [23]:
model.add(layers.Embedding(max_word,50,input_length=maxlen))
model.add(layers.LSTM(64))
model.add(layers.Dense(1,activation='sigmoid'))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 40, 50)            355050    
_________________________________________________________________
lstm (LSTM)                  (None, 64)                29440     
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 384,555
Trainable params: 384,555
Non-trainable params: 0
_________________________________________________________________


In [24]:
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc']
)

In [25]:
model.fit(data_ok, data.review.values, epochs=10, batch_size=128, validation_split=0.2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1494123da90>