In [26]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import pandas as pd
from keras.layers.core import Masking

In [27]:
data = pd.read_csv('data/Tweets.csv')
data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [28]:
data = data[['airline_sentiment','text']]
data.head()

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [29]:
data.airline_sentiment.unique()

array(['neutral', 'positive', 'negative'], dtype=object)

In [30]:
data.airline_sentiment.value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

In [31]:
data_p = data[data.airline_sentiment == 'positive']
data_n = data[data.airline_sentiment == 'negative']
data_n = data_n.iloc[:len(data_p)]


In [32]:
data = pd.concat([data_n,data_p])
data = data.sample(len(data))
data

Unnamed: 0,airline_sentiment,text
1826,negative,@united When my United flight arrives Late Fli...
1913,negative,"@united. If you show available seats, you need..."
1935,negative,"@united, more lies... http://t.co/BEqoTLNugc"
13599,positive,@AmericanAir @USAirways wonderful FC FA on fli...
1988,negative,"@united help me, united! I paid for economy pl..."
...,...,...
1187,positive,@united The DEN b44 agent (9:30am) was amazing...
7176,positive,@JetBlue thank you. Appreciate that!!
3065,negative,@united I had the worst customer experience at...
396,negative,@VirginAmerica Can I get some help with a supp...


In [33]:
data['review']=(data.airline_sentiment == "positive").astype('int')
del data['airline_sentiment']
data


Unnamed: 0,text,review
1826,@united When my United flight arrives Late Fli...,0
1913,"@united. If you show available seats, you need...",0
1935,"@united, more lies... http://t.co/BEqoTLNugc",0
13599,@AmericanAir @USAirways wonderful FC FA on fli...,1
1988,"@united help me, united! I paid for economy pl...",0
...,...,...
1187,@united The DEN b44 agent (9:30am) was amazing...,1
7176,@JetBlue thank you. Appreciate that!!,1
3065,@united I had the worst customer experience at...,0
396,@VirginAmerica Can I get some help with a supp...,0


tf.keras.layers.Embedding 把文本向量化

In [34]:
import re

token = re.compile('[A-Za-z]+|[!?,.()]')


In [35]:
def reg_text(text):
    new_text = token.findall(text)
    new_text = [word.lower() for word in new_text]
    return new_text

In [36]:
data['text']=data.text.apply(reg_text)
data

Unnamed: 0,text,review
1826,"[united, when, my, united, flight, arrives, la...",0
1913,"[united, ., if, you, show, available, seats, ,...",0
1935,"[united, ,, more, lies, ., ., ., http, t, ., c...",0
13599,"[americanair, usairways, wonderful, fc, fa, on...",1
1988,"[united, help, me, ,, united, !, i, paid, for,...",0
...,...,...
1187,"[united, the, den, b, agent, (, am, ), was, am...",1
7176,"[jetblue, thank, you, ., appreciate, that, !, !]",1
3065,"[united, i, had, the, worst, customer, experie...",0
396,"[virginamerica, can, i, get, some, help, with,...",0


In [37]:
word_set = set()
for text in data.text:
    for word in text:
        word_set.add(word)

In [38]:
max_word = len(word_set) + 1

In [39]:
word_list = list(word_set)
word_list

['autoresponse',
 'turning',
 'dfpietra',
 'careers',
 'vahdekvoke',
 'yeniettelswood',
 'robert',
 'apron',
 'hey',
 'eco',
 'screaming',
 'flightattendant',
 'refundprocedurenottoopainful',
 'literally',
 'dunno',
 'requesting',
 'swapped',
 'newlifetimecustomer',
 'customersfirst',
 'silver',
 'worn',
 'osjz',
 'notification',
 'scl',
 'alynewton',
 'earned',
 'order',
 'sitting',
 'clockwork',
 'reading',
 'refusing',
 'logged',
 'serve',
 'sorted',
 'recommend',
 '?',
 'dismissed',
 'replace',
 'volkswagen',
 'europe',
 'delta',
 'dandy',
 'limits',
 'pattern',
 'pressurecooker',
 'apologies',
 'swa',
 'push',
 'aquadilla',
 'rapidly',
 'define',
 'aired',
 'ksgcq',
 'september',
 'umosaicmecrazy',
 'notcool',
 'affect',
 'miaa',
 'mails',
 'peer',
 'selection',
 'comfortably',
 'nocharge',
 'alwaysdelayedonunited',
 'communication',
 'releasing',
 'dnstitrzwy',
 'accommodating',
 'definitive',
 'cave',
 'gma',
 'xcvqxykg',
 'sacintlairport',
 'supervisors',
 'assistance',
 'downg

In [40]:
word_index = dict((word,word_list.index(word)+1) for word in word_list)
word_index

{'autoresponse': 1,
 'turning': 2,
 'dfpietra': 3,
 'careers': 4,
 'vahdekvoke': 5,
 'yeniettelswood': 6,
 'robert': 7,
 'apron': 8,
 'hey': 9,
 'eco': 10,
 'screaming': 11,
 'flightattendant': 12,
 'refundprocedurenottoopainful': 13,
 'literally': 14,
 'dunno': 15,
 'requesting': 16,
 'swapped': 17,
 'newlifetimecustomer': 18,
 'customersfirst': 19,
 'silver': 20,
 'worn': 21,
 'osjz': 22,
 'notification': 23,
 'scl': 24,
 'alynewton': 25,
 'earned': 26,
 'order': 27,
 'sitting': 28,
 'clockwork': 29,
 'reading': 30,
 'refusing': 31,
 'logged': 32,
 'serve': 33,
 'sorted': 34,
 'recommend': 35,
 '?': 36,
 'dismissed': 37,
 'replace': 38,
 'volkswagen': 39,
 'europe': 40,
 'delta': 41,
 'dandy': 42,
 'limits': 43,
 'pattern': 44,
 'pressurecooker': 45,
 'apologies': 46,
 'swa': 47,
 'push': 48,
 'aquadilla': 49,
 'rapidly': 50,
 'define': 51,
 'aired': 52,
 'ksgcq': 53,
 'september': 54,
 'umosaicmecrazy': 55,
 'notcool': 56,
 'affect': 57,
 'miaa': 58,
 'mails': 59,
 'peer': 60,
 'sel

In [41]:
data_ok = data.text.apply(lambda x: [word_index.get(word, 0) for word in x])

In [42]:
maxlen = max(len(x) for x in data_ok)

In [43]:
data_ok = keras.preprocessing.sequence.pad_sequences(data_ok.values,maxlen)
data_ok.shape

(4726, 40)

In [44]:
model = keras.Sequential()

Embeding: 把文本映射为一个密集向量


In [45]:
model.add(layers.Embedding(max_word,50,input_length=maxlen))
model.add(Masking(mask_value=0))
model.add(layers.LSTM(64))
model.add(layers.Dense(1,activation='sigmoid'))
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 40, 50)            355050    
_________________________________________________________________
module_wrapper (ModuleWrappe (None, 40, 50)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                29440     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 384,555
Trainable params: 384,555
Non-trainable params: 0
_________________________________________________________________


In [46]:
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc']
)

In [47]:
model.fit(data_ok, data.review.values, epochs=10, batch_size=128, validation_split=0.2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x143a67749a0>