In [1]:
import pandas as pd

In [4]:
filepath_dict = {'yelp':   'sentiment_analysis/yelp_labelled.txt',
                 'amazon': 'sentiment_analysis/amazon_cells_labelled.txt',
                 'imdb':   'sentiment_analysis/imdb_labelled.txt'}

In [5]:
df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source  # Add another column filled with the source name
    df_list.append(df)

df = pd.concat(df_list)
print(df.iloc[0])

sentence    Wow... Loved this place.
label                              1
source                          yelp
Name: 0, dtype: object


In [6]:
sentences = ['John likes ice cream', 'John hates chocolate.']

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
vectorizer = CountVectorizer(min_df=0, lowercase=False)

In [9]:
vectorizer.fit(sentences)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=False, max_df=1.0, max_features=None, min_df=0,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [10]:
vectorizer.vocabulary_

{'John': 0, 'chocolate': 1, 'cream': 2, 'hates': 3, 'ice': 4, 'likes': 5}

In [11]:
vectorizer.transform(sentences).toarray()

array([[1, 0, 1, 0, 1, 1],
       [1, 1, 0, 1, 0, 0]])

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
df_yelp = df[df['source'] == 'yelp']

In [15]:
sentences = df_yelp['sentence'].values

In [16]:
y=df_yelp['label'].values

In [18]:
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state = 1000)

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
vectorizet = CountVectorizer()

In [21]:
vectorizer.fit(sentences_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=False, max_df=1.0, max_features=None, min_df=0,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [22]:
X_train = vectorizer.transform(sentences_train)

In [23]:
X_test = vectorizer.transform(sentences_test)

In [24]:
X_train

<750x1938 sparse matrix of type '<class 'numpy.int64'>'
	with 7453 stored elements in Compressed Sparse Row format>

In [30]:
from sklearn.linear_model import LogisticRegression

In [31]:
clf = LogisticRegression()

In [32]:
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [33]:
score = clf.score(X_test, y_test)

In [34]:
score

0.772

In [37]:
for source in df['source'].unique():
  df_source = df[df['source'] == source]
  sentences = df_source['sentence'].values
  y = df_source['label'].values

  sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, train_size=0.8, random_state=1000)
  vectorizer = CountVectorizer()
  vectorizer.fit(sentences_train)
  X_train = vectorizer.transform(sentences_train)
  X_test = vectorizer.transform(sentences_test)

  classifier = LogisticRegression()
  classifier.fit(X_train, y_train)
  score = classifier.score(X_test, y_test)

  print('Accuracy for {} data: {:.4f}'.format(source, score))

Accuracy for yelp data: 0.7850
Accuracy for amazon data: 0.8000
Accuracy for imdb data: 0.7867


In [38]:
from keras.models import Sequential
from keras import layers

In [39]:
input_dim = X_train.shape[1]

In [40]:
input_dim

2582

In [41]:
model = Sequential()

In [43]:
model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

In [44]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 10)                25830     
_________________________________________________________________
dense_1 (Dense)              (None, 10)                110       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 25,951
Trainable params: 25,951
Non-trainable params: 0
_________________________________________________________________


In [45]:
history = model.fit(X_train,y_train, epochs=100, verbose=False, validation_data=(X_test, y_test), batch_size=10)

  "shape. This may consume a large amount of memory." % value)


In [46]:
loss, accuracy = model.evaluate(X_train, y_train)



In [47]:
loss, accuracy = model.evaluate(X_test, y_test)



In [50]:
cities = ['London', 'Berlin', 'Berlin', 'New York', 'London']

In [51]:
from sklearn.preprocessing import LabelEncoder

In [53]:
encoder = LabelEncoder()

In [54]:
city_labels = encoder.fit_transform(cities)

In [55]:
city_labels

array([1, 0, 0, 2, 1])

In [56]:
from sklearn.preprocessing import OneHotEncoder

In [59]:
encoder = OneHotEncoder(sparse=False)

In [60]:
city_labels = city_labels.reshape((5,1))

In [61]:
city_labels

array([[1],
       [0],
       [0],
       [2],
       [1]])

In [62]:
encoder.fit_transform(city_labels)

array([[0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [63]:
from keras.preprocessing.text import Tokenizer

In [64]:
tokenizer = Tokenizer(num_words=5000)

In [65]:
tokenizer.fit_on_texts(sentences_train)

In [67]:
X_train = tokenizer.texts_to_sequences(sentences_train)

In [68]:
X_test = tokenizer.texts_to_sequences(sentences_test)

In [69]:
vocab_size = len(tokenizer.word_index) + 1

In [70]:
sentences_train[2]

'It was too predictable, even for a chick flick.  '

In [71]:
X_train[2]

[8, 12, 100, 159, 47, 16, 2, 976, 406]

In [73]:
from keras.preprocessing.sequence import pad_sequences

In [74]:
maxlen = 100
X_train = pad_sequences(X_train, maxlen=maxlen, padding='post')
X_test = pad_sequences(X_test, maxlen=maxlen, padding='post')

In [75]:
X_train[0]

array([ 10, 234,   6, 972, 973,  39,   2, 974,   9, 588, 975,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0], dtype=int32)

In [76]:
from keras.models import Sequential
from keras import layers

In [77]:
embedding_dim = 50
model = Sequential()

model.add(layers.Embedding(input_dim=vocab_size, output_dim = embedding_dim, input_length=maxlen))

In [78]:
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [79]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 50)           132750    
_________________________________________________________________
flatten (Flatten)            (None, 5000)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                50010     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 11        
Total params: 182,771
Trainable params: 182,771
Non-trainable params: 0
_________________________________________________________________


In [80]:
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test), batch_size=10)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [81]:
model.evaluate(X_train, y_train, verbose=False)

[0.0006453096284531057, 1.0]

In [82]:
model.evaluate(X_test, y_test, verbose=False)

[0.8463332056999207, 0.6733333468437195]

In [83]:
embedding_dim = 50

model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen))
model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 50)           132750    
_________________________________________________________________
global_max_pooling1d (Global (None, 50)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 10)                510       
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 11        
Total params: 133,271
Trainable params: 133,271
Non-trainable params: 0
_________________________________________________________________


In [84]:
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=50, verbose= False)

In [85]:
model.evaluate(X_train, y_train, verbose=False)

[0.0008252214756794274, 1.0]

In [86]:
model.evaluate(X_test, y_test, verbose=False)

[0.6039760112762451, 0.7933333516120911]