In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [3]:
# Tab-separated training and test review files, plus the sample submission file.
df = pd.read_csv('labeledTrainData.tsv', sep='\t')
test = pd.read_csv('testData.tsv', sep='\t')
submission = pd.read_csv('sampleSubmission.csv')
# Additional data, read as latin-1.
imdb_df = pd.read_csv('imdb_master.csv', encoding='latin-1')

In [4]:
# Number of missing values in each column of the training frame.
df.isna().sum()

id           0
sentiment    0
review       0
dtype: int64

In [5]:
# Keep only the text and label columns, renaming 'label' to 'sentiment'
# so the frame lines up with the training frame's column names.
imdb_df = imdb_df[['review', 'label']].rename(columns={'label': 'sentiment'})
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   review     100000 non-null  object
 1   sentiment  100000 non-null  object
dtypes: object(2)
memory usage: 1.5+ MB


In [6]:
# Row labels of the entries whose sentiment is the string 'unsup'.
is_unsup = imdb_df['sentiment'].eq('unsup')
unsup = imdb_df.loc[is_unsup].index
unsup

Int64Index([50000, 50001, 50002, 50003, 50004, 50005, 50006, 50007, 50008,
            50009,
            ...
            99990, 99991, 99992, 99993, 99994, 99995, 99996, 99997, 99998,
            99999],
           dtype='int64', length=50000)

In [7]:
# Remove the 'unsup' rows in place, then list the sentiment values that remain.
imdb_df.drop(index=unsup, inplace=True)
imdb_df['sentiment'].unique()

array(['neg', 'pos'], dtype=object)

In [8]:
# Encode the string labels as integers: negative -> 0, positive -> 1.
label_to_int = {'neg': 0, 'pos': 1}
imdb_df['sentiment'] = imdb_df['sentiment'].replace(label_to_int)
imdb_df['sentiment']

0        0
1        0
2        0
3        0
4        0
        ..
49995    1
49996    1
49997    1
49998    1
49999    1
Name: sentiment, Length: 50000, dtype: int64

In [9]:
# The 'id' column is not used as a feature; drop it.
df = df.drop(columns='id')

In [10]:
# Merge the datasets: append the additional reviews below the training reviews
# and renumber the rows from 0.
# NOTE(review): if imdb_master.csv shares reviews with labeledTrainData.tsv or
# testData.tsv, this introduces duplicates / test leakage -- verify.
df = pd.concat([df, imdb_df], ignore_index=True)
df

Unnamed: 0,sentiment,review
0,1,With all this stuff going down at the moment w...
1,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,0,The film starts with a manager (Nicholas Bell)...
3,0,It must be assumed that those who praised this...
4,1,Superbly trashy and wondrously unpretentious 8...
...,...,...
74995,1,"Seeing as the vote average was pretty low, and..."
74996,1,"The plot had some wretched, unbelievable twist..."
74997,1,I am amazed at how this movie(and most others ...
74998,1,A Christmas Together actually came before my t...


In [11]:
# Data preprocessing: work on a lower-cased copy of each review,
# leaving the original 'review' column untouched.
df['review_mo'] = df['review'].map(str.lower)
df.head()

Unnamed: 0,sentiment,review,review_mo
0,1,With all this stuff going down at the moment w...,with all this stuff going down at the moment w...
1,1,"\The Classic War of the Worlds\"" by Timothy Hi...","\the classic war of the worlds\"" by timothy hi..."
2,0,The film starts with a manager (Nicholas Bell)...,the film starts with a manager (nicholas bell)...
3,0,It must be assumed that those who praised this...,it must be assumed that those who praised this...
4,1,Superbly trashy and wondrously unpretentious 8...,superbly trashy and wondrously unpretentious 8...


In [12]:
import re

# Replace every run of digits with a single space.
p = re.compile(r"[0-9]+")
df['review_mo'] = df['review_mo'].apply(lambda x: p.sub(" ", x))

# Replace every run of non-word characters (punctuation, whitespace) with one space.
# Raw string on purpose: "\W" inside a normal string literal is an invalid escape
# sequence, which Python reports as a DeprecationWarning / SyntaxWarning.
p = re.compile(r"\W+")
df['review_mo'] = df['review_mo'].apply(lambda x: p.sub(" ", x))

df.head()

Unnamed: 0,sentiment,review,review_mo
0,1,With all this stuff going down at the moment w...,with all this stuff going down at the moment w...
1,1,"\The Classic War of the Worlds\"" by Timothy Hi...",the classic war of the worlds by timothy hine...
2,0,The film starts with a manager (Nicholas Bell)...,the film starts with a manager nicholas bell g...
3,0,It must be assumed that those who praised this...,it must be assumed that those who praised this...
4,1,Superbly trashy and wondrously unpretentious 8...,superbly trashy and wondrously unpretentious s...


In [13]:
from konlpy.tag import Okt

# One shared tagger instance; later cells reuse `okt` for the test reviews.
okt = Okt()

# Split each cleaned review string into a list of tokens.
# NOTE(review): Okt is KoNLPy's Korean morphological analyzer, while the reviews
# shown above are English; a plain whitespace split may be sufficient -- verify
# before swapping, since the test cells call `okt.morphs` as well.
df['review_mo'] = df['review_mo'].map(okt.morphs)
df.head()

Unnamed: 0,sentiment,review,review_mo
0,1,With all this stuff going down at the moment w...,"[with, all, this, stuff, going, down, at, the,..."
1,1,"\The Classic War of the Worlds\"" by Timothy Hi...","[the, classic, war, of, the, worlds, by, timot..."
2,0,The film starts with a manager (Nicholas Bell)...,"[the, film, starts, with, a, manager, nicholas..."
3,0,It must be assumed that those who praised this...,"[it, must, be, assumed, that, those, who, prai..."
4,1,Superbly trashy and wondrously unpretentious 8...,"[superbly, trashy, and, wondrously, unpretenti..."


In [14]:
# English stop word list from NLTK, stored as a set for fast membership tests.
# Reading it needs the NLTK 'stopwords' corpus to be available locally.
from nltk.corpus import stopwords  
stop_words = set(stopwords.words('english'))

def remove_stopwords(word_tokens, stopword_set=None):
    """Return the tokens of ``word_tokens`` that are not stop words.

    Parameters
    ----------
    word_tokens : iterable of str
        Tokens of one document, in order.
    stopword_set : container of str, optional
        Words to drop. When omitted, the notebook-level ``stop_words`` set
        is used, which matches the original single-argument behaviour.

    Returns
    -------
    list of str
        The kept tokens, in their original order (duplicates preserved).
    """
    if stopword_set is None:
        # Resolved at call time so the function still follows the global set.
        stopword_set = stop_words
    return [w for w in word_tokens if w not in stopword_set]

# Filter the stop words out of every token list.
df['review_mo'] = df['review_mo'].map(remove_stopwords)

df.head()

Unnamed: 0,sentiment,review,review_mo
0,1,With all this stuff going down at the moment w...,"[stuff, going, moment, mj, started, listening,..."
1,1,"\The Classic War of the Worlds\"" by Timothy Hi...","[classic, war, worlds, timothy, hines, enterta..."
2,0,The film starts with a manager (Nicholas Bell)...,"[film, starts, manager, nicholas, bell, giving..."
3,0,It must be assumed that those who praised this...,"[must, assumed, praised, film, greatest, filme..."
4,1,Superbly trashy and wondrously unpretentious 8...,"[superbly, trashy, wondrously, unpretentious, ..."


In [15]:
from nltk.stem import WordNetLemmatizer

# Shared lemmatizer; the test-set cells reuse `n`.
n = WordNetLemmatizer()

# Lemmatize each token of each review, keeping the token order.
df['review_mo'] = df['review_mo'].map(lambda tokens: list(map(n.lemmatize, tokens)))

df.head()

Unnamed: 0,sentiment,review,review_mo
0,1,With all this stuff going down at the moment w...,"[stuff, going, moment, mj, started, listening,..."
1,1,"\The Classic War of the Worlds\"" by Timothy Hi...","[classic, war, world, timothy, hines, entertai..."
2,0,The film starts with a manager (Nicholas Bell)...,"[film, start, manager, nicholas, bell, giving,..."
3,0,It must be assumed that those who praised this...,"[must, assumed, praised, film, greatest, filme..."
4,1,Superbly trashy and wondrously unpretentious 8...,"[superbly, trashy, wondrously, unpretentious, ..."


In [23]:
# Features are the preprocessed token lists; targets are the 0/1 sentiment labels.
X_train, y_train = df['review_mo'], df['sentiment']

In [24]:
from tensorflow.keras.preprocessing.text import Tokenizer
# Vocabulary cap handed to the Tokenizer; the same value is later used as the
# Embedding layer's input dimension.
max_features = 3000
tokenizer = Tokenizer(num_words=max_features)
# Build the word index from the training token lists only.
tokenizer.fit_on_texts(X_train)

# Replace each token list by a list of integer word indices.
# X_train is rebound here, so this cell is not safe to run twice in a row.
X_train = tokenizer.texts_to_sequences(X_train)

In [25]:
# Sanity check: show the integer sequence produced for the second review.
print(X_train[1])

[218, 168, 83, 326, 3, 417, 31, 21, 461, 1317, 1682, 1015, 16, 218, 137, 268, 2579, 189, 3, 2334, 89, 637, 590, 242, 2137, 52, 79, 38, 613, 1015, 2974, 180, 668, 137, 417, 191, 44, 176, 26, 2, 1913, 957, 44, 174, 295, 721, 2, 532, 2295, 5, 2047, 20, 43, 887, 957, 393, 461, 268, 142, 2566, 1682, 1015, 16, 218, 438, 159, 326, 27, 628, 957]


In [26]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Fixed sequence length for the model input; reused when padding the test set.
maxlen=125
# Pad / truncate every sequence to `maxlen` using pad_sequences' default settings.
X_train = pad_sequences(X_train, maxlen=maxlen)

In [50]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
# Take every layer from tensorflow.keras. Mixing layers from the standalone
# `keras` package with a `tensorflow.keras` model can break when the two
# packages are not the same implementation/version.
from tensorflow.keras.layers import (
    Bidirectional,
    Dense,
    Dropout,
    Embedding,
    GlobalMaxPool1D,
    LSTM,
)

# Size of the dense vector learned for each word index.
embed_size = 64

model2 = Sequential()
model2.add(Embedding(max_features, embed_size))  # map integer-encoded words to dense vectors
model2.add(Bidirectional(LSTM(16, return_sequences=True)))  # keep per-timestep outputs for pooling
model2.add(GlobalMaxPool1D())  # max over the time axis -> one vector per review
# NOTE(review): this Dense layer has no activation, so it is a purely linear
# projection in front of the output layer -- confirm that is intended.
model2.add(Dense(32))
model2.add(Dropout(0.1))
model2.add(Dense(1, activation="sigmoid"))  # probability of the positive class
model2.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, None, 64)          192000    
                                                                 
 bidirectional_6 (Bidirectio  (None, None, 32)         10368     
 nal)                                                            
                                                                 
 global_max_pooling1d_6 (Glo  (None, 32)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_12 (Dense)            (None, 32)                1056      
                                                                 
 dropout_6 (Dropout)         (None, 32)                0         
                                                                 
 dense_13 (Dense)            (None, 1)                

In [51]:
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Keras' `validation_split` holds out the LAST fraction of the arrays, before any
# shuffling. Here the tail of the data is the end of the appended imdb_df block
# (the displayed last rows are all positive), so that slice is not representative.
# Hold out a shuffled, label-stratified 20% instead.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)
history2 = model2.fit(
    X_fit, y_fit,
    batch_size=300,
    epochs=8,
    validation_data=(X_val, y_val),
)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [32]:
# Same text cleaning as for the training reviews: lower-case, then replace
# digit runs and non-word runs with a single space.
test['review_mo'] = test['review'].apply(lambda x: x.lower())

p1 = re.compile(r"[0-9]+")
test['review_mo'] = test['review_mo'].apply(lambda x: p1.sub(" ", x))

# Raw string on purpose: "\W" inside a normal string literal is an invalid escape
# sequence, which Python reports as a DeprecationWarning / SyntaxWarning.
p2 = re.compile(r"\W+")
test['review_mo'] = test['review_mo'].apply(lambda x: p2.sub(" ", x))

In [33]:
# Tokenize the cleaned test reviews with the same tagger used for training,
# then drop the stop words from each token list.
test_tokens = test['review_mo'].map(okt.morphs)
test['review_mo'] = test_tokens

test['review_mo'] = test['review_mo'].map(remove_stopwords)

In [34]:
# Lemmatize every token of every test review with the shared lemmatizer `n`.
test['review_mo'] = test['review_mo'].map(lambda tokens: list(map(n.lemmatize, tokens)))

In [35]:
# Preprocessed token lists of the test reviews.
x_test = test['review_mo']

In [36]:
# Encode the test token lists with the tokenizer fitted on the training data.
# NOTE(review): this rebinds `test` from the DataFrame to a list of sequences, so
# the earlier test-preprocessing cells cannot be re-run without reloading the file.
test = tokenizer.texts_to_sequences(x_test)

In [37]:
# Pad / truncate the test sequences to the same length used for training.
test = pad_sequences(test, maxlen=maxlen)

In [52]:
# Model outputs for the padded test sequences (sigmoid output layer).
preds_test2= model2.predict(test)



In [53]:
# Fresh copy of the sample submission file to fill in with this model's predictions.
submission2 = pd.read_csv('sampleSubmission.csv')

In [54]:
# Store the predicted probabilities in the 'sentiment' column.
# Assumes the rows of the sample submission are in the same order as the test
# reviews -- TODO confirm.
submission2['sentiment'] = preds_test2
submission2.head()

Unnamed: 0,id,sentiment
0,12311_10,0.999991
1,8348_2,0.005421
2,5828_4,0.237068
3,7186_2,0.507448
4,12128_7,0.945802


In [55]:
# Turn probabilities into hard labels by rounding; the column stays float (0.0 / 1.0).
# NOTE(review): if the target metric is ranking-based (e.g. ROC AUC), submitting the
# raw probabilities would score differently from rounded labels -- confirm.
submission2['sentiment'] = submission2['sentiment'].round()
submission2.head()

Unnamed: 0,id,sentiment
0,12311_10,1.0
1,8348_2,0.0
2,5828_4,0.0
3,7186_2,1.0
4,12128_7,1.0


In [57]:
# Write the submission file without the DataFrame index column.
submission2.to_csv('submission2.csv',index=False)