# import data

In [None]:
import pandas as pd
# Path to the Steam review CSV (app 107410 / Arma 3, going by the file name).
TRAIN_CSV_PATH = "../input/steam-game-reviews/game_rvw_csvs/107410_Arma3.csv"

# Read the CSV using its first column as the index, then cast every column to
# str so that all fields (including the 'voted_up' flag) are handled as text.
train = pd.read_csv(TRAIN_CSV_PATH, index_col=0)
train = train.astype(str)

# Normalise the review text to lower case.
train['review'] = train['review'].str.lower()
train.head()

In [None]:
# Tally how many reviews carry each distinct 'voted_up' value.
train['voted_up'].value_counts()

In [None]:
# Shape of the 'review' column, i.e. how many reviews were loaded.
train['review'].shape

# processing the reviews

In [None]:
# Drop the metadata columns listed below, mutating `train` in place.
train.drop(
    columns=[
        'recommendationid',
        'language',
        'timestamp_created',
        'timestamp_updated',
        'votes_up',
        'votes_funny',
        'weighted_vote_score',
        'comment_count',
        'steam_purchase',
        'received_for_free',
        'written_during_early_access',
        'author.steamid',
        'author.num_games_owned',
        'author.num_reviews',
        'author.playtime_forever',
        'author.playtime_last_two_weeks',
        'author.playtime_at_review',
        'author.last_played',
    ],
    inplace=True,
)
train.head()
# NOTE: this binds a second name to the same DataFrame; it is not a copy.
x_test_review = train

In [None]:
# Relax pandas' display settings so the rows inspected below are not truncated.
pd.set_option('display.max_rows', None)      # show every row
pd.set_option('display.max_columns', None)   # show every column
pd.set_option('display.width', None)
# None disables cell-text truncation. The former spelling, -1, has been
# deprecated since pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)
# Display the rows selected by the slice 51517:51537.
x_test_review[51517:51537]

In [None]:
import nltk
from nltk.tokenize import word_tokenize 
# Quick sanity check: run word_tokenize on a short sample string.
text = "word tokenize test"
word_tokenize(text)

In [None]:
# Apply NLTK's word_tokenize to every review and store the result in a new
# 'review_tokenized' column.
train['review_tokenized'] = train['review'].apply(word_tokenize)

In [None]:
# Preview the first rows of the frame.
train.head()

In [None]:
import keras
MAX_NUM_WORDS = 10000 # vocabulary size limit handed to the tokenizer as num_words
tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_NUM_WORDS)

In [None]:
# Text corpus used to fit the tokenizer: the tokenized reviews.
corpus_x1 = train.review_tokenized
# Only one text column feeds the corpus, so it is used directly; wrapping a
# single Series in pd.concat([...]) merely produced a redundant copy of it.
corpus = corpus_x1
corpus.shape

In [None]:
tokenizer.fit_on_texts(corpus) # build the tokenizer's vocabulary from the text corpus

In [None]:
x_train = tokenizer.texts_to_sequences(corpus_x1) # x_train is a list holding one integer sequence per review

In [None]:
# Number of encoded reviews.
len(x_train)

In [None]:
x_train[:1] # inspect the integer sequence of the first row

In [None]:
# Map the indices of the first sequence back to their words and print them.
for seq in x_train[:1]:
    print(list(map(tokenizer.index_word.__getitem__, seq)))

# processing the labels

In [None]:
import numpy as np 

# Integer code assigned to each label string.
label_to_index = {'False': 0, 'True': 1}

# Translate every 'voted_up' entry to its integer code; an entry that is not a
# key of label_to_index raises KeyError.
y_train = train['voted_up'].apply(label_to_index.__getitem__)

# Convert the encoded labels to a float32 NumPy array.
y_train = y_train.to_numpy().astype('float32')

y_train[:5]

# split data into training and test sets

In [None]:
# First 51517 sequences stay in the training set; everything after them
# becomes the test set. Both slices are taken before either name is rebound.
x_train, x_test = x_train[:51517], x_train[51517:]

In [None]:
# First 51517 labels stay in the training set; everything after them becomes
# the test set. Both slices are taken before either name is rebound.
y_train, y_test = y_train[:51517], y_train[51517:]

In [None]:
# Number of training labels.
len(y_train)

In [None]:
# Number of test labels.
len(y_test)

In [None]:
# Number of training samples.
len(x_train)

In [None]:
# Number of test samples.
len(x_test)

# model creation

In [None]:
# Multi-hot encode the training and test sequences. "binary" is the default
# mode of sequences_to_matrix and is spelled out here for clarity.
x_train, x_test = (
    tokenizer.sequences_to_matrix(x_train, mode="binary"),
    tokenizer.sequences_to_matrix(x_test, mode="binary"),
)

In [None]:
# Build the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()                       # create the model object
# The input width is read from the encoded matrix instead of being hard-coded
# as 10000, so the network stays in sync with the tokenizer's num_words.
model.add(Dense(16, activation='relu', input_dim=x_train.shape[1]))  # first layer (declares the input width)
model.add(Dense(16, activation='relu'))    # hidden layer
model.add(Dense(1, activation='sigmoid'))  # output layer

# Compile the model
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

# Train and validate the model
history = model.fit(x_train, y_train,
                    batch_size=512,  # 512 samples per batch
                    epochs=10,       # train for 10 epochs
                    verbose = 2,     # concise output (no progress bar)
                    validation_split=0.2)
                             # ^ the last 20% of the training data is held out for validation

import sys
sys.path.append(r"../input/util2py")

import util2 as u

u.plot(history.history,
       ('loss', 'val_loss'),          # keys in the history data
       'Training & Validation Loss',  # title of the line chart
       ('Epoch','Loss'))              # names of the x and y axes
u.plot(history.history,
       ('acc', 'val_acc'),            # keys in the history data
       'Training & Validation Acc',   # title of the line chart
       ('Epoch','Acc'))               # names of the x and y axes

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Rebuild the same network from scratch and train it on the full training set
# (no validation split) before scoring it on the held-out test data.
model = Sequential()                       # create the model object
# The input width is read from the encoded matrix instead of being hard-coded
# as 10000, so the network stays in sync with the tokenizer's num_words.
model.add(Dense(16, activation='relu', input_dim=x_train.shape[1]))  # first layer (declares the input width)
model.add(Dense(16, activation='relu'))    # hidden layer
model.add(Dense(1, activation='sigmoid'))  # output layer

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

history = model.fit(x_train, y_train,
                    batch_size=512,
                    epochs=3,      # train for only 3 epochs
                    verbose=2)     # no progress bar

loss, acc = model.evaluate(x_test, y_test, verbose=2)  # measure actual performance on the test data
print('準確率：', acc)

In [None]:
# Raw model outputs for the first 20 test samples.
model.predict(x_test[:20])

In [None]:
# Class predictions (0/1) for the first 20 test samples.
# Sequential.predict_classes was removed in TensorFlow 2.6; for a single
# sigmoid output the equivalent is thresholding the probabilities at 0.5.
(model.predict(x_test[:20]) > 0.5).astype("int32")

In [None]:
# True labels of the first 20 test samples, cast to int.
y_test[:20].astype(int)