In [1]:
import pandas as pd

neg=pd.read_csv('/kaggle/input/datasetsandmodel/neg.csv',header=None,index_col=None)
pos=pd.read_csv('/kaggle/input/datasetsandmodel/pos.csv',header=None,index_col=None,on_bad_lines='skip')
neu=pd.read_csv('/kaggle/input/datasetsandmodel/neutral.csv', header=None, index_col=None)

In [2]:
import numpy as np

combined = np.concatenate((pos[0], neu[0], neg[0]))
combined.shape

(21088,)

In [27]:
# pos -> 1; neu -> 0; neg -> -1
y = np.concatenate((np.ones(len(pos), dtype=int), np.zeros(len(neu), dtype=int), -1*np.ones(len(neg),dtype=int)))
y.shape

(21088,)

In [3]:
import jieba

# 对句子进行分词，并去掉换行符
def tokenizer(text):
    ''' Simple Parser converting each document to lower-case, then
        removing the breaks for new lines and finally splitting on the
        whitespace
    '''
    text = [jieba.lcut(str(document).replace('\n', '')) for document in text]  # Ensure document is a string
    return text

combined = tokenizer(combined)
len(combined)

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.963 seconds.
Prefix dict has been built successfully.


21088

In [25]:
from gensim.models.word2vec import Word2Vec
from gensim.corpora.dictionary import Dictionary
from keras.preprocessing import sequence
import multiprocessing

cpu_count = multiprocessing.cpu_count() # 4
vocab_dim = 100
n_iterations = 10  # ideally more..
n_exposures = 10 # 所有频数超过10的词语
window_size = 7
n_epoch = 4
input_length = 100
maxlen = 100

def create_dictionaries(model=None,
                        combined=None):
    ''' Function does are number of Jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.index_to_key,
                            allow_update=True)
        #  freqxiao10->0 所以k+1
        w2indx = {v: k+1 for k, v in gensim_dict.items()} # 所有频数超过10的词语的索引, (k->v)=>(v->k)
        w2vec = {word: model.wv[word] for word in w2indx.keys()}  # 所有频数超过10的词语的词向量, (word->model(word))

        def parse_dataset(combined):  # 闭包-->临时使用
            ''' Words become integers '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)  # freqxiao10->0
                data.append(new_txt)
            return data  # word=>index
        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(combined, maxlen=maxlen)  # 每个句子所含词语对应的索引，所以句子中含有频数小于10的词语，索引为0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')


# 创建词语字典，并返回每个词语的索引，词向量，以及每个句子所对应的词语索引
def word2vec_train(combined):
    model = Word2Vec(vector_size=vocab_dim,  # Changed 'size' to 'vector_size'
                     min_count=n_exposures,
                     window=window_size,
                     workers=cpu_count,
                     epochs=n_iterations)  # 'iter' replaced with 'epochs'
    model.build_vocab(combined)  # input: list
    model.train(combined, total_examples=model.corpus_count, epochs=model.epochs)  # Update the training call
    model.save('Word2vec_model.pkl')
    index_dict, word_vectors, combined = create_dictionaries(model=model, combined=combined)
    return index_dict, word_vectors, combined

print('Training a Word2vec model...')
index_dict, word_vectors, combined = word2vec_train(combined)


Training a Word2vec model...


In [58]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, load_model  # Updated import
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Activation  # Updated import
import numpy as np
import keras
import sys
n_epoch = 10
np.random.seed(1337)  # For Reproducibility
sys.setrecursionlimit(1000000)

batch_size = 32

def get_data(index_dict, word_vectors, combined, y):
    n_symbols = len(index_dict) + 1  # 所有单词的索引数，频数小于10的词语索引为0，所以加1
    embedding_weights = np.zeros((n_symbols, vocab_dim))  # 初始化 索引为0的词语，词向量全为0
    for word, index in index_dict.items():  # 从索引为1的词语开始，对每个词语对应其词向量
        embedding_weights[index, :] = word_vectors[word]
    x_train, x_test, y_train, y_test = train_test_split(combined, y, test_size=0.2)
    y_train = keras.utils.to_categorical(y_train, num_classes=3) 
    y_test = keras.utils.to_categorical(y_test, num_classes=3)
    return n_symbols, embedding_weights, x_train, y_train, x_test, y_test

# Define F1 score function

from tensorflow.keras import backend as K

def f1_score(y_true, y_pred):
    # 获取类别数
    num_classes = K.int_shape(y_true)[-1]

    # 将 y_pred 和 y_true 转换为类别标签
    y_pred = K.argmax(y_pred, axis=-1)
    y_true = K.argmax(y_true, axis=-1)

    # 初始化 F1-score
    f1 = 0.0

    # 对每个类别计算 F1-score
    for i in range(num_classes):
        # 计算该类别的 true positives, predicted positives 和 possible positives
        true_positives = K.sum(K.cast(K.equal(y_true, i) & K.equal(y_pred, i), 'float32'))
        predicted_positives = K.sum(K.cast(K.equal(y_pred, i), 'float32'))
        possible_positives = K.sum(K.cast(K.equal(y_true, i), 'float32'))

        # 计算该类别的 precision 和 recall
        precision = true_positives / (predicted_positives + K.epsilon())
        recall = true_positives / (possible_positives + K.epsilon())

        # 计算该类别的 F1-score
        f1_class = 2 * (precision * recall) / (precision + recall + K.epsilon())
        
        # 将该类别的 F1-score 加入总的 F1-score
        f1 += f1_class

    # 计算平均 F1-score
    f1 = f1 / num_classes
    return f1
# 定义网络结构
def train_lstm(n_symbols, embedding_weights, x_train, y_train, x_test, y_test):
    print('Defining a Simple Keras Model...')
    model = Sequential()  # or Graph or whatever
    model.add(Embedding(output_dim=vocab_dim,
                        input_dim=n_symbols,
                        mask_zero=True,
                        weights=[embedding_weights],
                        input_length=input_length))  # Adding Input Length
    model.add(LSTM(units=50, activation='tanh', recurrent_activation='hard_sigmoid'))  # Updated LSTM
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))  # Dense=>全连接层, 输出维度=3
    model.add(Activation('softmax'))

    print('Compiling the Model...')
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam', metrics=['accuracy',f1_score])

    print("Train...")  # batch_size=32
    model.fit(x_train, y_train, batch_size=batch_size, epochs=n_epoch, verbose=1)

    print("Evaluate...")
    score = model.evaluate(x_test, y_test, batch_size=batch_size)

    # Save the model directly (no YAML needed)
    model.save('lstm_model.h5')  # Save the whole model
    print('Test score:', score)

# If you need to load the model later, use:
# model = load_model('../model/lstm_model.h5')

print('Setting up Arrays for Keras Embedding Layer...')
n_symbols, embedding_weights, x_train, y_train, x_test, y_test = get_data(index_dict, word_vectors, combined, y)
print("x_train.shape and y_train.shape:")
print(x_train.shape, y_train.shape)
train_lstm(n_symbols, embedding_weights, x_train, y_train, x_test, y_test)


Setting up Arrays for Keras Embedding Layer...
x_train.shape and y_train.shape:
(16870, 100) (16870, 3)
Defining a Simple Keras Model...
Compiling the Model...
Train...
Epoch 1/10
[1m528/528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 59ms/step - accuracy: 0.5453 - f1_score: 0.4119 - loss: 0.9786
Epoch 2/10
[1m528/528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 58ms/step - accuracy: 0.7443 - f1_score: 0.6812 - loss: 0.8058
Epoch 3/10
[1m528/528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 58ms/step - accuracy: 0.8512 - f1_score: 0.8545 - loss: 0.7019
Epoch 4/10
[1m528/528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 58ms/step - accuracy: 0.8899 - f1_score: 0.8890 - loss: 0.6631
Epoch 5/10
[1m528/528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 58ms/step - accuracy: 0.8724 - f1_score: 0.8767 - loss: 0.6810
Epoch 6/10
[1m528/528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 58ms/step - accuracy: 0.8960 - f1_score: 0.8979 

In [37]:
import jieba
import numpy as np
from gensim.models.word2vec import Word2Vec
from gensim.corpora.dictionary import Dictionary
from keras.preprocessing import sequence
from tensorflow.keras.models import load_model  # Updated import

np.random.seed(1337)  # For Reproducibility
import sys
sys.setrecursionlimit(1000000)

# Define parameters
maxlen = 100

def create_dictionaries(model=None, combined=None):
    ''' Function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        # Use model.wv instead of model.vocab
        gensim_dict.doc2bow(model.wv.index_to_key, allow_update=True)  # Updated line
        # freqxiao10->0 so k+1
        w2indx = {v: k+1 for k, v in gensim_dict.items()}  # All frequent words indexed
        w2vec = {word: model.wv[word] for word in w2indx.keys()}  # Word vectors for frequent words

        def parse_dataset(combined):  # closure for temporary use
            ''' Words become integers '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)  # freqxiao10->0
                data.append(new_txt)
            return data  # Word => Index
        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(combined, maxlen=maxlen)  # Pad sequences
        return w2indx, w2vec, combined
    else:
        print('No data provided...')


def input_transform(string):
    words = jieba.lcut(string)
    words = np.array(words).reshape(1, -1)
    model = Word2Vec.load('/kaggle/working/Word2vec_model.pkl')
    _, _, combined = create_dictionaries(model, words)
    return combined


def lstm_predict(string):
    #print('Loading model......')
    model = load_model('/kaggle/working/lstm_model.h5')  # Load the model

    #print('Preparing input data......')
    data = input_transform(string)
    data = data.reshape(1, -1)

    #print('Predicting...')
    result = model.predict(data)  # Make prediction
    predicted_class = np.argmax(result, axis=1)  # Get the class with the highest probability

    # print(result)  # Display the prediction probabilities
    if predicted_class[0] == 1:
        print(string, 'positive')
    elif predicted_class[0] == 0:
        print(string, 'neutral')
    else:
        print(string, 'negative')
    return predicted_class[0]


In [59]:
string = "但是内容实在没意思"
# string = "真的一般，没什么可以学习的"

lstm_predict(string)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 254ms/step
但是内容实在没意思 neutral


0

In [44]:
# string='酒店的环境非常好，价格也便宜，值得推荐'
# string='手机质量太差了，傻逼店家，赚黑心钱，以后再也不会买了'
# string = "这是我看过文字写得很糟糕的书，因为买了，还是耐着性子看完了，但是总体来说不好，文字、内容、结构都不好"
# string = "虽说是职场指导书，但是写的有点干涩，我读一半就看不下去了！"
# string = "书的质量还好，但是内容实在没意思。本以为会侧重心理方面的分析，但实际上是婚外恋内容。"
# string = "不是太好"
# string = "不错不错"




# Initialize counters for each category
# positive_count = 0
# neutral_count = 0
# negative_count = 0

# # Load the CSV file
# text = pd.read_csv('/kaggle/input/ddddddd/comments1.csv', header=None, index_col=None)

# # Extract comments from the second column (index 1)
# comments = text.iloc[:, 1]

# # Iterate over each comment and predict using lstm_predict
# for comment in comments:
#     #print(f"Predicting for comment: {comment}")
#     result = lstm_predict(comment)  # Assuming lstm_predict returns the prediction result

#     if result == 1:
#         positive_count += 1
#     elif result == 0:
#         neutral_count += 1
#     else:
#         negative_count += 1

# # Print the statisticsprint("\nClassification Statistics:")
# print(f"Positive: {positive_count}")
# print(f"Neutral: {neutral_count}")
# print(f"Negative: {negative_count}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 231ms/step
但是内容实在没意思 neutral


0