In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.callbacks import ModelCheckpoint
from keras.layers import BatchNormalization
from keras.preprocessing import text
from keras import utils, callbacks

import numpy as np



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
df = pd.read_csv('drive/MyDrive/PJAIT/ZUM/Twitter_Clustered_Data.csv')

# ETAP 3: NEURAL MODEL

In [4]:
my_data = df.copy()

my_data['Cluster'] = my_data['Cluster'].astype('category')
#my_data["Id"] = my_data['Cluster'].cat.codes


## Since the neural model will last so long I will use just the 50K row of the data

In [5]:
my_data = my_data.sample(50000).reset_index(drop=True)

In [6]:
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=1)
data = my_data['tweet']
target = my_data['Cluster']
for train_index, test_index in sss.split(data, target):
    train_narrative, test_narrative = data[train_index], data[test_index]
    train_product, test_product = target[train_index], target[test_index]

In [7]:
def model_creation(word_count, layer_count, drop_rate):
    max_words = word_count
    tokenize = text.Tokenizer(num_words=max_words, char_level=False)
    tokenize.fit_on_texts(train_narrative) # only fit on train
    x_train = tokenize.texts_to_matrix(train_narrative)
    x_test = tokenize.texts_to_matrix(test_narrative)
  
    encoder = LabelEncoder()
    encoder.fit(train_product)
    y_train = encoder.transform(train_product)
    y_test = encoder.transform(test_product)
  
    num_classes = np.max(y_train) + 1
    y_train = utils.to_categorical(y_train, num_classes)
    y_test = utils.to_categorical(y_test, num_classes)
  
    #MODELLING
    model = Sequential()
    model.add(Dense(layer_count, input_shape=(max_words,)))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(drop_rate))
  
    model.add(Dense(layer_count))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(drop_rate))
  
    model.add(Dense(layer_count))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(drop_rate))
  
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
  
    my_callbacks = [
        callbacks.ModelCheckpoint(f"model_{str(word_count)}_{str(layer_count)}.keras", save_best_only=True)
    ]
    my_callbacks.append(callbacks.EarlyStopping(monitor='val_loss', patience=2))
  
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    history = model.fit(x_train, y_train, batch_size=64, epochs=10, verbose=0, validation_split=0.1, callbacks=my_callbacks)
    score = model.evaluate(x_test, y_test, batch_size=64, verbose=1)
    return score

In [8]:
my_optimality_data = pd.DataFrame()
for word_count in [2000, 5000]:
    for layer_count in [64, 256]:
        score = model_creation(word_count,layer_count,0.5)
        print(f"word_count = {word_count}, layer_count = {layer_count}")
        print(score[1])
        my_optimality_data = pd.concat([my_optimality_data, pd.DataFrame( {"max_word":[word_count],"layer_count":[layer_count],"drop_rate":[0.5], "ACCURACY":[score[1]]})])

word_count = 2000, layer_count = 64
0.9456999897956848
word_count = 2000, layer_count = 256
0.944599986076355
word_count = 5000, layer_count = 64
0.9387999773025513
word_count = 5000, layer_count = 256
0.9376999735832214


In [10]:
my_optimality_data

Unnamed: 0,max_word,layer_count,drop_rate,ACCURACY
0,2000,64,0.5,0.9457
0,2000,256,0.5,0.9446
0,5000,64,0.5,0.9388
0,5000,256,0.5,0.9377


## We see that 2000 max word and 64 layer count gives the best model for our data.