In [1]:
import numpy as np
import pandas as pd
import sklearn
import tensorflow
import keras
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
import gensim
#
# Imports below are required by later cells (text cleaning, label encoding,
# splitting, tokenization, model building, evaluation); the ones that remain
# commented out are not referenced anywhere in the notebook.
import demoji
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
#from tensorflow.keras import models
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Flatten
from keras.layers import Dense
from tensorflow.keras import layers
#from tensorflow.keras import callbacks
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
#from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.feature_extraction.text import TfidfTransformer
from tensorflow.keras.preprocessing.text import Tokenizer
#from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

^C
Note: you may need to restart the kernel to use updated packages.


In [None]:
# --- Notebook configuration -------------------------------------------------
NB_WORDS = 10000  # Number of words we'll put in the tokenizer dictionary
NB_EPOCHS = 5  # Number of epochs we usually start to train with
BATCH_SIZE = 32  # Size of the batches used in the mini-batch gradient descent
MAX_LEN = 100  # Maximum number of words in a sequence
# Characters the tokenizer treats as separators/noise. The previous value had
# `{"}` where `{|}` was intended (so `|` was never filtered and `"` was listed
# twice) and relied on the invalid escape `\]`; the backslash is now escaped
# explicitly, which yields the same backslash character without a warning.
FILTER_STRING = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
EMBEDDING_SIZE = 100  # Size of the word embedding
PATIENCE = 10  # Patience level
DROP_RATE = 0.4  # Dropout rate

In [None]:
# Cache for the stop-word set so the NLTK corpus is read once, not once per row.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them on first use."""
    if "english" not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORDS_CACHE["english"]


def transformText(text):
    """Clean one raw text string and return its stemmed, normalized form.

    Steps, in order: remove emoji, lowercase, collapse whitespace, drop
    English stop words, strip punctuation, strip digits, drop words shorter
    than 3 characters, collapse whitespace again, and stem.

    Parameters
    ----------
    text : str
        Raw text. Missing values (``None`` or a float NaN, as pandas uses for
        empty cells) are accepted and yield an empty string instead of raising.

    Returns
    -------
    str
        The preprocessed text, or ``""`` for a missing value.
    """
    # Missing cell: nothing to clean. `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    preprocessing = gensim.parsing.preprocessing
    stops = _english_stopwords()

    # Delete emoji
    text = demoji.replace(text, "")
    # Convert text to lowercase
    text = text.lower()
    # Strip multiple whitespaces
    text = preprocessing.strip_multiple_whitespaces(text)
    # Remove stop words. Matching is done on whitespace-separated tokens before
    # punctuation is stripped, so a token with punctuation attached (e.g.
    # "the,") is not recognised as a stop word at this point.
    text = " ".join(word for word in text.split() if word not in stops)
    # Remove the punctuation
    text = preprocessing.strip_punctuation(text)
    # Strip all the numerics
    text = preprocessing.strip_numeric(text)
    # Remove all the words with < 3 characters
    text = preprocessing.strip_short(text, minsize=3)
    # Strip multiple whitespaces left behind by the removals above
    text = preprocessing.strip_multiple_whitespaces(text)
    # Stemming
    return preprocessing.stem_text(text)

In [None]:
# Read the training CSV (path is relative to the notebook's working directory).
dataset = pd.read_csv(filepath_or_buffer='train_40k.csv')
# Bare last expression so the notebook renders the frame.
dataset

In [None]:
# Exploratory analysis: print a titled summary for each aspect of the frame.
# Each call prints the title and the value on separate lines (sep='\n').
print('\nData shape', dataset.shape, sep='\n')
print('\nData describe', dataset.describe(), sep='\n')
print('\nData dtypes', dataset.dtypes, sep='\n')
print('\nCount Nan', dataset.isna().sum(), sep='\n')
# `Cat1` is the target variable: list its distinct values, then their counts.
print('\nVerify values of variable target', dataset['Cat1'].unique(), sep='\n')
print('\nCount values of variable target', dataset['Cat1'].value_counts(), sep='\n')

In [None]:
# Columns removed before modelling; only `Text` and `Cat1` are used afterwards.
features_to_drop = [
    'Title',
    'userId',
    'Helpfulness',
    'Score',
    'Time',
    'Cat2',
    'Cat3',
]
# NOTE: re-running this cell after the columns are gone raises KeyError.
dataset = dataset.drop(columns=features_to_drop)

In [None]:
# Run every entry of the `Text` column through transformText and keep the
# cleaned result under the same column name.
dataset = dataset.assign(Text=dataset['Text'].map(transformText))
# Quick look at the first cleaned rows.
print(dataset['Text'].head(5))

In [None]:
# Number of distinct values in the target column `Cat1` (NaN is not counted).
numero_classi = dataset['Cat1'].nunique(dropna=True)
numero_classi

In [None]:
# Fit the encoder on the target column, then replace the column with the
# encoded values. The fitted encoder is kept so `classes_` can be used later
# to translate encoded ids back to the original labels.
label_encoder = LabelEncoder().fit(dataset['Cat1'])
dataset['Cat1'] = label_encoder.transform(dataset['Cat1'])

In [None]:
# First split: hold out 10% of the data as the test set.
X_trainAll, X_test, y_trainAll, y_test = train_test_split(
    dataset['Text'],
    dataset['Cat1'],
    test_size=0.10,
    random_state=10,
)
print("Training Sample Size:", len(X_trainAll), ' ', "Test Sample Size:", len(X_test))

# Second split: take 20% of the remaining data as the validation set.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_trainAll,
    y_trainAll,
    test_size=0.20,
    random_state=10,
)
print("Training Sample Size:", len(X_train), ' ', "Validation Sample Size:", len(X_valid))

In [None]:
# Text tokenization: the tokenizer is fitted on the training split only, then
# the same fitted tokenizer converts all three splits to integer sequences.
tokenizer = Tokenizer(
    num_words=NB_WORDS,
    filters=FILTER_STRING,
    oov_token="<OOV>",
)
tokenizer.fit_on_texts(X_train)

# Size of the fitted word index.
# NOTE(review): `word_index` presumably holds every token seen while fitting,
# not only the NB_WORDS most frequent ones — confirm against the Keras docs.
voc_len = len(tokenizer.word_index)

X_train_seq, X_valid_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_valid, X_test)
)
voc_len

In [None]:
# Sequence padding: bring every split to the same length. The length comes
# from the MAX_LEN config constant instead of a repeated literal, so changing
# the constant is enough to change the padded length.
X_train_padded = pad_sequences(X_train_seq, maxlen=MAX_LEN)
X_valid_padded = pad_sequences(X_valid_seq, maxlen=MAX_LEN)
X_test_padded = pad_sequences(X_test_seq, maxlen=MAX_LEN)

In [None]:
# Model construction: embedding -> dropout -> flatten -> dense -> dropout ->
# softmax classifier.
model = Sequential()
# voc_len + 1 rows: one per entry of the tokenizer word index plus index 0,
# which the padded sequences use as filler. Embedding width and input length
# come from the config constants rather than hard-coded literals.
model.add(Embedding(voc_len + 1, EMBEDDING_SIZE, input_length=MAX_LEN))
model.add(layers.Dropout(DROP_RATE))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(layers.Dropout(DROP_RATE))
# One output unit per class. The label encoder produces ids 0..numero_classi-1,
# so the previous `numero_classi + 1` added an output that matches no label.
model.add(Dense(numero_classi, activation='softmax'))
# Targets are integer class ids, hence the sparse categorical loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Model training: fit on the training split and evaluate on the validation
# split after each epoch. `history` keeps the per-epoch metrics for plotting.
history = model.fit(
    x=X_train_padded,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=NB_EPOCHS,
    validation_data=(X_valid_padded, y_valid),
)

In [None]:
# Accuracy on the training split. The label says which split is reported;
# both results were previously printed under the same "Accuracy:" label.
loss, accuracy = model.evaluate(X_train_padded, y_train)
print(f'Train accuracy: {accuracy*100:.2f}%')

# Accuracy on the held-out test split (`loss`/`accuracy` end up holding these).
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f'Test accuracy: {accuracy*100:.2f}%')

# Plot the per-epoch metrics recorded during training.
# `plt` comes from the notebook's import cell.
pd.DataFrame(history.history).plot()
plt.grid(True)
plt.show()

In [None]:
# Predicted class id for each test sample = index of the largest softmax output.
y_pred = np.argmax(model.predict(X_test_padded), axis=-1)

# Pin the matrix rows/columns to the encoder's classes so its shape and order
# always agree with the tick labels below, even if a class is missing from the
# test split or from the predictions.
class_ids = np.arange(len(label_encoder.classes_))
confusion_mtx = confusion_matrix(y_test, y_pred, labels=class_ids)

plt.figure(figsize=(10,8))
sns.heatmap(confusion_mtx, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_encoder.classes_,
            yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()