In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import KFold
import itertools
from tensorflow import keras
from tensorflow.keras import optimizers, Model

### LOADING DATASET

In [2]:
# Column names applied to the CSV; header=0 makes them replace the file's own header row.
names = ['Tweet', 'Label']

# Read the training split, discard every row that has a missing value,
# and renumber the index from 0.
df = (
    pd.read_csv('fnn_train.csv', sep=',', names=names, header=0)
    .dropna(how='any')
    .reset_index(drop=True)
)

# Force every tweet to a unicode string before it is vectorized.
df['Tweet'] = df['Tweet'].values.astype('U')

In [3]:
# Raw tweet text and its label as two parallel NumPy arrays.
X, y = df['Tweet'].to_numpy(), df['Label'].to_numpy()
print(X.shape)

(15212,)


### VECTORIZING DATASET

In [4]:
# Vocabulary size cap handed to CountVectorizer(max_features=...).
MAX_FEATURES = 10_000

In [5]:
# Bag-of-words encoding: build a vocabulary of at most MAX_FEATURES tokens from
# the training tweets, then count token occurrences per tweet.
cv = CountVectorizer(max_features=MAX_FEATURES)
cv.fit(X)

# .toarray() yields a plain ndarray. The previous .todense() returned an
# np.matrix, which NumPy discourages and which always stays 2-D.
# NOTE: the dense result is (n_tweets, MAX_FEATURES) and therefore large.
X_train = cv.transform(X).toarray()

# Later cells expect the feature matrix under the name X. This rebinds X from
# raw text to token counts, so before re-running this cell on its own, re-run
# the cell that extracts X from df.
X = X_train
print('X shape is', X.shape)

X shape is (15212, 10000)


### REMOVING OUTLIERS

In [9]:
# Make sure both are plain ndarrays before outlier detection.
# np.asarray converts an np.matrix to an ndarray but, unlike np.array, does not
# duplicate data that is already held in an array -- the dense feature matrix
# is too large to copy needlessly.
X = np.asarray(X)
y = np.asarray(y)

In [10]:
# Isolation forest over the whole feature matrix, built with a freshly seeded
# RandomState and n_jobs=-1.
clf_Iso = IsolationForest(n_jobs=-1, random_state=np.random.RandomState(42))

# One prediction per row of X; rows predicted as -1 are the ones treated as outliers.
y_Iso_Forest = clf_Iso.fit(X).predict(X)

# Positions of those rows, as a flat list.
result = list(np.where(y_Iso_Forest == -1)[0])

In [11]:
# Drop the rows listed in `result` from the features and, when labels exist,
# from the labels too, so the two stay aligned row-for-row.
#
# Fix: y_removed used to be assigned only in the `else` branch, so a None `y`
# reached `y = y_removed` with that name undefined (NameError) or left over
# from an earlier run.
#
# NOTE: `result` holds row positions in the X that the isolation forest was
# run on. Re-run that cell before re-running this one, otherwise the positions
# no longer refer to the same rows.
X_removed = np.delete(X, result, axis=0)
y_removed = None if y is None else np.delete(y, result, axis=0)
X = X_removed
y = y_removed

In [12]:
# Sanity check after outlier removal: X and y must have the same number of rows.
print(X.shape, y.shape)

(8558, 10000) (8558,)


### SETTING PARAMETERS

In [6]:
# The LSTM layer is declared with input_shape=(look_back, n_features), so the
# 2-D feature matrix is reshaped to (samples, look_back, features). With
# look_back = 1 each tweet becomes a single timestep holding its whole vector.
look_back = 1
num_samples = X.shape[0]
# Read the feature count from the LAST axis. For the 2-D matrix this equals
# X.shape[1], and it stays correct if this cell is re-run once X is already
# 3-D (X.shape[1] would then be look_back and the reshape would fail).
num_features = X.shape[-1]
# np.asarray accepts an np.matrix as well and avoids copying an existing ndarray.
X = np.asarray(X).reshape(num_samples, look_back, num_features)

In [7]:
# Expect (num_samples, look_back, num_features) after the reshape.
print(X.shape)

(15212, 1, 10000)


In [8]:
# Mini-batch size passed to model.fit during training.
batch_size = 128

In [9]:
def create_model(look_back=None, input_nodes=None, activation='relu',
                 optimizer='adam', hidden_layers=2, neurons=400, hidden_units=600):
    """Build and compile the LSTM binary classifier.

    Architecture: one LSTM layer, followed by `hidden_layers` fully connected
    layers, followed by a single sigmoid output unit.

    Args:
        look_back: Timesteps per sample (first entry of the LSTM input shape).
        input_nodes: Features per timestep (second entry of the LSTM input shape).
        activation: Activation used by the hidden Dense layers.
        optimizer: Optimizer handed to `compile`.
        hidden_layers: Number of hidden Dense layers stacked after the LSTM.
        neurons: Units in each hidden Dense layer.
        hidden_units: Units in the LSTM layer.

    Returns:
        The compiled `keras.Sequential` model (binary cross-entropy loss,
        accuracy metric).
    """
    recurrent = keras.layers.LSTM(hidden_units, dropout=0.2,
                                  input_shape=(look_back, input_nodes))
    dense_stack = [keras.layers.Dense(neurons, activation=activation)
                   for _ in range(hidden_layers)]
    head = keras.layers.Dense(1, activation='sigmoid')

    # Same layer order as adding them one by one: LSTM -> hidden stack -> output.
    model = keras.Sequential([recurrent, *dense_stack, head])
    model.compile(loss='binary_crossentropy', optimizer=optimizer,
                  metrics=['accuracy'])
    return model

### EVALUATING MODEL WITH CROSS VALIDATION

In [10]:
epochs = 4  # can change this

# 3-fold cross-validation.
# NOTE(review): shuffling is off, so each fold is one contiguous slice of the
# rows in their current order. If fnn_train.csv is sorted (e.g. by label),
# consider KFold(n_splits=3, shuffle=True, random_state=<fixed seed>) -- confirm.
kf = KFold(n_splits=3, random_state=None)
acc_list = []  # held-out accuracy of each fold

# Placeholders; every fold rebinds these.
X_train = None
X_test = None
y_test = None

# Train a fresh model per fold and score it on that fold's held-out part.
for train_index, test_index in kf.split(X):
    # A new model is built on every iteration. Release the Keras global state
    # left behind by the previous fold's model so memory does not grow with
    # the number of folds.
    keras.backend.clear_session()

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = create_model(look_back=look_back, input_nodes=num_features)
    # The held-out fold is passed as validation_data (per-epoch monitoring) and
    # is also the data the fold is scored on below.
    history = model.fit(X_train, y_train, validation_data=(X_test, y_test),
                        epochs=epochs, batch_size=batch_size)
    print("----Start Evaluating----")
    # evaluate() returns [loss, accuracy] for this model; only accuracy is kept.
    _, acc = model.evaluate(X_test, y_test, verbose=1)
    acc_list.append(acc)
    print("Testing Accuracy:", acc)

# After the loop, `model` and `history` refer to the LAST fold only.
print("Mean testing accuracy:", sum(acc_list) / len(acc_list))

Train on 10141 samples, validate on 5071 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
----Start Evaluating----
Testing Accuracy: 0.6560836
Train on 10141 samples, validate on 5071 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
----Start Evaluating----
Testing Accuracy: 0.6635772
Train on 10142 samples, validate on 5070 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
----Start Evaluating----
Testing Accuracy: 0.67278105
Mean testing accuracy: 0.6641472776730856


### RUNNING MODEL ON VALIDATION DATASET

In [17]:
# Held-out split: read it, discard rows with missing values, renumber the index.
df_val = (
    pd.read_csv('fnn_test.csv', names=names, sep=',', header=0)
    .dropna(how='any')
    .reset_index(drop=True)
)

# Force every tweet to a unicode string before it is vectorized.
df_val['Tweet'] = df_val['Tweet'].values.astype('U')

# Raw text and labels as two parallel NumPy arrays.
X_val, y_val = df_val['Tweet'].to_numpy(), df_val['Label'].to_numpy()

In [18]:
# Number of held-out tweets left after dropping incomplete rows.
print(X_val.shape)

(1058,)


In [19]:
# Encode the held-out tweets with the already-fitted vectorizer `cv`
# (transform only -- the vocabulary is not re-learned here).
# .toarray() yields a plain ndarray; the previous .todense() returned np.matrix.
# NOTE: this rebinds X_val from raw text to counts, so re-running this cell on
# its own requires re-running the cell that loads X_val first.
X_val = cv.transform(X_val).toarray()
print(X_val.shape)

(1058, 10000)


In [20]:
# Reshape the held-out features to (samples, look_back, features), the layout
# the model was built for.
num_samples_val = X_val.shape[0]
# Last axis = feature count. This equals X_val.shape[1] for the 2-D matrix and
# stays correct if this cell is re-run once X_val is already 3-D (X_val.shape[1]
# would then be look_back and the reshape would fail).
num_features_val = X_val.shape[-1]
# np.asarray accepts an np.matrix as well and avoids copying an existing ndarray.
X_val = np.asarray(X_val).reshape(num_samples_val, look_back, num_features_val)
print(X_val.shape)

(1058, 1, 10000)


In [21]:
# Score the held-out split. `model` is whatever the cross-validation loop left
# bound, i.e. the network trained in its last fold. Only accuracy is kept.
_, acc_val = model.evaluate(x=X_val, y=y_val, verbose=1)



In [22]:
# Layer-by-layer overview (output shapes and parameter counts) of the current `model`.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 600)               25442400  
_________________________________________________________________
dense_6 (Dense)              (None, 400)               240400    
_________________________________________________________________
dense_7 (Dense)              (None, 400)               160400    
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 401       
Total params: 25,843,601
Trainable params: 25,843,601
Non-trainable params: 0
_________________________________________________________________
