In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import itertools
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import optimizers, Model

### LSTM

In [48]:
# Load the training tweets and their labels; header=0 together with names=
# replaces the file's own header row with these column names.
names = ['Tweet', 'Label']
df = (
    pd.read_csv('train.csv', sep=',', names=names, header=0)
    .dropna(how='any')          # discard rows with a missing tweet or label
    .reset_index(drop=True)     # keep a contiguous 0..n-1 index after the drop
)
# Force the text column to unicode strings before vectorizing.
df["Tweet"] = df['Tweet'].values.astype('U')

X = df['Tweet'].to_numpy()
y = df['Label'].to_numpy()
print(X.shape)

(6420,)


In [49]:
# Upper bound on the vocabulary size kept by the vectorizer.
MAX_FEATURES = 10000

# Learn the vocabulary on the training tweets and encode them as dense term counts.
cv = CountVectorizer(max_features=MAX_FEATURES)
X_train = cv.fit_transform(X).todense()
X = X_train
print('X shape is', X.shape)

X shape is (6420, 10000)


In [50]:
# Convert features (an np.matrix after todense()) and labels to plain ndarrays
# so they can be indexed by the fold index arrays later on.
X, y = np.array(X), np.array(y)

In [None]:
# Fit an Isolation Forest on the feature matrix and score the same rows.
clf_Iso = IsolationForest(random_state=np.random.RandomState(42), n_jobs=-1)
y_Iso_Forest = clf_Iso.fit(X).predict(X)
# Row positions predicted as -1 (the outlier label) are collected in `result`.
result = list(np.where(y_Iso_Forest == -1)[0])

In [None]:
# Drop the rows listed in `result` from the features and, when labels exist,
# from the labels as well, so X and y stay row-aligned.
X_removed = np.delete(X, result, axis=0)
# Previously y_removed was only assigned in the `else` branch, so the
# `y is None` path ended in a NameError at `y = y_removed`; keep None instead.
y_removed = None if y is None else np.delete(y, result, axis=0)
X = X_removed
y = y_removed

In [51]:
# The LSTM input is 3-D: (samples, look_back, features). Each tweet is
# presented as a sequence of length one.
look_back = 1
num_samples, num_features = X.shape[0], X.shape[1]
X = np.array(X).reshape((num_samples, look_back, num_features))

In [52]:
# Sanity check: X should now be (num_samples, look_back, num_features).
print(X.shape)

(6420, 1, 10000)


In [53]:
# Mini-batch size passed to model.fit in the cross-validation loop.
batch_size = 128

In [54]:
def create_model(look_back=None, input_nodes=None, activation='relu',
                 optimizer='adam', hidden_layers=2, neurons=400, hidden_units=600):
    """Build and compile the LSTM binary classifier.

    Args:
        look_back: Sequence length, first entry of the LSTM ``input_shape``.
        input_nodes: Features per time step, second entry of ``input_shape``.
        activation: Activation used by every hidden Dense layer.
        optimizer: Optimizer handed to ``compile``.
        hidden_layers: Number of hidden Dense layers stacked after the LSTM.
        neurons: Width of each hidden Dense layer.
        hidden_units: Number of units in the LSTM layer.

    Returns:
        A compiled ``keras.Sequential`` model with one sigmoid output,
        binary cross-entropy loss and accuracy as its metric.
    """
    stack = [
        keras.layers.LSTM(hidden_units, dropout=0.2,
                          input_shape=(look_back, input_nodes))
    ]
    stack.extend(
        keras.layers.Dense(neurons, activation=activation)
        for _ in range(hidden_layers)
    )
    # Single sigmoid unit: the output is a probability for the positive class.
    stack.append(keras.layers.Dense(1, activation='sigmoid'))

    model = keras.Sequential(stack)
    model.compile(loss='binary_crossentropy', optimizer=optimizer,
                  metrics=['accuracy'])
    return model

In [55]:
epochs = 5 # can change this
kf = KFold(n_splits=3)  # unshuffled, contiguous folds
acc_list = []
# Doing cross validation testing: a fresh model is built for every fold.
# After the loop, `model` is the network trained on the last fold's split.
for train_index, test_index in kf.split(X):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    model = create_model(look_back=look_back, input_nodes=num_features)
    history = model.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=epochs,
        batch_size=batch_size,
    )
    print("----Start Evaluating----")
    # evaluate() returns (loss, accuracy) because the model is compiled with one metric.
    _, acc = model.evaluate(X_test, y_test, verbose=1)
    acc_list.append(acc)
    print("Testing Accuracy:", acc)
print("Mean testing accuracy:", sum(acc_list) / len(acc_list))

Train on 4280 samples, validate on 2140 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
----Start Evaluating----
Testing Accuracy: 0.91261685
Train on 4280 samples, validate on 2140 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
----Start Evaluating----
Testing Accuracy: 0.9205608
Train on 4280 samples, validate on 2140 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
----Start Evaluating----
Testing Accuracy: 0.91728973
Mean testing accuracy: 0.9168224533398946


In [56]:
# Load the validation split and clean it the same way as the training data.
df_val = (
    pd.read_csv('val.csv', names=names, sep=',', header=0)
    .dropna(how='any')
    .reset_index(drop=True)
)
df_val["Tweet"] = df_val['Tweet'].values.astype('U')
X_val = df_val['Tweet'].to_numpy()
y_val = df_val['Label'].to_numpy()

In [57]:
# Encode the validation tweets with the vocabulary already fitted in `cv`
# (transform only, no refit).
X_val = cv.transform(X_val).todense()
print(X_val.shape)

(2140, 10000)


In [58]:
# Give the validation features the same (samples, look_back, features) layout
# the LSTM was trained on.
num_samples_val = X_val.shape[0]
num_features_val = X_val.shape[1]
X_val = np.reshape(np.array(X_val), (num_samples_val, look_back, num_features_val))
print(X_val.shape)
# `model` is whatever the cross-validation loop left behind, i.e. the network
# fitted on the last fold's training split.
_, acc_val = model.evaluate(X_val, y_val, verbose=1)
# Report the score explicitly, as the SVM and LogReg sections do; it was
# previously computed and then discarded.
print('Validation accuracy:', acc_val)

(2140, 1, 10000)


In [59]:
# Flatten the network output to one probability per validation row and
# persist it for the ensemble stage.
lstm_probs = model.predict(X_val).reshape(X_val.shape[0])
np.savetxt(
    'lstm_probs.csv',
    lstm_probs,
    delimiter=',',
    header='probs',
)

In [60]:
# Hard labels: probability of at least 0.5 maps to class 1, otherwise class 0.
lstm_preds = np.greater_equal(lstm_probs, 0.5).astype("int32")
np.savetxt(
    'lstm_preds.csv',
    lstm_preds,
    delimiter=',',
    header='preds',
)

In [61]:
# Confusion matrix of the thresholded LSTM predictions against the validation
# labels (scikit-learn convention: rows are true classes, columns predicted).
# It is reused later to derive the initial fuzzy densities.
lstm_cm = confusion_matrix(np.array(y_val), lstm_preds)
print(lstm_cm) 

[[1029   91]
 [  91  929]]


### SVM

In [16]:
# Reload the raw training data so this section starts from the text again.
names = ['Tweet', 'Label']
df = (
    pd.read_csv('train.csv', sep=',', names=names, header=0)
    .dropna(how='any')
    .reset_index(drop=True)
)
df["Tweet"] = df['Tweet'].values.astype('U')
X = df['Tweet'].to_numpy()
y = df['Label'].to_numpy()
print(X.shape)

(6420,)


In [17]:
# Upper bound on the vocabulary size kept by the vectorizer.
MAX_FEATURES = 10000
# Learn TF-IDF weights on the training tweets and encode them densely.
tfidf = TfidfVectorizer(max_features=MAX_FEATURES)
X_train = tfidf.fit_transform(X).todense()
X = X_train
print('X shape is', X.shape)

X shape is (6420, 10000)


In [18]:
# Convert features (an np.matrix after todense()) and labels to plain ndarrays.
X, y = np.array(X), np.array(y)

In [None]:
# Fit an Isolation Forest on the TF-IDF matrix and score the same rows.
clf_Iso = IsolationForest(random_state=np.random.RandomState(42), n_jobs=-1)
y_Iso_Forest = clf_Iso.fit(X).predict(X)
# Row positions predicted as -1 (the outlier label) are collected in `result`.
result = list(np.where(y_Iso_Forest == -1)[0])

In [None]:
# Drop the rows listed in `result` from the features and, when labels exist,
# from the labels as well, so X and y stay row-aligned.
X_removed = np.delete(X, result, axis=0)
# Previously y_removed was only assigned in the `else` branch, so the
# `y is None` path ended in a NameError at `y = y_removed`; keep None instead.
y_removed = None if y is None else np.delete(y, result, axis=0)
X = X_removed
y = y_removed

In [19]:
# 3-fold cross-validation of a linear SVM. The same estimator object is refit
# on every fold, so after the loop `svm` holds the fit from the last fold.
kf = KFold(n_splits=3)
svm = SVC(C=0.25, kernel='linear', probability=True)
acc_list = []
for train_index, test_index in kf.split(X):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    svm.fit(X_train, y_train)
    print("----Start Evaluating----")
    acc = svm.score(X_test, y_test)
    acc_list.append(acc)
    print("Testing Accuracy:", acc)
print("Mean testing accuracy:", sum(acc_list) / len(acc_list))

----Start Evaluating----
Testing Accuracy: 0.9074766355140187
----Start Evaluating----
Testing Accuracy: 0.9186915887850468
----Start Evaluating----
Testing Accuracy: 0.9121495327102803
Mean testing accuracy: 0.9127725856697819


In [20]:
# Load the validation split and clean it the same way as the training data.
df_val = pd.read_csv('val.csv', names=names, sep=',', header=0)
df_val.dropna(how='any', inplace=True)
df_val.reset_index(drop=True, inplace=True)
df_val["Tweet"] = df_val['Tweet'].values.astype('U')
X_val = df_val['Tweet'].to_numpy()
y_val = df_val['Label'].to_numpy()
# Encode with the TF-IDF vocabulary fitted on the training tweets.
# toarray() gives a plain ndarray; todense() returned an np.matrix, which
# recent scikit-learn releases reject when passed to an estimator.
X_val = tfidf.transform(X_val).toarray()
print(X_val.shape)
# `svm` is the estimator as left by the cross-validation loop (last fold's fit).
acc_val = svm.score(X_val, y_val)
print('Validation accuracy:', acc_val)

(2140, 10000)
Validation accuracy: 0.9065420560747663


In [21]:
# Column 1 of predict_proba is the probability of the second entry in
# svm.classes_; save it for the ensemble stage.
svm_probs = svm.predict_proba(X_val)[:, 1]
np.savetxt(
    'svm_probs.csv',
    svm_probs,
    delimiter=',',
    header='probs',
)
# Hard class predictions, saved alongside the probabilities.
svm_preds = svm.predict(X_val)
np.savetxt(
    'svm_preds.csv',
    svm_preds,
    delimiter=',',
    header='preds',
)

In [22]:
# Confusion matrix of the SVM predictions against the validation labels
# (scikit-learn convention: rows are true classes, columns predicted).
# It is reused later to derive the initial fuzzy densities.
svm_cm = confusion_matrix(np.array(y_val), svm_preds)
print(svm_cm)

[[988 132]
 [ 68 952]]


### LOGREG

In [23]:
# Reload the raw training data so this section starts from the text again.
names = ['Tweet', 'Label']
df = (
    pd.read_csv('train.csv', sep=',', names=names, header=0)
    .dropna(how='any')
    .reset_index(drop=True)
)
df["Tweet"] = df['Tweet'].values.astype('U')
X = df['Tweet'].to_numpy()
y = df['Label'].to_numpy()
print(X.shape)

(6420,)


In [24]:
# Upper bound on the vocabulary size kept by the vectorizer.
MAX_FEATURES = 10000
# Learn TF-IDF weights on the training tweets and encode them densely.
tfidf = TfidfVectorizer(max_features=MAX_FEATURES)
X_train = tfidf.fit_transform(X).todense()
X = X_train
print('X shape is', X.shape)

X shape is (6420, 10000)


In [25]:
# Convert features (an np.matrix after todense()) and labels to plain ndarrays.
X, y = np.array(X), np.array(y)

In [None]:
# Fit an Isolation Forest on the TF-IDF matrix and score the same rows.
clf_Iso = IsolationForest(random_state=np.random.RandomState(42), n_jobs=-1)
y_Iso_Forest = clf_Iso.fit(X).predict(X)
# Row positions predicted as -1 (the outlier label) are collected in `result`.
result = list(np.where(y_Iso_Forest == -1)[0])

In [None]:
# Drop the rows listed in `result` from the features and, when labels exist,
# from the labels as well, so X and y stay row-aligned.
X_removed = np.delete(X, result, axis=0)
# Previously y_removed was only assigned in the `else` branch, so the
# `y is None` path ended in a NameError at `y = y_removed`; keep None instead.
y_removed = None if y is None else np.delete(y, result, axis=0)
X = X_removed
y = y_removed

In [26]:
# Record the matrix dimensions and rebuild X as a fresh 2-D ndarray of that
# same (samples, features) shape.
num_samples, num_features = X.shape[0], X.shape[1]
X = np.array(X).reshape((num_samples, num_features))

In [27]:
C = 7.74 # to be set to the best hyperpara
solver = 'sag' # to be set to the best hyperpara
kf = KFold(n_splits=3)
logistic = LogisticRegression(max_iter=500, C=C, solver=solver)
acc_list = []
# Doing cross validation testing. The same estimator is refit on every fold,
# so after the loop `logistic` holds the fit from the last fold.
for train_index, test_index in kf.split(X):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    logistic.fit(X_train, y_train)
    print("----Start Evaluating----")
    acc = logistic.score(X_test, y_test)
    acc_list.append(acc)
    print("Testing Accuracy:", acc)
print("Mean testing accuracy:", sum(acc_list) / len(acc_list))

----Start Evaluating----
Testing Accuracy: 0.9247663551401869
----Start Evaluating----
Testing Accuracy: 0.9233644859813084
----Start Evaluating----
Testing Accuracy: 0.9261682242990654
Mean testing accuracy: 0.9247663551401869


In [28]:
# Load the validation split and clean it the same way as the training data.
df_val = pd.read_csv('val.csv', names=names, sep=',', header=0)
df_val.dropna(how='any', inplace=True)
df_val.reset_index(drop=True, inplace=True)
df_val["Tweet"] = df_val['Tweet'].values.astype('U')
X_val = df_val['Tweet'].to_numpy()
y_val = df_val['Label'].to_numpy()
# Encode with the TF-IDF vocabulary fitted on the training tweets.
# toarray() gives a plain ndarray; todense() returned an np.matrix, which
# recent scikit-learn releases reject when passed to an estimator.
X_val = tfidf.transform(X_val).toarray()
print(X_val.shape)
# `logistic` is the estimator as left by the cross-validation loop (last fold's fit).
acc_val = logistic.score(X_val, y_val)
print('Validation accuracy:', acc_val)

(2140, 10000)
Validation accuracy: 0.9177570093457944


In [29]:
# Column 1 of predict_proba is the probability of the second entry in
# logistic.classes_; save it for the ensemble stage.
logreg_probs = logistic.predict_proba(X_val)[:, 1]
np.savetxt(
    'logreg_probs.csv',
    logreg_probs,
    delimiter=',',
    header='probs',
)
# Hard class predictions, saved alongside the probabilities.
logreg_preds = logistic.predict(X_val)
np.savetxt(
    'logreg_preds.csv',
    logreg_preds,
    delimiter=',',
    header='preds',
)

In [30]:
# Confusion matrix of the logistic-regression predictions against the
# validation labels (scikit-learn convention: rows true, columns predicted).
# It is reused later to derive the initial fuzzy densities.
lr_cm = confusion_matrix(np.array(y_val), logreg_preds)
print(lr_cm)

[[1019  101]
 [  75  945]]


# CHOQUET INTEGRAL ENSEMBLE 

### Reading probability files

In [62]:
# Read back the per-model validation probabilities as flat float32 vectors.
lstm_probs, svm_probs, lr_probs = (
    pd.read_csv(csv_path, header=0).to_numpy().astype('float32').squeeze()
    for csv_path in ('lstm_probs.csv', 'svm_probs.csv', 'logreg_probs.csv')
)
print(lstm_probs, svm_probs, lr_probs)

[9.9999428e-01 9.2500099e-04 9.9949324e-01 ... 9.9434763e-01 7.6771930e-02
 7.1197097e-09] [9.9526185e-01 1.1727647e-01 9.9278378e-01 ... 9.3014431e-01 8.9106023e-01
 6.6690394e-05] [9.9234599e-01 1.5449223e-01 9.8246729e-01 ... 8.8683003e-01 6.7221695e-01
 4.1882295e-04]


### Creating confidence correction vector

In [63]:
# Confidence = |2p - 1|: 0 when the probability is 0.5, 1 when it is 0 or 1.
lstm_conf, svm_conf, lr_conf = (
    np.absolute(2*model_probs-1)
    for model_probs in (lstm_probs, svm_probs, lr_probs)
)

### Calculating initial fuzzy density

In [64]:
#class 0
# Initial fuzzy density = diagonal count squared over (column total * row total),
# i.e. the product of class-0 precision and recall of each model.
ifd_lstm_0 = lstm_cm[0, 0] ** 2 / (lstm_cm[:, 0].sum() * lstm_cm[0, :].sum())
print(ifd_lstm_0)
ifd_svm_0 = svm_cm[0, 0] ** 2 / (svm_cm[:, 0].sum() * svm_cm[0, :].sum())
print(ifd_svm_0)
ifd_lr_0 = lr_cm[0, 0] ** 2 / (lr_cm[:, 0].sum() * lr_cm[0, :].sum())
print(ifd_lr_0)

0.8441015625
0.8253382034632034
0.8474479302689998


In [65]:
#class 1
# Initial fuzzy density = diagonal count squared over (column total * row total),
# i.e. the product of class-1 precision and recall of each model.
ifd_lstm_1 = lstm_cm[1, 1] ** 2 / (lstm_cm[:, 1].sum() * lstm_cm[1, :].sum())
print(ifd_lstm_1)
ifd_svm_1 = svm_cm[1, 1] ** 2 / (svm_cm[:, 1].sum() * svm_cm[1, :].sum())
print(ifd_svm_1)
ifd_lr_1 = lr_cm[1, 1] ** 2 / (lr_cm[:, 1].sum() * lr_cm[1, :].sum())
print(ifd_lr_1)

0.8295280661284121
0.819680196801968
0.8370121471150602


### Vectorizing ifd and conf

In [66]:
# Stack the initial fuzzy densities: row 0 is class 0, row 1 is class 1;
# columns follow the model order LSTM, SVM, LogReg.
ifd = np.array([[ifd_lstm_0, ifd_svm_0, ifd_lr_0], [ifd_lstm_1, ifd_svm_1, ifd_lr_1]])
print(ifd.shape)

(2, 3)


In [67]:
# One row per model (LSTM, SVM, LogReg); the transpose shown below has one
# row per validation sample and one column per model.
conf = np.array([lstm_conf, svm_conf, lr_conf])
conf.T

array([[0.99998856, 0.9905237 , 0.984692  ],
       [0.99815   , 0.7654471 , 0.69101554],
       [0.9989865 , 0.98556757, 0.9649346 ],
       ...,
       [0.98869526, 0.8602886 , 0.77366006],
       [0.84645617, 0.78212047, 0.3444339 ],
       [1.        , 0.9998666 , 0.9991624 ]], dtype=float32)

### Calculating adjusted fuzzy density

In [68]:
# Adjusted fuzzy density = per-sample confidence scaled by the class-specific
# initial density; broadcasting multiplies each model column by its density.
sample_conf = conf.T
#for class 0
afd_0 = sample_conf * ifd[0]
#for class 1
afd_1 = sample_conf * ifd[1]
afd_1

array([[0.82951857, 0.81191266, 0.82419915],
       [0.82799343, 0.62742181, 0.5783884 ],
       [0.82868733, 0.80785022, 0.80766197],
       ...,
       [0.82015047, 0.70516155, 0.64756287],
       [0.70215915, 0.64108866, 0.28829536],
       [0.82952807, 0.81957086, 0.83631105]])

### Coefficients of quadratic for calculating lambda

In [69]:
# Per-sample coefficients [a, b, c] (highest power first, as np.roots expects)
# of a*lam**2 + b*lam + c = 0, with g1..g3 the adjusted densities of class 0:
#   a = g1*g2*g3, b = g1*g2 + g2*g3 + g1*g3, c = g1 + g2 + g3 - 1
# NOTE(review): presumably the Sugeno lambda-measure condition
# 1 + lam = prod(1 + lam*g_i) divided by lam -- confirm against the method.
# Row count follows the data instead of a hard-coded 2140.
coef_0 = np.zeros(shape=(afd_0.shape[0], 3))
coef_0[:,0] = afd_0[:,0] * afd_0[:,1] * afd_0[:,2]
coef_0[:,1] = afd_0[:,0] * afd_0[:, 1] + afd_0[:,1] * afd_0[:,2] + afd_0[:,0] * afd_0[:,2]
coef_0[:,2] = afd_0[:,0] + afd_0[:,1] + afd_0[:,2] - 1
coef_0

array([[0.57583754, 2.07663095, 1.49608413],
       [0.3117012 , 1.39562225, 1.05989238],
       [0.56089758, 2.04063265, 1.47440444],
       ...,
       [0.38850489, 1.60524994, 1.2002249 ],
       [0.13462437, 0.85818916, 0.65189868],
       [0.58981771, 2.11006135, 1.51606776]])

In [70]:
# Per-sample coefficients [a, b, c] (highest power first, as np.roots expects)
# of a*lam**2 + b*lam + c = 0, with g1..g3 the adjusted densities of class 1:
#   a = g1*g2*g3, b = g1*g2 + g2*g3 + g1*g3, c = g1 + g2 + g3 - 1
# NOTE(review): presumably the Sugeno lambda-measure condition
# 1 + lam = prod(1 + lam*g_i) divided by lam -- confirm against the method.
# Row count follows the data instead of a hard-coded 2140.
coef_1 = np.zeros(shape=(afd_1.shape[0], 3))
coef_1[:,0] = afd_1[:,0] * afd_1[:,1] * afd_1[:,2]
coef_1[:,1] = afd_1[:,0] * afd_1[:, 1] + afd_1[:,1] * afd_1[:,2] + afd_1[:,0] * afd_1[:,2]
coef_1[:,2] = afd_1[:,0] + afd_1[:,1] + afd_1[:,2] - 1
coef_1

array([[0.55509535, 2.02636285, 1.46563038],
       [0.30047343, 1.36129644, 1.03380365],
       [0.54069354, 1.99122438, 1.44419952],
       ...,
       [0.37451059, 1.566074  , 1.17287489],
       [0.12977508, 0.83739838, 0.63154317],
       [0.56857194, 2.05901667, 1.48540997]])

### Calculating lambda

In [71]:
# Solve each sample's quadratic and keep the largest root: column 0 for
# class 0, column 1 for class 1. Sized from the coefficient table instead of
# a hard-coded 2140 rows.
# NOTE(review): np.roots can return complex roots; this assumes the largest
# root is real for every sample -- confirm for degenerate rows.
lam = np.zeros(shape=(coef_0.shape[0], 2))
for i in range(lam.shape[0]):
    lam[i, 0] = np.max(np.roots(coef_0[i, :]))
    lam[i, 1] = np.max(np.roots(coef_1[i, :]))
print(lam)

[[-0.99492468 -0.99386986]
 [-0.96926492 -0.96495004]
 [-0.9942189  -0.9930698 ]
 ...
 [-0.98023712 -0.97736181]
 [-0.88152216 -0.87201747]
 [-0.99552456 -0.99455682]]


### Getting the fuzzy densities

In [72]:
# Class-0 measure values per sample, one column per nested model subset:
#   col 0: left at 1 by np.ones (used as the measure of all three models)
#   col 1: SVM and LogReg combined, algebraically g2 + g3 + lam*g2*g3
#   col 2: LogReg alone (its adjusted density)
# Sized from afd_0 instead of a hard-coded 2140 rows.
# NOTE(review): a sample with lam exactly 0 would divide by zero here.
fd_0 = np.ones(shape=(afd_0.shape[0], 3))
fd_0[:, 1] = ((1 + afd_0[:,1]*lam[:,0])*(1+lam[:,0]*afd_0[:, 2]) - 1)/lam[:,0]
fd_0[:, 2] = afd_0[:, 2]
fd_0

array([[1.        , 0.97325691, 0.83447518],
       [1.        , 0.85876878, 0.58559969],
       [1.        , 0.96983898, 0.81773182],
       ...,
       [1.        , 0.90934467, 0.65563662],
       [1.        , 0.77130824, 0.2918898 ],
       [1.        , 0.97634135, 0.84673809]])

In [73]:
# Class-1 measure values per sample, one column per nested model subset:
#   col 0: left at 1 by np.ones (used as the measure of all three models)
#   col 1: SVM and LogReg combined, algebraically g2 + g3 + lam*g2*g3
#   col 2: LogReg alone (its adjusted density)
# Sized from afd_1 instead of a hard-coded 2140 rows.
# NOTE(review): a sample with lam exactly 0 would divide by zero here.
fd_1 = np.ones(shape=(afd_1.shape[0], 3))
fd_1[:, 1] = ((1 + afd_1[:,1]*lam[:,1])*(1+lam[:,1]*afd_1[:, 2]) - 1)/lam[:,1]
fd_1[:, 2] = afd_1[:, 2]
fd_1

array([[1.        , 0.97103624, 0.82419915],
       [1.        , 0.85563612, 0.5783884 ],
       [1.        , 0.96756404, 0.80766197],
       ...,
       [1.        , 0.9064254 , 0.64756287],
       [1.        , 0.76821523, 0.28829536],
       [1.        , 0.97419658, 0.83631105]])

### Reshaping probabilities

In [74]:
# Turn each probability vector into a single column. -1 lets NumPy infer the
# row count, so this no longer assumes exactly 2140 validation samples.
lstm_probs = lstm_probs.reshape((-1, 1))
svm_probs = svm_probs.reshape((-1, 1))
lr_probs = lr_probs.reshape((-1, 1))

### Calculating the Choquet Integral

In [75]:
# Aggregate the three models per sample. Column 0 scores class 0 from the
# complementary probabilities (1 - p); column 1 scores class 1 from p.
# Each model's value is weighted by the difference between consecutive
# measure values (1 -> fd[:,1] -> fd[:,2] -> 0).
# Sized from the probability vector instead of a hard-coded 2140 rows.
# NOTE(review): the models are always taken in the fixed order LSTM, SVM,
# LogReg; a textbook Choquet integral sorts the values per sample before
# pairing them with nested subsets -- confirm the fixed order is intended.
ci = np.zeros(shape=(lstm_probs.shape[0], 2))
ci[:,0] = (1-lstm_probs[:,0])*(1-fd_0[:,1]) + (1-svm_probs[:,0])*(fd_0[:,1]-fd_0[:,2]) + (1-lr_probs[:,0])*(fd_0[:,2])
ci[:,1] = (lstm_probs[:,0])*(1-fd_1[:,1]) + (svm_probs[:,0])*(fd_1[:,1]-fd_1[:,2]) + (lr_probs[:,0])*(fd_1[:,2])
ci

array([[7.04480437e-03, 9.92995668e-01],
       [8.77362429e-01, 1.22004683e-01],
       [1.54499742e-02, 9.84669175e-01],
       ...,
       [9.24337433e-02, 9.08103392e-01],
       [3.59038914e-01, 6.39229105e-01],
       [9.99636706e-01, 3.59462084e-04]])

### Saving and comparing predictions

In [76]:
# The predicted class is the column (0 or 1) with the larger aggregated score.
preds = ci.argmax(axis=1).astype('int')
np.savetxt(
    'choquet_preds.csv',
    preds,
    delimiter=',',
    header='Choquet Preds',
)

In [77]:
# Reload the validation labels with the same cleaning (dropna + reset_index)
# that was applied when the predictions were generated, so the label vector
# stays row-aligned with the saved predictions.
df_val = (
    pd.read_csv('val.csv', names=['Tweet', 'Label'], sep=',', header=0)
    .dropna(how='any')
    .reset_index(drop=True)
)
true_preds = df_val['Label'].to_numpy().astype('int32')
# Each prediction file holds a single column; ravel() flattens the (n, 1)
# array so the metric functions receive 1-D label vectors.
lstm_preds = pd.read_csv('lstm_preds.csv', header=0).to_numpy().astype('int32').ravel()
svm_preds = pd.read_csv('svm_preds.csv', header=0).to_numpy().astype('int32').ravel()
lr_preds = pd.read_csv('logreg_preds.csv', header=0).to_numpy().astype('int32').ravel()

In [78]:
# Accuracy and per-class metrics of the Choquet ensemble predictions.
print('ACCURACY:',accuracy_score(true_preds, preds), end='\n\n')
print(classification_report(true_preds, preds))

ACCURACY: 0.9210280373831776

              precision    recall  f1-score   support

           0       0.93      0.92      0.92      1120
           1       0.91      0.92      0.92      1020

    accuracy                           0.92      2140
   macro avg       0.92      0.92      0.92      2140
weighted avg       0.92      0.92      0.92      2140



In [79]:
# Accuracy and per-class metrics of the LSTM alone, for comparison with the ensemble.
print('ACCURACY:',accuracy_score(true_preds, lstm_preds), end='\n\n')
print(classification_report(true_preds, lstm_preds))

ACCURACY: 0.9149532710280374

              precision    recall  f1-score   support

           0       0.92      0.92      0.92      1120
           1       0.91      0.91      0.91      1020

    accuracy                           0.91      2140
   macro avg       0.91      0.91      0.91      2140
weighted avg       0.91      0.91      0.91      2140



In [80]:
# Accuracy and per-class metrics of the SVM alone, for comparison with the ensemble.
print('ACCURACY:',accuracy_score(true_preds, svm_preds), end='\n\n')
print(classification_report(true_preds, svm_preds))

ACCURACY: 0.9065420560747663

              precision    recall  f1-score   support

           0       0.94      0.88      0.91      1120
           1       0.88      0.93      0.90      1020

    accuracy                           0.91      2140
   macro avg       0.91      0.91      0.91      2140
weighted avg       0.91      0.91      0.91      2140



In [81]:
# Accuracy and per-class metrics of logistic regression alone, for comparison with the ensemble.
print('ACCURACY:',accuracy_score(true_preds, lr_preds), end='\n\n')
print(classification_report(true_preds, lr_preds))

ACCURACY: 0.9177570093457944

              precision    recall  f1-score   support

           0       0.93      0.91      0.92      1120
           1       0.90      0.93      0.91      1020

    accuracy                           0.92      2140
   macro avg       0.92      0.92      0.92      2140
weighted avg       0.92      0.92      0.92      2140

