In [1]:
import os
import time
import pickle
import pandas as pd
import numpy as np


import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F

from gensim.utils import simple_preprocess, tokenize
from gensim.summarization.textcleaner import split_sentences
from gensim.parsing import preprocessing
from gensim.models import Doc2Vec
from gensim.corpora import Dictionary


from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, classification_report, f1_score
from sklearn.model_selection import train_test_split

from sklearn.cluster import KMeans
from sklearn import metrics # silhouette
from sklearn.preprocessing import minmax_scale

import matplotlib.pyplot as plt
from multiprocessing import cpu_count

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from tqdm import tqdm

# pd.set_option('display.max_colwidth',1000)

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
# train(), evaluate() and infer() read this module-level name.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Show which device was selected.
print(device)

%matplotlib inline

cuda


In [2]:
# Load the index table from JSON.
# NOTE(review): presumably one row per article with an 'id' column (get_clusters keys on 'id') — confirm.
df = pd.read_json('data/dev/cluster_12_cats_index.json')

In [8]:
# df.drop(['title', 'body', 'subjects', 'date'], axis=1, inplace=True)

In [4]:
# df.to_json('data/dev/cluster_12_cats_index.json')

In [2]:
def get_clusters(labels, df, ids):
    """Look up the cluster label of each requested document id.

    Args:
        labels: array of cluster labels, addressed by the values of `df`'s index.
        df: DataFrame with an 'id' column.
        ids: ids (list-like) whose labels are wanted.

    Returns:
        The entries of `labels` selected for `ids`, in the order of `ids`.
    """
    # Expose the current index as an 'index' column, then key the frame by 'id'.
    by_id = df.reset_index().set_index('id')
    positions = by_id.loc[ids, 'index'].tolist()
    return labels[positions]
    

def visualize_trending(df):
    """Plot, for every category, the number of articles per time step.

    Args:
        df: DataFrame with 'category', 'time' and 'id' columns.

    Draws one line per category on a single figure and shows it.
    """
    fig, ax = plt.subplots(figsize=(20, 10))

    # Group by the column name rather than a one-element list: with a list key,
    # pandas >= 2.0 yields 1-tuples while iterating, which would appear in the
    # legend as "('name',)" instead of the bare category name.
    for name, group in df.groupby('category'):
        g = group.groupby('time')['id'].count()
        ax.plot(g.index, g.values, label=name)

    ax.set(xlabel='time', ylabel='Numbers')
    ax.legend()
    ax.grid()

    plt.show()
    
def get_target(df):
    """Return the category of the first row whose 'event' is not 'noise'."""
    non_noise = df['event'] != 'noise'
    return df.loc[non_noise, 'category'].iloc[0]


# def get_cluster_change(clusters, sample):
#     changes = []
    
#     for i, ids in enumerate(clusters):
#         # merge article index
#         cluster = sample.iloc[ids]
#         counts = cluster['time'].value_counts().sort_index()
#         maj_class = cluster['category'].value_counts(normalize=True).index[0]
#         # diff = times.diff().fillna(0)
#         changes.append((i, maj_class, counts))
        
#     return changes

def get_cluster_change(clusters, sample, target, timeline=100):
    """Summarise each cluster as a per-time-step count vector plus its majority category.

    Args:
        clusters: iterable with one collection of positional row indices into
            `sample` per cluster.
        sample: DataFrame with 'time' and 'category' columns.
        target: category that marks a cluster as a target cluster.
        timeline: number of time steps; any step in ``range(timeline)`` missing
            from a cluster is filled with a zero count.

    Returns:
        A list with one tuple per cluster:
        ``(counts, is_target, maj_class, maj_percent)`` where `counts` is the
        array of row counts ordered by time, `maj_class` is the most frequent
        category, `maj_percent` its share of the cluster, and `is_target` is
        True when `maj_class` equals `target` with a share above 0.5.
        An empty cluster yields ``maj_class=None`` and ``maj_percent=0.0``.
    """
    changes = []

    for ids in clusters:
        cluster = sample.iloc[ids]
        # Count rows per time step, ordered along the timeline.
        counts = cluster['time'].value_counts().sort_index()

        if len(counts) < timeline:
            miss_times = sorted(set(range(timeline)).difference(counts.index))
            fill_values = pd.Series(np.zeros(len(miss_times)), index=miss_times)
            # Series.append was removed in pandas 2.0; pd.concat is the replacement.
            counts = pd.concat([counts, fill_values]).sort_index()

        class_count = cluster['category'].value_counts(normalize=True)
        if class_count.empty:
            # An empty cluster has no majority class and can never be the target.
            maj_class, maj_percent = None, 0.0
        else:
            maj_class = class_count.index[0]
            # Positional access: the Series is indexed by category name, so
            # `class_count[0]` would be treated as a label lookup.
            maj_percent = class_count.iloc[0]

        is_target = bool(maj_class == target and maj_percent > 0.5)

        changes.append((counts.values, is_target, maj_class, maj_percent))

    return changes


In [5]:
#models = kmean_cluster(doc_vecs, df, 2000, n_clusters=20)

In [6]:
#pickle.dump(models, open('data/dev/kmean_2k_models.pkl', 'wb'))

In [3]:
# Load the list of sample frames; the context manager closes the file handle.
# NOTE: pickle can execute arbitrary code on load — only open trusted files.
with open("data/dev/dataset_1_event.pkl", "rb") as dataset_file:
    dataset = pickle.load(dataset_file)

In [22]:
def get_all_freq_change(samples, offset=0, cluster_dir="models/lda_cluster"):
    """Collect the per-cluster frequency summaries of every sample.

    Args:
        samples: sequence of sample DataFrames.
        offset: number added to each sample's position to form the index in
            its cluster file name (``clusters_<position + offset>.pkl``).
        cluster_dir: directory holding the pickled cluster files.

    Returns:
        A flat list of the tuples produced by get_cluster_change, in sample order.

    Side effect:
        Each sample's index is reset in place to 0..n-1.
    """
    all_changes = []

    for i, sample in enumerate(tqdm(samples)):
        # Kept in place so callers see the same re-indexed frames as before.
        sample.reset_index(inplace=True, drop=True)
        target = get_target(sample)

        # NOTE: pickle can execute arbitrary code on load — only read trusted files.
        with open(f"{cluster_dir}/clusters_{i + offset}.pkl", "rb") as cluster_file:
            clusters = pickle.load(cluster_file)

        all_changes.extend(get_cluster_change(clusters, sample, target))

    return all_changes



def scale_x(X):
    """Add one to every value and express each row relative to its first entry."""
    shifted = X + 1
    # Slicing with :1 keeps the column axis so the division broadcasts per row.
    first_column = shifted[:, :1]
    return shifted / first_column

In [18]:
sample = dataset[0]

In [20]:
# Target category of this sample (category of its first non-noise row).
target = get_target(sample)
# NOTE(review): `clustering` is not defined in any visible cell, so this cell
# cannot run on a fresh kernel — confirm where the fitted model comes from.
clusters = get_clusters(clustering.labels_, df, sample['id'])
# NOTE(review): get_clusters returns one label per id, while get_cluster_change
# does `sample.iloc[ids]` for each element of `clusters` — confirm the expected format.
changes = get_cluster_change(clusters, sample, target)

0.9269562388156825
0.9246871444823663
0.9649122807017544
0.7701983551040155
0.3381881991664839
0.8306962025316456
0.7484472049689441
0.8065043604651163
0.3128491620111732
0.7507712290956324
0.8889545186060248
0.818307426597582


In [5]:
# Hold out the first 200 samples for testing and train on the remainder.
# The same boundary is passed as `offset` when the cluster files are loaded.
train_samples = dataset[200:]
test_samples = dataset[:200]

In [23]:
train_data = get_all_freq_change(train_samples, offset=200)

100%|██████████| 1800/1800 [01:58<00:00, 15.18it/s]


In [24]:
test_data = get_all_freq_change(test_samples, offset=0)

100%|██████████| 200/200 [00:13<00:00, 15.19it/s]


In [17]:
test_data[0]

(array([ 85,  82,  83,  79,  77,  86,  92,  76,  83,  99,  92,  83,  89,
         90,  91,  83,  86,  92,  87,  85,  86,  79,  91,  82,  92,  76,
         92,  84,  96,  83,  89,  81,  76,  79,  90,  94,  80,  72,  90,
        100, 103,  88,  95,  72,  95,  85,  88,  82,  84,  92,  90,  87,
         82,  82,  90,  94,  82,  94,  77,  75,  93,  89,  84,  90,  94,
         85,  89,  76,  86,  91,  93,  88,  96,  86,  93,  98,  75,  86,
         80,  74,  83,  77,  84,  88,  94,  92,  93,  85,  83,  92,  85,
         92,  82,  97,  84,  97,  93, 110,  81,  96]),
 False,
 'vaalit',
 0.5980696311616683)

In [25]:
# Each entry is a (counts, is_target, maj_class, maj_percent) tuple; unzip into
# parallel tuples. `c` / `maj_cats` hold the majority category of each cluster;
# the majority share is discarded.
X, y, c, _ = zip(*train_data)
X_test, y_test, maj_cats, _ = zip(*test_data)

In [26]:
# Stack the per-cluster count vectors into 2-D arrays (one row per cluster)
# and turn the label tuples into arrays.
X = np.vstack(X)
y = np.array(y)
X_test = np.vstack(X_test)
y_test = np.array(y_test)

In [27]:
# Randomly oversample the training data with a fixed seed; the test data is left untouched.
ros = RandomOverSampler(random_state=12)
X_resampled, y_resampled = ros.fit_resample(X, y)

In [28]:
# Alternative per-row min-max scaling, kept for reference:
# X_resampled = minmax_scale(X_resampled, axis=1)
# X_test = minmax_scale(X_test, axis=1)
# Apply the same scale_x transform to the training and test features.
# NOTE: the inputs are overwritten, so re-running this cell scales already-scaled data again.
X_resampled = scale_x(X_resampled)
X_test = scale_x(X_test)

In [17]:
X_resampled.shape, y_resampled.shape

((65398, 100), (65398,))

In [27]:
def visualize_change(sample, model, df):
    """Plot the scaled per-time-step counts of every cluster found in `sample`.

    Args:
        sample: DataFrame with 'id', 'time', 'category' and 'event' columns.
        model: fitted clustering model exposing `labels_`.
        df: index DataFrame passed to get_clusters to look up each id's label.

    Each cluster is drawn as one line, shifted upwards by its position so the
    lines do not overlap. Clusters flagged as target are drawn dashed and their
    majority share is printed.
    """
    fig, ax = plt.subplots(figsize=(20, 10))
    target = get_target(sample)

    # get_clusters yields one label per row of `sample`, whereas
    # get_cluster_change expects positional row indices grouped per cluster.
    cluster_labels = np.asarray(get_clusters(model.labels_, df, sample['id']))
    clusters = [np.flatnonzero(cluster_labels == label) for label in np.unique(cluster_labels)]
    changes = get_cluster_change(clusters, sample, target)

    counts, is_target, cats, target_ratio = zip(*changes)
    # Same +1 / first-column scaling that the model inputs receive.
    counts = scale_x(np.array(counts))
    # Take the x axis from the data instead of assuming 100 time steps.
    index = np.arange(counts.shape[1])

    for i, change in enumerate(is_target):
        # Offset each cluster by its position to stack the lines vertically.
        val = counts[i] + i

        if change:
            print(target_ratio[i])
            ax.plot(index, val, ls='--')
        else:
            ax.plot(index, val)

    ax.set(xlabel='Time', ylabel='Counts')
    ax.grid()

    plt.show()


In [28]:
# Pick a random sample to inspect and print its position for reference.
# NOTE(review): 2000 is hard-coded; presumably len(dataset) — confirm.
randid = np.random.randint(2000)
sample = dataset[randid]
print(randid)
# NOTE(review): `clustering` is not defined in any visible cell; the recorded
# output of this cell is a NameError.
visualize_change(sample, clustering, df)

1397


NameError: name 'clustering' is not defined

In [29]:
# A fixed seed keeps the train/validation split reproducible across kernel restarts.
# NOTE(review): the split is made after random oversampling, so duplicated rows can
# end up in both the training and validation sets and inflate the validation scores;
# consider splitting before resampling.
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.1, random_state=12)

In [110]:
X_val.shape

(6540, 100)

In [30]:
class FreqDataset(Dataset):
    """Dataset pairing feature rows with labels, served as float tensors."""

    def __init__(self, X, y):
        # Keep references to the underlying arrays; conversion happens per item.
        self.X, self.y = X, y

    def __len__(self):
        """Number of labelled items."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the (features, label) pair at `idx`, both as float tensors."""
        features = torch.tensor(self.X[idx], dtype=torch.float)
        label = torch.tensor(self.y[idx], dtype=torch.float)
        return features, label


In [31]:
# Wrap each split in a FreqDataset so rows are converted to float tensors on access.
train_set = FreqDataset(X_train, y_train)
val_set = FreqDataset(X_val, y_val)
test_set = FreqDataset(X_test, y_test)

# Only the training loader shuffles. The test loader must keep its order:
# evaluate_pipeline slices the predictions into consecutive per-sample blocks.
train_loader = DataLoader(train_set, batch_size=128, shuffle=True, num_workers=4)
val_loader = DataLoader(val_set, batch_size=128, shuffle=False, num_workers=4)
test_loader = DataLoader(test_set, batch_size=128, shuffle=False, num_workers=4)

In [32]:
class RNN(nn.Module):
    """Bidirectional LSTM followed by two linear layers producing raw logits.

    The LSTM outputs of all time steps are flattened and fed to `fc1`, so the
    model only accepts sequences of exactly `seq_len` steps.
    """

    def __init__(self, seq_len=100, input_size=1, hidden_size=256, output_size=1, dropout=0.3):
        super().__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size

        self.lstm = nn.LSTM(input_size, hidden_size, bidirectional=True, batch_first=True)
        # Two directions, each emitting `hidden_size` features per time step.
        flat_features = seq_len * hidden_size * 2
        self.fc1 = nn.Linear(flat_features, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_seq):
        """Map a batch of sequences to logits of shape (batch, output_size)."""
        n_rows = input_seq.shape[0]

        # Reshape to (batch, steps, input_size) as expected with batch_first=True.
        steps = input_seq.view(n_rows, -1, self.input_size)
        lstm_out, _ = self.lstm(steps)

        # Flatten every time step's output before the dense layers.
        hidden = self.fc1(lstm_out.reshape(n_rows, -1))
        return self.fc2(self.dropout(hidden))

In [33]:
# Build the classifier with dropout disabled and move it to the selected device.
model = RNN(dropout=0).to(device)
optimizer = Adam(model.parameters(), lr=1e-4)
# The model emits raw logits, so the loss takes logits directly.
criterion = nn.BCEWithLogitsLoss()
# Alternative with a positive-class weight, kept for reference:
# criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([10]).to(device))

In [34]:
def train(model, data_loader, optimizer, criterion):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: network to optimise; put into training mode here.
        data_loader: iterable of (inputs, labels) batches.
        optimizer: optimiser stepping the model's parameters.
        criterion: loss taking (outputs, labels).
    """
    model.train()
    running_loss = 0

    for inputs, labels in data_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass; drop the trailing size-1 dimension to match the labels.
        logits = model(inputs).squeeze(1)

        optimizer.zero_grad()
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(data_loader)

def evaluate(model, data_loader, criterion):
    """Evaluate the model on a loader without updating it.

    Args:
        model: network to evaluate; put into eval mode here.
        data_loader: iterable of (inputs, labels) batches.
        criterion: loss taking (outputs, labels).

    Returns:
        ``(mean_loss, y_true, y_pred)`` where `mean_loss` is the mean loss per
        batch, `y_true` is the NumPy array of all labels in loader order, and
        `y_pred` is the boolean NumPy array of predictions (sigmoid >= 0.5).
    """
    model.eval()
    epoch_loss = 0
    # Collect per-batch tensors and concatenate once at the end instead of
    # re-concatenating the running result on every batch.
    pred_batches = []
    label_batches = []

    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            # Drop the trailing size-1 dimension to match the labels.
            outputs = model(inputs).squeeze(1)

            epoch_loss += criterion(outputs, labels).item()

            pred_batches.append(torch.sigmoid(outputs) >= 0.5)
            label_batches.append(labels)

    epoch_loss = epoch_loss / len(data_loader)
    y_pred = torch.cat(pred_batches)
    y_true = torch.cat(label_batches)
    return epoch_loss, y_true.cpu().numpy(), y_pred.cpu().numpy()
    

In [46]:
# Number of passes over the training data.
N_EPOCH = 50
for epoch in range(N_EPOCH):
    
    t0 = time.time()
    
    # One optimisation pass, then score the validation split.
    train_loss = train(model, train_loader, optimizer, criterion)
    val_loss, y_true, y_pred = evaluate(model, val_loader, criterion)
    
    # Wall-clock time of the epoch, in minutes.
    duration = (time.time() - t0) / 60
    
    f1 = f1_score(y_true, y_pred)
    print(f'Epoch: {epoch+1} | Time: {duration:.2f} mins | Train Loss: {train_loss:.3f} | Val Loss: {val_loss:.3f} | F1: {f1:.3f}')
    # print(classification_report(y_true, y_pred))


Epoch: 1 | Time: 0.18 mins | Train Loss: 0.097 | Val Loss: 0.108 | F1: 0.963
Epoch: 2 | Time: 0.18 mins | Train Loss: 0.094 | Val Loss: 0.118 | F1: 0.951
Epoch: 3 | Time: 0.17 mins | Train Loss: 0.091 | Val Loss: 0.111 | F1: 0.966
Epoch: 4 | Time: 0.18 mins | Train Loss: 0.091 | Val Loss: 0.107 | F1: 0.965
Epoch: 5 | Time: 0.18 mins | Train Loss: 0.086 | Val Loss: 0.115 | F1: 0.967
Epoch: 6 | Time: 0.18 mins | Train Loss: 0.084 | Val Loss: 0.149 | F1: 0.939
Epoch: 7 | Time: 0.18 mins | Train Loss: 0.080 | Val Loss: 0.111 | F1: 0.961
Epoch: 8 | Time: 0.18 mins | Train Loss: 0.078 | Val Loss: 0.111 | F1: 0.963
Epoch: 9 | Time: 0.18 mins | Train Loss: 0.078 | Val Loss: 0.104 | F1: 0.967
Epoch: 10 | Time: 0.18 mins | Train Loss: 0.072 | Val Loss: 0.094 | F1: 0.971
Epoch: 11 | Time: 0.18 mins | Train Loss: 0.070 | Val Loss: 0.110 | F1: 0.971
Epoch: 12 | Time: 0.18 mins | Train Loss: 0.069 | Val Loss: 0.113 | F1: 0.971
Epoch: 13 | Time: 0.18 mins | Train Loss: 0.066 | Val Loss: 0.098 | F1: 0

In [45]:
torch.save(model.state_dict(), 'models/rnn/rnn_bi_lda.pt')

In [47]:
test_loss, y_true, y_pred = evaluate(model, test_loader, criterion)

In [32]:
# Use separate names so the test-set y_true / y_pred from the previous cell are not
# overwritten; the confusion matrix, report and pipeline evaluation below use them.
train_loss, train_y_true, train_y_pred = evaluate(model, train_loader, criterion)

In [33]:
test_loss, train_loss

(0.271284740883857, 0.21309925175072358)

In [48]:
confusion_matrix(y_true, y_pred)

array([[3592,  133],
       [  58,  217]])

In [49]:
print(classification_report(y_true, y_pred, digits=4))

              precision    recall  f1-score   support

         0.0     0.9841    0.9643    0.9741      3725
         1.0     0.6200    0.7891    0.6944       275

    accuracy                         0.9523      4000
   macro avg     0.8021    0.8767    0.8343      4000
weighted avg     0.9591    0.9523    0.9549      4000



In [121]:
sum(y_true == True)

368

In [120]:
len(y_true)

4000

In [99]:
sample.head()

Unnamed: 0,id,category,time,event,cluster
0,3-6265210,autot,32,periodic,1
1,3-7781405,autot,34,periodic,19
2,3-9431922,autot,89,periodic,14
3,3-5058573,autot,44,periodic,19
4,3-9155135,autot,98,periodic,3


In [41]:
def get_metrics(df, clusters, y_pred, maj_cats, target):
    """Score the clusters predicted as target for one sample.

    Args:
        df: sample DataFrame with an 'event' column; rows whose event is not
            'noise' count as target rows. The frame is not modified.
        clusters: per-cluster collections of positional row indices into `df`.
        y_pred: per-cluster predictions; entries > 0 mark selected clusters.
        maj_cats: majority category of each cluster.
        target: target category of the sample.

    Returns:
        ``(accuracy, precision, recall)``:
        accuracy  - share of selected clusters whose majority category is `target`;
        precision - mean share of target rows inside the selected clusters;
        recall    - target rows inside the selected clusters divided by all
                    target rows in `df` (0 when `df` has no target rows).
        accuracy and precision are 0 when no cluster is selected.
    """
    accuracy = 0
    precision = 0
    recall = 0

    pred_ids = np.flatnonzero(np.asarray(y_pred) > 0)
    n_cluster = len(pred_ids)

    # Local mask instead of writing a 'target' column into the caller's frame.
    is_target_row = df['event'] != 'noise'
    n_target_rows = int(is_target_row.sum())

    for i in pred_ids:
        cluster_rows = is_target_row.iloc[clusters[i]]

        if maj_cats[i] == target:
            accuracy += 1
        # An empty cluster has no defined target share; skip it for precision.
        if len(cluster_rows) > 0:
            precision += cluster_rows.mean()
        recall += cluster_rows.sum()

    # Average only when at least one cluster was selected.
    if n_cluster:
        accuracy = accuracy / n_cluster
        precision = precision / n_cluster

    # Guard against a sample without any target rows.
    if n_target_rows:
        recall = recall / n_target_rows

    return accuracy, precision, recall


def evaluate_pipeline(samples, labels, preds, maj_cats, offset=0, n_cluster=20,
                      cluster_dir="models/lda_cluster"):
    """Compute per-sample accuracy, precision, recall and F-score of the predictions.

    Args:
        samples: sequence of sample DataFrames.
        labels: flat array of true per-cluster labels (accepted for interface
            compatibility; not used in the computation).
        preds: flat array of per-cluster predictions, `n_cluster` consecutive
            entries per sample, in sample order.
        maj_cats: flat sequence of per-cluster majority categories, laid out like `preds`.
        offset: number added to each sample's position to form the index in
            its cluster file name (``clusters_<position + offset>.pkl``).
        n_cluster: number of clusters per sample.
        cluster_dir: directory holding the pickled cluster files.

    Returns:
        Four arrays of length ``len(samples)``: accuracy, precision, recall and
        F-score. The F-score stays 0 when precision + recall is 0.
    """
    n_samples = len(samples)
    acc = np.zeros(n_samples)
    prc = np.zeros(n_samples)
    rec = np.zeros(n_samples)
    f = np.zeros(n_samples)

    for i, sample in enumerate(tqdm(samples)):
        target = get_target(sample)
        # This sample's block of per-cluster predictions and categories.
        block = slice(i * n_cluster, (i + 1) * n_cluster)
        y_pred = preds[block]
        cats = maj_cats[block]

        # NOTE: pickle can execute arbitrary code on load — only read trusted files.
        with open(f"{cluster_dir}/clusters_{i + offset}.pkl", "rb") as cluster_file:
            clusters = pickle.load(cluster_file)

        acc[i], prc[i], rec[i] = get_metrics(sample, clusters, y_pred, cats, target)
        if (prc[i] + rec[i]) != 0:
            f[i] = 2 * prc[i] * rec[i] / (prc[i] + rec[i])

    return acc, prc, rec, f

In [50]:
accs, precs, recs, f_micro = evaluate_pipeline(test_samples, y_true, y_pred, maj_cats)

100%|██████████| 200/200 [00:03<00:00, 58.59it/s]


In [51]:
# F-score computed from the mean precision and mean recall over all test samples.
f_macro =2 * precs.mean()*recs.mean()/(precs.mean()+recs.mean())
# Display the mean accuracy, precision and recall.
accs.mean(), precs.mean(), recs.mean()

(0.6818333333333334, 0.5236781561546279, 0.43449663001371763)

In [52]:
# Replace any NaN per-sample F-scores with 0 before averaging.
f_micro = np.nan_to_num(f_micro)
# NOTE(review): "f_micro" is the mean of the per-sample F-scores, while "f_macro" is the
# F-score of the mean precision/recall — confirm the names match the intended definitions.
print("f_micro:", f_micro.mean(), "f_macro:", f_macro)

f_micro: 0.4302670723083841 f_macro: 0.4749371353652097


In [44]:
# evaluate stt 

In [46]:
stt_changes[0]

(0,
 'Talous',
 date
 2007-01-01    250
 2007-01-08    261
 2007-01-15    325
 2007-01-22    323
 2007-01-29    368
              ... 
 2008-10-27    345
 2008-11-03    307
 2008-11-10    313
 2008-11-17    304
 2008-11-24    292
 Freq: <pandas._libs.properties.CachedProperty object at 0x7ff1908a8460>, Length: 100, dtype: int64)

In [49]:
def get_stt_changes():
    """Load the pickled STT count series and return them scaled with scale_x.

    Each pickled entry is a tuple whose third element exposes `.values` (the
    counts over time). The result has one row per entry, so it is no longer
    tied to exactly 20 series of 100 steps.

    Returns:
        2-D float array, one scaled count row per pickled entry.
    """
    # NOTE: pickle can execute arbitrary code on load — only read trusted files.
    with open("models/stt/stt_changes.pkl", "rb") as changes_file:
        changes = pickle.load(changes_file)

    # Float rows, matching the dtype of the zero matrix used previously.
    X = np.array([entry[2].values for entry in changes], dtype=float)

    return scale_x(X)
        

In [50]:
stt_changes = get_stt_changes()

In [51]:
stt_changes[0]

array([1.        , 1.0438247 , 1.29880478, 1.29083665, 1.47011952,
       1.34262948, 1.1314741 , 1.10756972, 1.60557769, 1.20318725,
       1.63346614, 1.07171315, 1.36653386, 0.92031873, 1.15537849,
       1.01992032, 1.27888446, 1.05976096, 1.03585657, 0.812749  ,
       1.37450199, 1.12350598, 1.20717131, 1.25099602, 0.94422311,
       0.99601594, 1.11553785, 1.25099602, 0.8685259 , 0.93625498,
       1.36653386, 1.07569721, 1.33864542, 1.02390438, 1.03187251,
       0.86055777, 0.92031873, 1.20318725, 1.13545817, 1.19920319,
       1.2749004 , 1.08366534, 1.5059761 , 1.65737052, 0.9561753 ,
       0.98007968, 1.11155378, 1.10756972, 0.94023904, 1.11553785,
       1.59760956, 0.59760956, 0.89641434, 1.20318725, 1.27091633,
       1.64940239, 1.38645418, 1.25896414, 1.18326693, 0.97609562,
       1.1752988 , 1.27888446, 1.187251  , 0.90438247, 0.84462151,
       0.85657371, 1.03984064, 0.9123506 , 1.05976096, 1.03585657,
       1.03585657, 0.96414343, 1.02390438, 1.07569721, 0.97609

In [60]:
def infer(model, inputs):
    """Predict a boolean label for each input row (sigmoid of the logit >= 0.5).

    Args:
        model: network to run; put into eval mode here.
        inputs: array-like of feature rows.

    Returns:
        Boolean NumPy array with one prediction per row.
    """
    model.eval()

    with torch.no_grad():
        batch = torch.tensor(inputs, dtype=torch.float).to(device)
        print(batch.shape)

        # Drop the trailing size-1 dimension, then threshold the probabilities.
        logits = model(batch).squeeze(1)
        preds = torch.sigmoid(logits) >= 0.5

    return preds.cpu().numpy()

In [61]:
stt_pred = infer(model, stt_changes)

torch.Size([20, 100])


In [62]:
stt_pred

array([False, False, False, False, False, False, False,  True, False,
       False,  True,  True, False, False,  True, False, False, False,
        True, False])