In [1]:
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from gensim.test.utils import get_tmpfile
from gensim.models.doc2vec import TaggedDocument

# numpy
import numpy as np

# classifier
from sklearn.linear_model import LogisticRegression

# random
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
from time import time
import re

In [3]:
train_json=r'E:\iiit\Sem3\Information Extraction And Retrieval\major_project\data\json\train_data.json'
val_json=r'E:\iiit\Sem3\Information Extraction And Retrieval\major_project\data\json\val_data.json'

In [4]:
class LabeledLineSentence():
    def __init__(self,fileName):
        self.fileName = fileName
        
    def __iter__(self):
        df = pd.read_json(self.fileName,orient='table')
        text = df['clean_text'].values
        for idx, doc in tqdm(enumerate(text)):
            doc = self.preprocess(doc)
            yield TaggedDocument(words=doc.split(),tags=[idx])
            
    def preprocess(self, doc):
        doc = re.sub('[^a-z]',' ',doc.lower())
        return doc

In [5]:
iterator=LabeledLineSentence(train_json)

In [6]:
model = Doc2Vec(iterator,min_count=1, vector_size=250, sample=1e-4, negative=6 ,workers=4,epochs=2)

600000it [02:43, 3675.93it/s]
600000it [06:45, 1477.90it/s]
600000it [06:14, 1603.37it/s]


In [7]:
model.train(iterator, total_examples=model.corpus_count, epochs=5)

600000it [06:08, 1629.29it/s]
600000it [06:16, 1595.66it/s]
600000it [06:14, 1601.62it/s]
600000it [06:30, 1534.96it/s]
600000it [06:28, 1544.69it/s]


In [8]:
model.save('doc_2_vec_large.d2v')

# load doc2vec model and transform the data

In [4]:
doc2vec_model=Doc2Vec.load('doc_2_vec_large.d2v')

In [5]:
df_train=pd.read_json(train_json,orient='table')

In [6]:
train_data=[doc2vec_model.infer_vector(sentence.split()) for sentence in tqdm(df_train['clean_text'].values)]

100%|█████████████████████████████████████████████████████████| 600000/600000 [48:09<00:00, 207.66it/s]


In [7]:
df_val=pd.read_json(val_json,orient='table')

In [8]:
val_data=[doc2vec_model.infer_vector(sentence.split()) for sentence in tqdm(df_val['clean_text'].values)]

100%|█████████████████████████████████████████████████████████| 150000/150000 [13:57<00:00, 179.11it/s]


In [13]:
np.save('d2v_train.npy',train_data)
np.save('d2v_test.npy',val_data)

# training

In [1]:
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
from time import time
import re

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.layers import LSTM,GlobalMaxPooling1D,Bidirectional
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_squared_error

In [2]:
x=np.load('d2v_train.npy')

In [3]:
x_test=np.load('d2v_test.npy')

In [4]:
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
labels=np.load('hyperpartisan_large.npy',allow_pickle=True)
le=LabelEncoder()
le_labels=le.fit_transform(labels)

In [5]:
y_test=np.load('hyperpartisan_small.npy',allow_pickle=True)

In [6]:
model = Sequential()
model.add(Bidirectional(LSTM(150, return_sequences=True, dropout=0.1, recurrent_dropout=0.0)))

model.add(Conv1D(filters=64, kernel_size=8, padding='same', activation='relu', 
                 input_shape=(250,1),kernel_initializer= 'glorot_uniform'))
model.add(MaxPooling1D(pool_size=4))
model.add(Dropout(0.4))

model.add(Conv1D(filters=32, kernel_size=8, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(Dropout(0.4))

model.add(Conv1D(filters=32, kernel_size=8, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(Dropout(0.4))

model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=3))
model.add(Dropout(0.4))

model.add(Dense(64,activation='relu'))
model.add(Dropout(0.2))


model.add(Dense(32,activation='relu'))
model.add(Dropout(0.1))

model.add(Dense(16,activation='relu'))
model.add(Dense(2))
model.add(Activation('sigmoid'))
model.compile(optimizer=Adam(learning_rate=1e-4),loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [1]:
# .reshape(600000,250,1)

#X_train.reshape((train_size,classification_dim,1))

In [8]:
batch_size=128

model.fit(x_test.reshape(150000,250,1),le.transform(y_test),validation_split=0.1,shuffle=True, epochs=10,batch_size=batch_size)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x226ff7f4c88>

In [9]:
model.evaluate(x.reshape(600000,250,1),le_labels)



[0.7351899743080139, 0.519361674785614]

In [4]:
from sklearn.ensemble import RandomForestClassifier

In [6]:
clf = RandomForestClassifier(n_estimators=400,max_depth=90,n_jobs=-1,verbose=1)

In [7]:
clf.fit(x,le_labels)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 400 out of 400 | elapsed: 17.4min finished


RandomForestClassifier(max_depth=90, n_estimators=400, n_jobs=-1, verbose=1)

In [9]:
x_test=np.load('d2v_test.npy')

In [10]:
preds=clf.predict(x_test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    3.1s
[Parallel(n_jobs=8)]: Done 400 out of 400 | elapsed:    6.8s finished


In [11]:
actuals=np.load('hyperpartisan_small.npy',allow_pickle=True)

In [12]:
from sklearn.metrics import accuracy_score,f1_score

In [16]:
f1_score(le.transform(actuals),preds)

0.6439221067973042

In [17]:
accuracy_score(le.transform(actuals),preds)

0.5537133333333333