In [3]:
import numpy as np
import pandas as pd

In [4]:
# loading the processed_data from txt file
input_data = np.loadtxt('in_train.txt', dtype=float)
output_data = np.loadtxt('out_train.txt', dtype=float)

In [5]:
in_data = input_data[:, :5]
print(in_data.shape)

(53447, 5)


In [6]:
out_data = output_data.argmax(axis=1).reshape(-1,1)
print(out_data.shape)

(53447, 1)


In [7]:
# shuffling the data 
final_data = np.concatenate((in_data, out_data), axis = 1)  # merging the input n expected output
print(final_data.shape)
np.random.shuffle(final_data)  # shuffling the data

(53447, 6)


In [8]:
X = final_data[:, :5]
Y = final_data[:, -1:]

In [9]:
Y.shape

(53447, 1)

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

In [12]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(35809, 5)
(17638, 5)
(35809, 1)
(17638, 1)


In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

clf = DecisionTreeClassifier(max_depth=None, min_samples_split=2,random_state=0)
clf_rand = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
clf = clf.fit(X_train, y_train)
clf_rand = clf_rand.fit(X_train, y_train)

  import sys


In [14]:
y_pred = clf.predict(X_test)
y_pred_rand = clf.predict(X_test)

In [15]:
print(y_pred, y_pred_rand)

[7. 6. 4. ... 3. 4. 7.] [7. 6. 4. ... 3. 4. 7.]


In [16]:
from sklearn.metrics import confusion_matrix

In [17]:
confusion_matrix(y_test.argmax(axis=1), y_pred)

array([[ 939, 1559, 1481,  430, 2549, 5850, 1619, 2649,  562],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0]],
      dtype=int64)

In [18]:
from sklearn.metrics import f1_score

In [19]:
print(f1_score(y_test, y_pred, average='macro'))
print(f1_score(y_test, y_pred_rand, average='macro'))

0.9754224751061807
0.9754224751061807


In [20]:
testing_data = pd.read_csv('test_nvPHrOx.csv')

In [21]:
url_data= testing_data['Url']

In [22]:
import re
def clean_list(input):
    output = []
    for i in input:
        text = re.sub(r"http://", "", i)
        text = re.sub(r"https://", "", text)
        #print(output)
        output.append(text)
    return output

In [23]:
url_data_pro = clean_list(url_data)

In [24]:
url_data_pro[:4]

['www.isrctn.com/ISRCTN57801413',
 'www.clinicaltrialsregister.eu/ctr-search/trial/2006-006214-16/GB',
 'www.clinicaltrialsregister.eu/ctr-search/trial/2006-004265-34/LT',
 'www.clinicaltrialsregister.eu/ctr-search/trial/2010-022183-12/IT']

In [25]:
from keras.preprocessing import text, sequence 

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [26]:
def createVocab(field):
    #not_found = 'nan'
    vocab = []
    for line in field:
        for i in line.split('/'):
            if i not in vocab:
                vocab.append(i)
    return vocab 
 
# generating seq IDs for words in vocabulary
def genSeqNo(vocabulary):
    i=1.0
    vocab_seq = {}
    for word in vocabulary:
        if word not in vocab_seq:
            vocab_seq[word] = i
            i = i+1
    return vocab_seq

# function to replace words with its uniq seq ID
def word2Seq(data):
    result_seq = []
    #max_seq = list(t_vocab_seq.items())[-1][1]
    seq_len = []
    for line in data:        
        seq = []
        curr_len = 0
        for word in line.split('/'):
            curr_len = curr_len + 1
            if word in t_vocab_seq:
                seq.append(t_vocab_seq[word])
            else:
                seq.append(0)
        result_seq.append(seq)    # to append seqID of line
        seq_len.append(curr_len)  # to find and append length of line
    return result_seq, seq_len

def seqNormalize(seq, data):
    norm_seq = []
    max_seq = max(seq.values())
    norm_seq[:] = [[ele / max_seq for ele in sub] for sub in data]
    return norm_seq


In [27]:
url_vocab = createVocab(url_data_pro)   # creating vocabulary 
t_vocab_seq = genSeqNo(url_vocab)       # generating seqID for each word in vocab
url_seq, seqn_len = word2Seq(url_data_pro)    # replacing word with seqID and a list of each line_length
url_seq_norm = seqNormalize(t_vocab_seq, url_seq)    # normalizing the seqn IDs between 0 n 1

In [28]:
url_seq[:4]

[[1.0, 2.0],
 [3.0, 4.0, 5.0, 6.0, 7.0],
 [3.0, 4.0, 5.0, 8.0, 9.0],
 [3.0, 4.0, 5.0, 10.0, 11.0]]

In [2]:
import matplotlib.pyplot as plt
# plotting frequency distribution of SeqnIDs
plt.hist([url_seq])
plt.xticks(range(15))
plt.show()
# as observed from histogram, taking max_seqn_len as 10

NameError: name 'url_seq' is not defined

In [None]:
# padding seqn to max_seqn_length
#url_seq_pad = sequence.pad_sequences(url_seq_norm, maxlen = max_seq_len, dtype='float32', padding='post')
url_seq_pad = sequence.pad_sequences(url_seq, maxlen = 3, dtype='float32', padding='post')

In [39]:
pred_testing = clf.predict(url_seq_pad)

In [49]:
pred_testing[:4]
print(pred_testing.shape)
test_prediction = np.array(pred_testing, dtype='int64')

(25787,)


In [40]:
# training data
input_data = pd.read_csv('train.csv')
label = np.array(input_data['Tag'])

# one-hot encoding via sklearn
#https://machinelearningmastery.com/how-to-one-hot-encode-sequence-data-in-python/
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(label)

In [50]:
label_decoded = np.array(label_encoder.inverse_transform(test_prediction))

  if diff:


In [None]:
print(label_decoded[:4])
#print(label_decodede)

In [51]:
webid = np.array(testing_data['Webpage_id'])

In [52]:
print(webid.shape)
print(label_decoded.shape)
w = webid.reshape(-1,1)
l = label_decoded.reshape(-1,1)

(25787,)
(25787,)


In [53]:
final_output = np.concatenate((w, l),axis = 1 )

In [54]:
final_output[:4]

array([[31, 'news'],
       [32, 'news'],
       [33, 'news'],
       [34, 'news']], dtype=object)

In [55]:
np.savetxt('final_output.txt', final_output, fmt='%s')

In [29]:
m = [13,1,5]
n = [1,6,3,5,4]

x = [a + b for a, b in zip(m, n)]

In [30]:
print(x)

[14, 7, 8]


In [31]:
s1 = ""
s2 = "testing"
s3 = "123"
result = s1 or s2 or s3

In [32]:
print(result)

testing
