In [1]:

#necessary packages import
import pandas as pd
import numpy as np
import re
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
import nltk
import gensim
from gensim import corpora, models, similarities
from scipy import sparse

In [2]:
#reading the excel file containing the dataset
df = pd.read_csv("../bible_data_set/bible_data_set.csv")
new = ['Matthew', 'Mark', 'Luke', 'John', 'Acts', 'Romans', '1 Corinthians',
            '2 Corinthians', 'Galatians', 'Ephesians', 'Philippians', 'Colossians',
            '1 Thessalonians', '2 Thessalonians', '1 Timothy', '2 Timothy', 'Titus', 'Philemon',
            'Hebrews', 'James', '1 Peter', '2 Peter', '1 John', '2 John', '3 John', 'Jude',
            'Revelation']
df['class'] = np.where(df['book'].isin(new), 1, 0)
df = df[['class','text']]
df.columns = ['label', 'data']

In [3]:
#data cleansing
def cleansing(x):
    x = x.lower()
    x = re.sub(r"http\S+", "", x)
    x = re.sub(r'[^\w\s]', '', x)
    x = " ".join(filter(lambda x: x[0] != '@' and x[0]!= '#', x.split()))
    x = " ".join(filter(lambda x: x.isalpha() , x.split()))
    x = re.sub(' +', ' ', x)
    return x.strip()
df['data'] = df['data'].apply(lambda x: cleansing(x))

In [4]:
#separate the feature set and label
df_x = df['data']
df_y = df['label']

In [5]:
#word2vec
x=df['data'].values.tolist()
corpus = x
tok_corp= [nltk.word_tokenize(sent) for sent in corpus]
model = gensim.models.Word2Vec(tok_corp, min_count=1, size = 32, seed = 45)

In [6]:
#converting sentence into vectors
df2 = pd.DataFrame(columns=['vec'])
for i,v in df_x.iteritems():
    arr = np.zeros((32,))
    for val in tok_corp[i]:
        arr = arr + model[val]
    df2 = df2.append({'vector': arr}, ignore_index=True)

  


In [7]:
df_z = df2['vector']

In [8]:
#using TFIDF vectorizer for word to vector conversion
cv=TfidfVectorizer(min_df=1,stop_words='english')

In [9]:
print(df2.iloc[0])

vec                                                     NaN
vector    [-1.4965026900172234, -1.8841276578605175, -7....
Name: 0, dtype: object


In [10]:
#splitting the training and testing set
x_train,x_test,y_train,y_test=train_test_split(df_z,df_y,test_size=0.30,random_state=45)

In [11]:
#scaling the feature vectors for optimization
# x_traincv = cv.fit_transform(x_train)
# x_testcv = cv.transform(x_test)
# y_train=y_train.astype('int')
# print((x_testcv.shape))
l1 = []
l2 = []
for iter1 in x_train:
    l1.append(iter1)
for iter2 in x_test:
    l2.append(iter2)
x_traincv = sparse.csr_matrix(l1)
x_testcv = sparse.csr_matrix(l2)
y_train=y_train.astype('int')

In [12]:
#defining the model
svm_clf=svm.SVC(kernel='linear', C = 1.0)
#svm_clf = MultinomialNB()

In [13]:
#loading the data to the model with the label and training
def training():
    global x_traincv,y_train,svm_clf
    svm_clf.fit(x_traincv,y_train)

In [14]:
#accuracy prediction
def testing():
    global svm_clf
    clf1 = open('model/svm.pickle','rb')
    svm_clf = pickle.load(clf1)
    pred=svm_clf.predict(x_testcv)
    actual=np.array(y_test)
    count_0=0
    count_1=0
    for i in range (len(pred)):
        if int(pred[i]) == int(actual.item(i)) and int(actual.item(i)) == 0:
          count_0=count_0+1
        if int(pred[i]) == int(actual.item(i)) and int(actual.item(i)) == 1:
          count_1=count_1+1
    print("The number of samples in the test set is",len(pred))
    print("The accuracy on the test set is---> ")      
    print("Accuracy-->",(count_1 + count_0)*100/len(pred))

In [None]:
#save the model using pickle
def model_saving():
    global svm_clf
    clf = open('model/svm.pickle','wb')
    pickle.dump(svm_clf,clf)
    clf.close()

In [None]:
training()
model_saving()
testing()

In [None]:
#calculating the cosine similarity
print("The cosine distance between two words-->",model.similarity('light', 'dark'))