In [0]:
from google.colab import drive
import numpy as np
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
from gensim.test.utils import datapath
from gensim import utils
import os

def getData(path):
  data=[]
  labels = []
  directory = os.fsencode(path)

  for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".deft"): 
      
      filename = path + filename  
      with open(filename) as fp:
        line = fp.readline()
        
        while line:
          l=line.strip()
          labels.append(int(l[len(l)-2]))
          
          line = fp.readline()
          if(len(utils.simple_preprocess(l))==0):
            data.append(utils.simple_preprocess('hamada'))
          else:
            data.append(utils.simple_preprocess(l))
  return data, labels
  
def getDataAsString(path):
  test=[]
  labels = []
  directory = os.fsencode(path)

  for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".deft"): 
      
      filename= path + filename  
      with open(filename) as fp:
        line = fp.readline()
        while line: 
          l=line.strip()
          labels.append(int(l[len(l)-2]))
          test.append(l[1:-3].strip())

          line = fp.readline()
  return test, labels

def getSentenceMeans(corpus, word2VecModel):
  means=[]
  for sentence in corpus:
    mean=np.zeros(100)

    for word in sentence:
      try:
        mean[0:100] += word2VecModel.wv[word]
      except KeyError:
        mean = mean
    
    
    mean = mean / len(sentence)
    means.append(mean)
  return means


In [0]:
import gensim.models


data, labels = getData('drive/My Drive/deft_train/')
print(len(data))

word2VecModel = gensim.models.Word2Vec(sentences=data, min_count=2, size=100)
print("Vocab Length:", len(word2VecModel.wv.vocab))

sentencesVecs = getSentenceMeans(data, word2VecModel)
print("Training Data Shape: ",np.array(sentencesVecs).shape)



testData, testLabels = getData('drive/My Drive/deft_test/')
sentencesVecsTest = getSentenceMeans(testData, word2VecModel)
print("Test Data Shape: ",np.array(sentencesVecsTest).shape)

18157
Vocab Length: 13687
Training Data Shape:  (18157, 100)
Test Data Shape:  (853, 100)


In [0]:
from sklearn.metrics import f1_score
target=['no','yes']

def predictModel(model, X_train, y_train, X_test, y_test):
  model.fit(X_train, y_train) 
  prediction = model.score(X_test, y_test)

  labelsPredicted = model.predict(X_test)
  f1Score = f1_score(y_test, labelsPredicted)

  print("Model", type(model).__name__)
  print("Accuracy: ", prediction)
  print(classification_report(y_test, labelsPredicted, target_names=target))
  print()
  

In [0]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score,classification_report


neigh = KNeighborsClassifier(n_neighbors=5)
predictModel(neigh, sentencesVecs, labels, sentencesVecsTest, testLabels)

tree = DecisionTreeClassifier(criterion = "entropy", splitter = "random")
predictModel(tree, sentencesVecs, labels, sentencesVecsTest, testLabels)

LR = LogisticRegression(random_state=0, solver='sag')
predictModel(LR, sentencesVecs, labels, sentencesVecsTest, testLabels)


Model KNeighborsClassifier
Accuracy:  0.6881594372801876
              precision    recall  f1-score   support

          no       0.76      0.78      0.77       573
         yes       0.53      0.50      0.51       280

    accuracy                           0.69       853
   macro avg       0.64      0.64      0.64       853
weighted avg       0.68      0.69      0.69       853


Model DecisionTreeClassifier
Accuracy:  0.6858147713950762
              precision    recall  f1-score   support

          no       0.77      0.76      0.77       573
         yes       0.52      0.53      0.52       280

    accuracy                           0.69       853
   macro avg       0.64      0.65      0.65       853
weighted avg       0.69      0.69      0.69       853


Model LogisticRegression
Accuracy:  0.7010550996483002
              precision    recall  f1-score   support

          no       0.71      0.95      0.81       573
         yes       0.66      0.19      0.29       280

    accur

In [0]:
def zeroPad(data, encodedArray, max):
  zerosArray=np.zeros((len(data), max, 100))
  for i in range(len(data)):
    zerosArray[i][0:len(encodedArray[i])] += np.array(encodedArray[i])
  return zerosArray


def convertDataAndPad():
  encodedTrain = []
  max = 0
  for sent in data:
    encodedSentence = []

    for word in sent:
      try:
        encodedWord= word2VecModel.wv[word]
      except KeyError:
        s=0
      encodedSentence.append(encodedWord)
    if len(sent) > max:
      max = len(sent)
    encodedTrain.append(encodedSentence)
  docsTrain = zeroPad(data, encodedTrain, max)
  print(docsTrain.shape)
  print("max", max)

  encodedTest = []
  for sent in testData:
    encodedSentence = []
    for word in sent:
      try:
        encodedWord= word2VecModel.wv[word]
      except KeyError:
        s=0
      encodedSentence.append(encodedWord)
    if len(sent) > max:
      max = len(sent)
    encodedTest.append(encodedSentence)

  docsTest = zeroPad(testData, encodedTest, max)


  return docsTrain, docsTest, max

docsTrain, docsTest, max = convertDataAndPad()

n_features = max * 100
docsTrain = docsTrain.reshape(len(docsTrain), n_features)
docsTest = docsTest.reshape(len(docsTest), n_features)


tree = DecisionTreeClassifier(criterion = "entropy", splitter = "random")
predictModel(tree, docsTrain, labels, docsTest, testLabels)

LR = LogisticRegression(random_state=0, solver='sag')
predictModel(LR, docsTrain, labels, docsTest, testLabels)

(18157, 90, 100)
max 90
Model DecisionTreeClassifier
Accuracy:  0.6975381008206331
              precision    recall  f1-score   support

          no       0.77      0.78      0.78       573
         yes       0.54      0.53      0.53       280

    accuracy                           0.70       853
   macro avg       0.66      0.65      0.65       853
weighted avg       0.70      0.70      0.70       853


Model LogisticRegression
Accuracy:  0.7409144196951934
              precision    recall  f1-score   support

          no       0.77      0.88      0.82       573
         yes       0.65      0.45      0.53       280

    accuracy                           0.74       853
   macro avg       0.71      0.67      0.68       853
weighted avg       0.73      0.74      0.73       853






In [0]:
def classifyUsingIndependantFeatures():
  
  from sklearn.feature_extraction.text import CountVectorizer
  from sklearn.naive_bayes import MultinomialNB

  dataString, labels = getDataAsString('drive/My Drive/deft_train/')
  testData, testLabels = getDataAsString('drive/My Drive/deft_test/')

  vectorizer = CountVectorizer()
  vector = vectorizer.fit_transform(dataString)
  print(vectorizer.vocabulary_)
  print(vector.shape)
  # print(type(vector))
  # print(vector)
  # print(vector.toarray())

  testDataVectorized = vectorizer.transform(testData)

  NB = MultinomialNB().fit(vector, labels)
  prediction = NB.score(testDataVectorized, testLabels)
  labelsPredicted = NB.predict(testDataVectorized)
  f1Score = f1_score(testLabels, labelsPredicted)

  print("Naive Bayes")
  print("Accuracy: ", prediction)
  print(classification_report(testLabels, labelsPredicted, target_names=target))

  tree = DecisionTreeClassifier(criterion = "entropy", splitter = "random")
  predictModel(tree, vector, labels, testDataVectorized, testLabels)


In [0]:
classifyUsingIndependantFeatures()

(18157, 27234)
Naive Bayes
Accuracy:  0.753810082063306
              precision    recall  f1-score   support

          no       0.81      0.83      0.82       573
         yes       0.63      0.60      0.61       280

    accuracy                           0.75       853
   macro avg       0.72      0.71      0.72       853
weighted avg       0.75      0.75      0.75       853

Model DecisionTreeClassifier
Accuracy:  0.7491207502930832
              precision    recall  f1-score   support

          no       0.80      0.84      0.82       573
         yes       0.63      0.56      0.59       280

    accuracy                           0.75       853
   macro avg       0.71      0.70      0.71       853
weighted avg       0.74      0.75      0.74       853




In [0]:

def splitLabels(trainLabels,trainDocs):
  positive=[]
  negitive=[]
  for i in range(len(trainLabels)):
    if(trainLabels[i]==1):
      positive.append(trainDocs[i])
    else:
      negitive.append(trainDocs[i])

  return positive,negitive

In [0]:
def generateNegitiveMean(n):
  negitiveVec=[]
  for sen in n:
    sentance=np.zeros(100)
    for word in sen:
      try:
        sentance=sentance+(word2VecModel.wv[word])
      except KeyError:
        s=0
   
    
    sentance=sentance/len(sen)
    negitiveVec.append(sentance)
  
  mean=np.zeros(100)  
  for sen in negitiveVec:
    mean+=sen
  mean=mean/len(negitiveVec)


  return mean

In [0]:
def generatePositveMean(positive):
  positiveVec=[]
  for sen in positive:
    sentance=np.zeros(100)
    for word in sen:
      try:
        sentance=sentance+(word2VecModel.wv[word])
      except KeyError:
        ps=0
    sentance=sentance/len(sen)
    positiveVec.append(sentance)

  mean=np.zeros(100)
  for sen in positiveVec:
    mean+=sen
  mean=mean/len(positiveVec)
  return mean

In [0]:
from scipy import spatial
def nearestMean(trainDocs,trainLabels,testDocs,testLabels):
  posData,negData=splitLabels(trainLabels,trainDocs)
 
  posMean=generatePositveMean(posData)
  negMean=generateNegitiveMean(negData)

  pred=[]
  tdata= getSentenceMeans(testDocs, word2VecModel)

  for tsen in tdata:
    pos=1 - spatial.distance.cosine(tsen, posMean)
    neg=1 - spatial.distance.cosine(tsen, negMean)
    if(pos>neg):
      pred.append(1)
    else:
      pred.append(0)
  print(classification_report(testLabels,pred,target_names=target))
  


In [0]:
data, labels = getData('drive/My Drive/deft_train/')
testData, testLabels = getData('drive/My Drive/deft_test/')

nearestMean(data, labels, testData, testLabels)

              precision    recall  f1-score   support

          no       0.77      0.68      0.72       573
         yes       0.47      0.58      0.52       280

    accuracy                           0.65       853
   macro avg       0.62      0.63      0.62       853
weighted avg       0.67      0.65      0.65       853



  dist = 1.0 - uv / np.sqrt(uu * vv)
