Importing necessary libraries and dependencies

In [None]:
import json
import gensim.downloader as api
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

Text Preprocessing

In [None]:
# Lookup table mapping English contractions to their expanded forms.
# `clean` lower-cases its input before looking tokens up here, so only the
# lower-case keys are ever matched by it; the capitalised "I'..." keys are
# kept for callers that look up text without lower-casing.
contraction = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not", "don't": "do not", "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would", "he'd've": "he would have", "he'll": "he will",
    # Fixed: this expansion previously read "he he will have".
    "he'll've": "he will have", "he's": "he is", "how'd": "how did",
    "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
    "I'd": "I would", "I'd've": "I would have", "I'll": "I will",
    "I'll've": "I will have", "I'm": "I am", "I've": "I have",
    "i'd": "i would", "i'd've": "i would have", "i'll": "i will",
    "i'll've": "i will have", "i'm": "i am", "i've": "i have",
    "isn't": "is not", "it'd": "it would", "it'd've": "it would have",
    "it'll": "it will", "it'll've": "it will have", "it's": "it is",
    "let's": "let us", "ma'am": "madam", "mayn't": "may not",
    "might've": "might have", "mightn't": "might not", "mightn't've": "might not have",
    "must've": "must have", "mustn't": "must not", "mustn't've": "must not have",
    "needn't": "need not", "needn't've": "need not have", "o'clock": "of the clock",
    "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
    "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would",
    "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have",
    "she's": "she is", "should've": "should have", "shouldn't": "should not",
    "shouldn't've": "should not have", "so've": "so have", "so's": "so as",
    "this's": "this is",
    "that'd": "that would", "that'd've": "that would have", "that's": "that is",
    "there'd": "there would", "there'd've": "there would have", "there's": "there is",
    "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
    "they'll've": "they will have", "they're": "they are", "they've": "they have",
    "to've": "to have", "wasn't": "was not", "we'd": "we would",
    "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have",
    "we're": "we are", "we've": "we have", "weren't": "were not",
    "what'll": "what will", "what'll've": "what will have", "what're": "what are",
    "what's": "what is", "what've": "what have", "when's": "when is",
    "when've": "when have", "where'd": "where did", "where's": "where is",
    "where've": "where have", "who'll": "who will", "who'll've": "who will have",
    "who's": "who is", "who've": "who have", "why's": "why is",
    "why've": "why have", "will've": "will have", "won't": "will not",
    "won't've": "will not have", "would've": "would have", "wouldn't": "would not",
    "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
    "y'all'd've": "you all would have", "y'all're": "you all are", "y'all've": "you all have",
    "you'd": "you would", "you'd've": "you would have", "you'll": "you will",
    "you'll've": "you will have", "you're": "you are", "you've": "you have"}
    
# Translation table for str.translate: every character of string.punctuation
# maps to None, which deletes it from the translated text.
remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))
# English stop words from the NLTK corpus, held as a set for the
# `not in stopw` membership filters used further down.
stopw = set(stopwords.words('english'))

def clean(text):
    """Normalise raw text to lower-case ASCII words separated by single spaces.

    Steps, in order:
      1. Lower-case the text and expand every whitespace-delimited token that
         is a key of `contraction` (e.g. "don't" -> "do not").
      2. Delete ASCII punctuation outright, so a token such as "can't," that
         did not match a contraction becomes "cant".
      3. Replace every remaining character that is not an ASCII letter
         (digits, non-ASCII characters, tabs/newlines, ...) with a space.
      4. Collapse runs of spaces and trim the ends.

    The earlier implementation ran further substitutions after step 3
    (comma/period/slash/apostrophe removal, "what's" expansion and a
    digits+"k" -> "000" rule). They could never match, because by then the
    text holds only letters and spaces, so they were removed. The result is
    unchanged.

    Args:
        text: The string to normalise.

    Returns:
        The cleaned string; empty if no ASCII letters remain.
    """
    # Plain dict lookup with a fallback replaces the former bare `except:`,
    # which would also have hidden unrelated errors.
    words = text.lower().split()
    expanded = " ".join(contraction.get(word, word) for word in words)

    # Lower-case again in case an expansion value contains capitals.
    text = expanded.lower().translate(remove_punctuation_map)
    text = re.sub("[^a-zA-Z#]", " ", text)
    text = re.sub(r" +", " ", text)
    return text.strip()

def stopwordremoval(text):
    """Tokenise `text` and return it re-joined with single spaces, minus stop words.

    Tokens come from `word_tokenize`; any token present in `stopw` is dropped.
    """
    kept_tokens = filter(lambda token: token not in stopw, word_tokenize(text))
    return " ".join(kept_tokens)

First test case preparation (Test case 1)

In [None]:
# Test case 1: two near-identical passages about machine learning.
text = """Machine learning is the scientific study of algorithms and statistical models that computer systems use to perform a specific task without using explicit instructions, relying on patterns and inference instead. It is seen as a subset of artificial intelligence. Machine learning algorithms build a mathematical model based on sample data, known as "training data", in order to make predictions or decisions without being explicitly programmed to perform the task. Machine learning algorithms are used in a wide variety of applications, such as email filtering and computer vision, where it is difficult to develop a conventional algorithm for effectively performing the task."""
alt = """ Machine learning is the  study of algorithms and statistical models that computer systems use to perform a specific task without using explicit instructions. It is seen as a subset of artificial intelligence. Machine learning algorithms build a mathematical model based on sample data in order to make predictions or decisions without being explicitly programmed to perform the task. Machine learning algorithms are used in a wide variety of applications, such as email filtering and computer vision, where it is difficult to develop a conventional algorithm for effectively performing the task."""
# Same pipeline for both passages: lower-case, delete punctuation, tokenise,
# then drop stop words.
text2, alt2 = (
    [token
     for token in word_tokenize(passage.lower().translate(remove_punctuation_map))
     if token not in stopw]
    for passage in (text, alt)
)

Model Building

In [None]:
import gensim, smart_open

def read_corpus(fname, tokens_only=False):
    """Stream a text file as one tokenised document per line.

    The file is opened through `smart_open` with ISO-8859-1 decoding and each
    line is tokenised with `gensim.utils.simple_preprocess`.

    A leftover debugging `print(line)` was removed: it echoed every line of
    the corpus into the notebook output.

    Args:
        fname: Path or URI accepted by `smart_open.open`.
        tokens_only: If true, yield plain token lists; otherwise yield
            `TaggedDocument`s tagged with the zero-based line number.

    Yields:
        A token list or a `TaggedDocument`, one per line of the file.
    """
    with smart_open.open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])
                
def read_corpus_semeval(tokens_only=False):
    """Yield cleaned, tokenised texts from the SemEval task 3 subtask A corpus.

    For each entry of the dataset, every related comment ("RelCText") is
    emitted first, followed by the question body ("RelQBody"). Each text is
    passed through `clean` and `word_tokenize`.

    Args:
        tokens_only: If true, yield plain token lists; otherwise yield
            `TaggedDocument`s carrying a running integer tag that is unique
            per emitted text.
    """
    next_tag = 0
    corpus = api.load("semeval-2016-2017-task3-subtaskA-unannotated")
    for thread in corpus:
        # Tokenise the whole thread up front: comments first, question last.
        tokenised = [
            word_tokenize(clean(comment["RelCText"]))
            for comment in thread["RelComments"]
        ]
        tokenised.append(word_tokenize(clean(thread["RelQuestion"]["RelQBody"])))

        if tokens_only:
            yield from tokenised
            continue

        # For training data, add tags
        for tokens in tokenised:
            yield gensim.models.doc2vec.TaggedDocument(tokens, [next_tag])
            next_tag += 1

def read_fakenews(tokens_only=False):
    """Yield cleaned, tokenised sentences from the gensim "fake-news" dataset.

    For each record, the "text" field is split into sentences with
    `sent_tokenize`; each sentence is emitted, followed by the record's
    "title". Every piece is passed through `clean` and `word_tokenize`.

    The record is used as-is. The previous `eval(json.dumps(line))` round
    trip executed dataset-derived text as Python source and raised NameError
    whenever a record contained JSON `null`, `true` or `false`.

    Args:
        tokens_only: If true, yield plain token lists; otherwise yield
            `TaggedDocument`s carrying a running integer tag that is unique
            per emitted sentence.
    """
    doc = api.load("fake-news")
    next_tag = 0
    for record in doc:
        title_tokens = word_tokenize(clean(record["title"]))
        sentences = [
            word_tokenize(clean(sentence))
            for sentence in sent_tokenize(record["text"])
        ]
        sentences.append(title_tokens)
        for sent in sentences:
            if tokens_only:
                yield sent
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(sent, [next_tag])
                next_tag += 1

In [None]:
# Materialise the corpus into a list. read_corpus_semeval() returns a one-shot
# generator, but the corpus is iterated more than once afterwards (vocabulary
# building, then every training epoch); a generator would already be exhausted
# after the first pass. Trade-off: the whole tagged corpus is held in memory.
tc1 = list(read_corpus_semeval())

In [None]:
# Build a Doc2Vec model (300-dimensional vectors, 8 worker threads, 10 epochs),
# collect its vocabulary from tc1, then train on tc1.
model = Doc2Vec(
    vector_size=300,
    workers=8,
    epochs=10,
)
model.build_vocab(tc1)
model.train(
    tc1,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)



In [None]:
# Persist the current model state under the name "dtv_semeval".
model.save("dtv_semeval")


In [None]:
# Materialise the fake-news corpus once. read_fakenews() returns a one-shot
# generator, so previously build_vocab() exhausted it, `len(list(tc3))` was 0,
# and train() received an empty corpus. Trade-off: the corpus is held in memory.
tc3 = list(read_fakenews())
# update=True extends the existing vocabulary instead of rebuilding it.
model.build_vocab(tc3, update=True)
model.train(tc3, total_examples=len(tc3), epochs=model.epochs)



In [None]:
# Persist the current model state under the name "dtv_semeval_fn".
model.save("dtv_semeval_fn")


Model performance

In [None]:
# Similarity score between the two stop-word-filtered token lists, computed
# from the model's word vectors; shown as the cell output.
model.wv.n_similarity(alt2, text2)


0.9592219

Test Case 2
(What is statistics?)

In [None]:
# Test case 2: two different definitions of "statistics".
text = """numbers that have been collected in order to provide information about something."""
alt = """ the science of collecting and studying these numbers."""
# Same pipeline for both passages: lower-case, delete punctuation, tokenise,
# then drop stop words.
text2, alt2 = (
    [token
     for token in word_tokenize(passage.lower().translate(remove_punctuation_map))
     if token not in stopw]
    for passage in (text, alt)
)

In [None]:
# Similarity score between the two stop-word-filtered token lists, computed
# from the model's word vectors; shown as the cell output.
model.wv.n_similarity(alt2, text2)


0.10803401

Test Case 3
(What is optimization?)


In [None]:
# Test case 3: two descriptions of optimization.
text = """Optimization is the process where we train the model repeatedly that results in a maximum and minimum function evaluation."""
alt = """ An optimization problem consists of maximizing or minimizing a real function by systematically choosing input values from an allowed set and computing the value of the function."""
# Same pipeline for both passages: lower-case, delete punctuation, tokenise,
# then drop stop words.
text2, alt2 = (
    [token
     for token in word_tokenize(passage.lower().translate(remove_punctuation_map))
     if token not in stopw]
    for passage in (text, alt)
)
# Similarity of the two token lists from the model's word vectors (cell output).
model.wv.n_similarity(alt2, text2)

0.28521207

Test Case 4
(What is a programming language?)

In [None]:
# Test case 4: two definitions of a programming language.
text = """A programming language is a computer language that is used by programmers (developers) to communicate with computers"""
alt = """ A programming language is a vocabulary and set of grammatical rules for instructing a computer or computing device to perform specific tasks."""
# Same pipeline for both passages: lower-case, delete punctuation, tokenise,
# then drop stop words.
text2, alt2 = (
    [token
     for token in word_tokenize(passage.lower().translate(remove_punctuation_map))
     if token not in stopw]
    for passage in (text, alt)
)
# Similarity of the two token lists from the model's word vectors (cell output).
model.wv.n_similarity(alt2, text2)

0.3567021

Test Case 5
(What is the mean of data?)

In [None]:
# Test case 5: two explanations of the mean of a data set.
text = """The mean (average) of a data set is found by adding all numbers in the data set and then dividing by the number of values in the set."""
alt = """In statistics, the mean for a given set of observations is equal to the sum of all the values of a collection of data divided by the total number of values in the set."""
# Same pipeline for both passages: lower-case, delete punctuation, tokenise,
# then drop stop words.
text2, alt2 = (
    [token
     for token in word_tokenize(passage.lower().translate(remove_punctuation_map))
     if token not in stopw]
    for passage in (text, alt)
)
# Similarity of the two token lists from the model's word vectors (cell output).
model.wv.n_similarity(alt2, text2)

0.63534915

Test Case 6
(What is data science?)

In [None]:
# Test case 6: two definitions of data science.
text = """Data science is an interdisciplinary field that uses scientific methods, processes, algorithms and systems to extract knowledge and insights from noisy, structured and unstructured data, and apply knowledge from data across a broad range of application domains."""
alt = """Data science is the field of study that combines domain expertise, programming skills, and knowledge of mathematics and statistics to extract meaningful insights from data."""
# Same pipeline for both passages: lower-case, delete punctuation, tokenise,
# then drop stop words.
text2, alt2 = (
    [token
     for token in word_tokenize(passage.lower().translate(remove_punctuation_map))
     if token not in stopw]
    for passage in (text, alt)
)
# Similarity of the two token lists from the model's word vectors (cell output).
model.wv.n_similarity(alt2, text2)

0.48388988

Test Case 7
(What is linear regression?)

In [None]:
# Test case 7: two descriptions of linear regression.
text = """Linear regression analysis is used to predict the value of a variable based on the value of another variable. The variable you want to predict is called the dependent variable. The variable you are using to predict the other variable's value is called the independent variable."""
alt = """Linear regression is the estimation of a continuous dependent variable or response from a list of input variables, or features."""
# Same pipeline for both passages: lower-case, delete punctuation, tokenise,
# then drop stop words.
text2, alt2 = (
    [token
     for token in word_tokenize(passage.lower().translate(remove_punctuation_map))
     if token not in stopw]
    for passage in (text, alt)
)
# Similarity of the two token lists from the model's word vectors (cell output).
model.wv.n_similarity(alt2, text2)

0.2637313

Test Case 8
(What is a neural network in AI?)

In [None]:
# Test case 8: two descriptions of neural networks.
text = """A neural network is a method in artificial intelligence that teaches computers to process data in a way that is inspired by the human brain. It is a type of machine learning process, called deep learning, that uses interconnected nodes or neurons in a layered structure that resembles the human brain."""
alt = """Neural networks, also known as artificial neural networks or simulated neural networks, are a subset of machine learning and are at the heart of deep learning algorithms. Their name and structure are inspired by the human brain, mimicking the way that biological neurons signal to one another."""
# Same pipeline for both passages: lower-case, delete punctuation, tokenise,
# then drop stop words.
text2, alt2 = (
    [token
     for token in word_tokenize(passage.lower().translate(remove_punctuation_map))
     if token not in stopw]
    for passage in (text, alt)
)
# Similarity of the two token lists from the model's word vectors (cell output).
model.wv.n_similarity(alt2, text2)

0.46328947

Test Case 9
(What is deep learning?)

In [None]:
# Test case 9: two definitions of deep learning.
text = """Deep learning is a type of machine learning based on artificial neural networks in which multiple layers of processing are used to extract progressively higher level features from data."""
alt = """Deep learning is part of a broader family of machine learning methods based on artificial neural networks with representation learning."""
# Same pipeline for both passages: lower-case, delete punctuation, tokenise,
# then drop stop words.
text2, alt2 = (
    [token
     for token in word_tokenize(passage.lower().translate(remove_punctuation_map))
     if token not in stopw]
    for passage in (text, alt)
)
# Similarity of the two token lists from the model's word vectors (cell output).
model.wv.n_similarity(alt2, text2)

0.56300473

Test Case 10
(What is the median in data science?)

In [None]:
# Test case 10: two definitions of the median.
text = """The median is the middle number in a sorted, ascending or descending, list of numbers."""
alt = """The median is the middle value when a data set is ordered from least to greatest."""
# Same pipeline for both passages: lower-case, delete punctuation, tokenise,
# then drop stop words.
text2, alt2 = (
    [token
     for token in word_tokenize(passage.lower().translate(remove_punctuation_map))
     if token not in stopw]
    for passage in (text, alt)
)
# Similarity of the two token lists from the model's word vectors (cell output).
model.wv.n_similarity(alt2, text2)

0.31520677