In [165]:
!pip install pyspark



In [166]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [135]:
from __future__ import print_function
from pyspark import SparkConf, SparkContext
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import SparkSession
from pyspark.ml.feature import NGram
from pyspark.ml.feature import Word2Vec

In [136]:
# creating spark session
spark = SparkSession.builder.appName("TfIdf Example").getOrCreate()

In [138]:
# creating spark dataframe wiht the input data. You can also read the data from file. label represents the 3 documnets (0.0,0.1,0.2)
sentenceData = spark.createDataFrame([
        (0.0, "Welcome to KDM TF_IDF Tutorial."),
        (0.1, "Learn Spark ml tf_idf in today's lab."),
        (0.2, "Spark Mllib has TF-IDF.")
    ], ["label", "sentence"])

In [139]:
# creating tokens/words from the sentence data
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

In [140]:
wordsData.show()

+-----+--------------------+--------------------+
|label|            sentence|               words|
+-----+--------------------+--------------------+
|  0.0|Welcome to KDM TF...|[welcome, to, kdm...|
|  0.1|Learn Spark ml tf...|[learn, spark, ml...|
|  0.2|Spark Mllib has T...|[spark, mllib, ha...|
+-----+--------------------+--------------------+



In [142]:
# applying tf on the words data
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
# alternatively, CountVectorizer can also be used to get term frequency vectors

In [143]:
# calculating the IDF
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

In [144]:
#displaying the results
rescaledData.select("label", "features").show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(20,[2,8,13,15,17...|
|  0.1|(20,[2,3,6,7],[0....|
|  0.2|(20,[6,14,15],[0....|
+-----+--------------------+



In [146]:
spark2 = SparkSession.builder.appName("Ngram Example").getOrCreate()

In [147]:
#creating dataframe of input
wordDataFrame = spark2.createDataFrame([
    (0, ["Hi", "I", "heard", "about", "Spark"]),
    (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
    (2, ["Logistic", "regression", "models", "are", "neat"])
], ["id", "words"])


In [148]:
 #creating NGrams with n=2 (two words)
ngram = NGram(n=2, inputCol="words", outputCol="ngrams")
ngramDataFrame = ngram.transform(wordDataFrame)

In [149]:
# displaying the results
ngramDataFrame.select("ngrams").show(truncate=False)

+------------------------------------------------------------------+
|ngrams                                                            |
+------------------------------------------------------------------+
|[Hi I, I heard, heard about, about Spark]                         |
|[I wish, wish Java, Java could, could use, use case, case classes]|
|[Logistic regression, regression models, models are, are neat]    |
+------------------------------------------------------------------+



In [150]:
# creating spark session
spark3 = SparkSession.builder.appName("Word2Vec Example").getOrCreate()

In [151]:
# Input data: Each row is a bag of words from a sentence or document.
documentDF = spark3.createDataFrame([
    ("McCarthy was asked to analyse the data from the first phase of trials of the vaccine.".split(" "), ),
    ("We have amassed the raw data and are about to begin analysing it.".split(" "), ),
    ("Without more data we cannot make a meaningful comparison of the two systems.".split(" "), ),
    ("Collecting data is a painfully slow process.".split(" "), ),
    ("You need a long series of data to be able to discern such a trend.".split(" "), )
], ["text"])

In [152]:
# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2Vec.fit(documentDF)
result = model.transform(documentDF)

In [153]:
for row in result.collect():
    text, vector = row
    #printing the results
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))

Text: [McCarthy, was, asked, to, analyse, the, data, from, the, first, phase, of, trials, of, the, vaccine.] => 
Vector: [0.018829423002898693,0.06531135074328631,0.003471505537163466]

Text: [We, have, amassed, the, raw, data, and, are, about, to, begin, analysing, it.] => 
Vector: [0.05511647720749562,-0.023913293217237182,-0.018586064617221173]

Text: [Without, more, data, we, cannot, make, a, meaningful, comparison, of, the, two, systems.] => 
Vector: [0.012577339433706725,-0.000606541306926654,-0.03402049072946493]

Text: [Collecting, data, is, a, painfully, slow, process.] => 
Vector: [0.01802473116133894,-0.04375123764787401,-0.1021000247980867]

Text: [You, need, a, long, series, of, data, to, be, able, to, discern, such, a, trend.] => 
Vector: [0.017771879273156325,-0.020484041919310886,-0.0020535687605539956]



In [154]:
# showing the synonyms and cosine similarity of the word in input data
synonyms = model.findSynonyms("data", 5)   # its okay for certain words , real bad for others
synonyms.show(5)

+----------+------------------+
|      word|        similarity|
+----------+------------------+
|  systems.| 0.966442346572876|
|      slow|  0.87045818567276|
|   amassed|0.8254424929618835|
|         a|0.8249895572662354|
|comparison| 0.816560685634613|
+----------+------------------+



In [155]:
#closing the spark sessions
spark.stop()
spark2.stop()
spark3.stop()





# **Creating 5 separate text files containing text data (blogs,news articles etc)**






In [156]:
with open("/content/articles/news1.txt","r+") as t1:
    doc1 = t1.read()
with open("/content/articles/news2.txt","r+") as t2:
    doc2 = t2.read()
with open("/content/articles/news3.txt","r+") as t3:
    doc3 = t3.read()
with open("/content/articles/news4.txt","r+") as t4:
    doc4 = t4.read()
with open("/content/articles/news5.txt","r+") as t5:
    doc5 = t5.read()
# Read all 5 txt files which contains news articles
documents = [doc1,doc2,doc3,doc4,doc5]

# **a.Find out the top10 TF-IDF words for the above input.**




In [157]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# using sklearn library which has inbuilt Tfidf vectorizer class which can generate tfidf for given corpus
vect = TfidfVectorizer()
#created TfidfVectorizer object
tfidf_matrix = vect.fit_transform(documents)
#passed list of documents or corpus to obt method fit_transform
df = pd.DataFrame(tfidf_matrix.toarray(), columns = vect.get_feature_names())
# converted method output to panda data frame 
pd.set_option('display.max_columns', 20)

df.loc['Total'] = df.sum() # adding row to value total

#filtering values of words whos tfidf is greater than 0.3
# also used transpose function here to filter out words (which was rows) and then converted matrix back to original version
print (df.T.sort_values('Total', ascending=True).tail(10).T)

           with       bbc      that        is        on       and        in  \
0      0.045378  0.060504  0.068067  0.037815  0.151259  0.189074  0.151259   
1      0.045399  0.084313  0.077827  0.084313  0.116741  0.071341  0.129712   
2      0.090994  0.012999  0.064996  0.077995  0.064996  0.116992  0.246983   
3      0.109023  0.109023  0.051642  0.131975  0.091809  0.281165  0.177880   
4      0.079651  0.112840  0.172578  0.126115  0.106202  0.152665  0.172578   
Total  0.370445  0.379678  0.435110  0.458212  0.531006  0.811237  0.878412   

             of        to       the  
0      0.219325  0.310081  0.408399  
1      0.233481  0.246452  0.586946  
2      0.220985  0.129991  0.428970  
3      0.218046  0.240998  0.447568  
4      0.265505  0.252230  0.497822  
Total  1.157342  1.179752  2.369705  


# **`b.Find out the top10 TF-IDF words for the lemmatized input`**

In [None]:
import nltk;nltk.download('punkt');nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

words1 = nltk.word_tokenize(doc1)
words2 = nltk.word_tokenize(doc2)
words3 = nltk.word_tokenize(doc3)
words4 = nltk.word_tokenize(doc4)
words5 = nltk.word_tokenize(doc5)

lemmatized_document1 = ' '.join([lemmatizer.lemmatize(w) for w in words1])
lemmatized_document2 = ' '.join([lemmatizer.lemmatize(w) for w in words2])
lemmatized_document3 = ' '.join([lemmatizer.lemmatize(w) for w in words3])
lemmatized_document4 = ' '.join([lemmatizer.lemmatize(w) for w in words4])
lemmatized_document5 = ' '.join([lemmatizer.lemmatize(w) for w in words5])

documents = [lemmatized_document1,lemmatized_document2,lemmatized_document3,lemmatized_document4,lemmatized_document5]

# using sklearn library which has inbuilt Tfidf vectorizer class which can generate tfidf for given corpus
vect = TfidfVectorizer()
#created TfidfVectorizer object
tfidf_matrix = vect.fit_transform(documents)
#passed list of documents or corpus to obt method fit_transform
df = pd.DataFrame(tfidf_matrix.toarray(), columns = vect.get_feature_names())
# converted method output to panda data frame 

df.loc['Total'] = df.sum() # adding row to value total

#filtering values of words whos tfidf is greater than 0.3
# also used transpose function here to filter out words (which was rows) and then converted matrix back to original version
print (df.T.sort_values('Total', ascending=True).tail(10).T)

# **c.Find out the top10TF-IDF words for the n-gram based input.**

In [159]:
# this function takes document and n int value to generate list of n grams
def ngrams(input, n):
    input = input.split(' ')
    output = []
    for i in range(len(input)-n+1):
        output.append(input[i:i+n])
    return output

ngram_doc1 = ' '.join([' '.join(x) for x in ngrams(doc1, 3)])
ngram_doc2 = ' '.join([' '.join(x) for x in ngrams(doc2, 3)])
ngram_doc3 = ' '.join([' '.join(x) for x in ngrams(doc3, 3)])
ngram_doc4 = ' '.join([' '.join(x) for x in ngrams(doc4, 3)])
ngram_doc5 = ' '.join([' '.join(x) for x in ngrams(doc5, 3)])

# documents = [ngram_doc1,ngram_doc2,ngram_doc3,ngram_doc4,ngram_doc5]

documents = [doc1,doc2,doc3,doc4,doc5]

# using sklearn library which has inbuilt Tfidf vectorizer class which can generate tfidf for given corpus
vect = TfidfVectorizer( ngram_range=(3,3)) # TfidfVectorizer has inbuilt ngram kwarg which show tfidf for ngrams
#created TfidfVectorizer object
tfidf_matrix = vect.fit_transform(documents)
#passed list of documents or corpus to obt method fit_transform
df = pd.DataFrame(tfidf_matrix.toarray(), columns = vect.get_feature_names())
# converted method output to panda data frame 

df.loc['Total'] = df.sum() # adding row to value total

#filtering values of words whos tfidf is greater than 0.3
# also used transpose function here to filter out words (which was rows) and then converted matrix back to original version
print (df.T.sort_values('Total', ascending=True).tail(10).T)

       us about the  in new york  truth behind the  the dark truth  \
0          0.000000     0.000000          0.025633        0.025633   
1          0.034555     0.000000          0.024586        0.024586   
2          0.049892     0.090157          0.035499        0.035499   
3          0.029643     0.000000          0.021091        0.021091   
4          0.000000     0.039074          0.023078        0.023078   
Total      0.114090     0.129231          0.129887        0.129887   

       the dubai princess  dark truth behind  dubai princess who  \
0                0.025633           0.025633            0.025633   
1                0.024586           0.024586            0.024586   
2                0.035499           0.035499            0.035499   
3                0.021091           0.021091            0.021091   
4                0.023078           0.023078            0.023078   
Total            0.129887           0.129887            0.129887   

       behind the dubai  princes

# **2.Write a simple spark program to read a dataset and find the W2V similar words (words with higher cosine similarity) for the Top10 TF-IDF Words**

In [160]:
from __future__ import print_function
from pyspark import SparkConf, SparkContext
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import SparkSession
from pyspark.ml.feature import NGram
from pyspark.ml.feature import Word2Vec
# creating spark session
spark = SparkSession.builder.appName("TfIdf Example").getOrCreate()

documentData = spark.createDataFrame([
        (0.0, doc1),
        (0.1, doc2),
        (0.2, doc3),
        (0.3, doc4),
        (0.5, doc5)
    ], ["label", "document"])

# creating tokens/words from the sentence data
tokenizer = Tokenizer(inputCol="document", outputCol="words")
wordsData = tokenizer.transform(documentData)
print (documentData)
wordsData.show()

DataFrame[label: double, document: string]
+-----+--------------------+--------------------+
|label|            document|               words|
+-----+--------------------+--------------------+
|  0.0|Increasingly conc...|[increasingly, co...|
|  0.1|The first bone ca...|[the, first, bone...|
|  0.2|El Chapo's wife E...|[el, chapo's, wif...|
|  0.3|Augmented reality...|[augmented, reali...|
|  0.5|The video call ap...|[the, video, call...|
+-----+--------------------+--------------------+



# **a.Try without NLP**

In [161]:
# applying tf on the words data
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=200)
tf = hashingTF.transform(wordsData)
# alternatively, CountVectorizer can also be used to get term frequency vectors
# calculating the IDF
tf.cache()
idf = IDF(inputCol="rawFeatures", outputCol="features")
idf = idf.fit(tf)
tfidf = idf.transform(tf)
#displaying the results
tfidf.select("label", "features").show()


print("TF-IDF without NLP:")
for each in tfidf.collect():
    print(each)
    print(each['rawFeatures'])
spark.stop()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(200,[0,2,3,4,5,6...|
|  0.1|(200,[0,1,2,3,4,5...|
|  0.2|(200,[0,1,2,3,4,5...|
|  0.3|(200,[0,1,2,3,4,5...|
|  0.5|(200,[0,1,2,3,4,5...|
+-----+--------------------+

TF-IDF without NLP:
Row(label=0.0, document='Increasingly concerned about the environment, for the past few years Alya Annabi, 26, has taken steps to live a more sustainable lifestyle, with the digital learning manager refilling goods at plastic-free stores, making her own skincare from scratch and composting her kitchen waste.\n\nBut in the past year Ms Annabi has decided to take her environmental mission to the next level by tracking her carbon emissions.\n\nUsing an app called Capture, which calculates users\' monthly CO2 targets by asking a series of questions such as how many flights per year you take and what kind of diet you adhere to, and using GPS tracking to predict emissions from transportation, Ms Annabi is able to v

# **b.Try with Lemmatization**

In [162]:
import nltk;nltk.download('punkt');nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

words1 = nltk.word_tokenize(doc1)
words2 = nltk.word_tokenize(doc2)
words3 = nltk.word_tokenize(doc3)
words4 = nltk.word_tokenize(doc4)
words5 = nltk.word_tokenize(doc5)

lemmatized_document1 = ' '.join([lemmatizer.lemmatize(w) for w in words1])
lemmatized_document2 = ' '.join([lemmatizer.lemmatize(w) for w in words2])
lemmatized_document3 = ' '.join([lemmatizer.lemmatize(w) for w in words3])
lemmatized_document4 = ' '.join([lemmatizer.lemmatize(w) for w in words4])
lemmatized_document5 = ' '.join([lemmatizer.lemmatize(w) for w in words5])

### lemmatizing words from 5 input docs same as previos task

# creating spark session
spark = SparkSession.builder.appName("TfIdf Example").getOrCreate()

documentData = spark.createDataFrame([
        (0.0, lemmatized_document1),
        (0.1, lemmatized_document2),
        (0.2, lemmatized_document3),
        (0.3, lemmatized_document4),
        (0.5, lemmatized_document5)
    ], ["label", "document"])

# creating tokens/words from the sentence data
tokenizer = Tokenizer(inputCol="document", outputCol="words")
wordsData = tokenizer.transform(documentData)
print (documentData)
wordsData.show()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
DataFrame[label: double, document: string]
+-----+--------------------+--------------------+
|label|            document|               words|
+-----+--------------------+--------------------+
|  0.0|Increasingly conc...|[increasingly, co...|
|  0.1|The first bone ca...|[the, first, bone...|
|  0.2|El Chapo 's wife ...|[el, chapo, 's, w...|
|  0.3|Augmented reality...|[augmented, reali...|
|  0.5|The video call ap...|[the, video, call...|
+-----+--------------------+--------------------+



In [163]:
# applying tf on the words data
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=200)
tf = hashingTF.transform(wordsData)
# alternatively, CountVectorizer can also be used to get term frequency vectors
# calculating the IDF
tf.cache()
idf = IDF(inputCol="rawFeatures", outputCol="features")
idf = idf.fit(tf)
tfidf = idf.transform(tf)
#displaying the results
tfidf.select("label", "features").show()


print("TF-IDF with Lemmatization:")
for each in tfidf.collect():
    print(each)
    print(each['rawFeatures'])
spark.stop()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(200,[0,1,2,3,4,5...|
|  0.1|(200,[0,1,2,3,4,5...|
|  0.2|(200,[0,1,3,4,5,7...|
|  0.3|(200,[0,1,2,3,4,5...|
|  0.5|(200,[0,1,2,3,4,5...|
+-----+--------------------+

TF-IDF with Lemmatization:
Row(label=0.0, document="Increasingly concerned about the environment , for the past few year Alya Annabi , 26 , ha taken step to live a more sustainable lifestyle , with the digital learning manager refilling good at plastic-free store , making her own skincare from scratch and composting her kitchen waste . But in the past year Ms Annabi ha decided to take her environmental mission to the next level by tracking her carbon emission . Using an app called Capture , which calculates user ' monthly CO2 target by asking a series of question such a how many flight per year you take and what kind of diet you adhere to , and using GPS tracking to predict emission from transportation , Ms Annabi is able to vie

# **c.Try with NGrams**

In [164]:
spark = SparkSession.builder.appName("TfIdf Example").getOrCreate()

documentData = spark.createDataFrame([
        (0.0, doc1.split(' ')),
        (0.1, doc2.split(' ')),
        (0.2, doc3.split(' ')),
        (0.3, doc4.split(' ')),
        (0.5, doc5.split(' '))
    ], ["label", "document"])


ngram = NGram(n=2, inputCol="document", outputCol="ngrams")

ngramDataFrame = ngram.transform(documentData)

# applying tf on the words data
hashingTF = HashingTF(inputCol="ngrams", outputCol="rawFeatures", numFeatures=200)
tf = hashingTF.transform(ngramDataFrame)
# alternatively, CountVectorizer can also be used to get term frequency vectors
# calculating the IDF
tf.cache()
idf = IDF(inputCol="rawFeatures", outputCol="features")
idf = idf.fit(tf)
tfidf = idf.transform(tf)
#displaying the results
tfidf.select("label", "features").show()


print("TF-IDF with ngram:")
for each in tfidf.collect():
    print(each)
    print(each['rawFeatures'])
spark.stop()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(200,[0,1,2,3,4,5...|
|  0.1|(200,[0,1,2,3,4,5...|
|  0.2|(200,[0,1,2,3,4,5...|
|  0.3|(200,[0,1,2,3,4,5...|
|  0.5|(200,[0,1,2,3,4,5...|
+-----+--------------------+

TF-IDF with ngram:
Row(label=0.0, document=['Increasingly', 'concerned', 'about', 'the', 'environment,', 'for', 'the', 'past', 'few', 'years', 'Alya', 'Annabi,', '26,', 'has', 'taken', 'steps', 'to', 'live', 'a', 'more', 'sustainable', 'lifestyle,', 'with', 'the', 'digital', 'learning', 'manager', 'refilling', 'goods', 'at', 'plastic-free', 'stores,', 'making', 'her', 'own', 'skincare', 'from', 'scratch', 'and', 'composting', 'her', 'kitchen', 'waste.\n\nBut', 'in', 'the', 'past', 'year', 'Ms', 'Annabi', 'has', 'decided', 'to', 'take', 'her', 'environmental', 'mission', 'to', 'the', 'next', 'level', 'by', 'tracking', 'her', 'carbon', 'emissions.\n\nUsing', 'an', 'app', 'called', 'Capture,', 'which', 'calculates', "users'", 'mont