# Task
Build a Spark ML LinearRegression model that predicts the number of tags attached to a question from the number of words in its combined text (i.e. the Title and Body concatenated into a single `Combined` column).

In [49]:
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf,col,concat,lit,avg,stddev,mean,max,min
from pyspark.ml.feature import RegexTokenizer,VectorAssembler
from pyspark.ml.regression import LinearRegression

In [3]:
# Start (or reuse) a Spark session running on the local master
spark = (
    SparkSession.builder
    .master('local')
    .appName('LinearRegression')
    .getOrCreate()
)

## Reading the JSON file in Spark DF

In [4]:
# Load the questions JSON file into a DataFrame, then list its columns
df = spark.read.json(
    '../data/Train_onetag_small.json'
)
df.columns

['Body', 'Id', 'Tags', 'Title', 'oneTag']

## Data preparation

In [5]:
# Concatenate Title and Body, separated by a single space, into one text column.
df = df.withColumn(
    'Combined',
    concat(col('Title'), lit(" "), col('Body')),
)
df.head(2)

[Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', Combined="How to check if an uploaded file is an image without mime type? <p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an

In [11]:
# Tokenize the combined text, using the regex \W (non-word characters)
# as the tokenizer pattern; tokens land in the Combined_words column.
regextokenizer = RegexTokenizer(
    pattern="\\W",
    inputCol='Combined',
    outputCol='Combined_words',
)
df = regextokenizer.transform(df)
df.head(2)

[Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', Combined="How to check if an uploaded file is an image without mime type? <p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an

In [13]:
# Number of tokens per question = length of the Combined_words array.
# Note: the column is named BodyLength, but it counts tokens from the
# combined Title + Body text, not from Body alone.
word_count = udf(lambda tokens: len(tokens), IntegerType())
df = df.withColumn('BodyLength', word_count(col('Combined_words')))
df.head(2)

[Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', Combined="How to check if an uploaded file is an image without mime type? <p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an

In [14]:
# Count how many tags each question carries.
# Tags is one space-separated string (e.g. 'php image-processing file-upload'),
# so the tag count is the number of whitespace-separated tokens in it.
def _count_tags(tags):
    """Return the number of tags in a whitespace-separated tag string.

    Returns None for a missing (null) value, so the row gets a null NumTags
    instead of the UDF raising on None. split() without an argument also
    ignores leading, trailing and repeated spaces, which split(" ") would
    have counted as empty tags (an empty string would have counted as 1).
    """
    if tags is None:
        return None
    return len(tags.split())

word_tag=udf(_count_tags,IntegerType())
df=df.withColumn('NumTags',word_tag(df.Tags))
df.select('Tags','BodyLength','NumTags').head(2)

[Row(Tags='php image-processing file-upload upload mime-types', BodyLength=96, NumTags=5),
 Row(Tags='firefox', BodyLength=83, NumTags=1)]

In [17]:
# Distribution of questions over the distinct values of NumTags
(
    df.groupby('NumTags')
    .count()
    .sort('NumTags')
    .show()
)

+-------+-----+
|NumTags|count|
+-------+-----+
|      1|13858|
|      2|26540|
|      3|28769|
|      4|19108|
|      5|11725|
+-------+-----+



In [20]:
# Sanity-check the working assumption that the number of tags grows
# roughly linearly with the text length: average BodyLength per NumTags.
(
    df.groupby('NumTags')
    .agg(avg(col('BodyLength')))
    .sort('NumTags')
    .show()
)

+-------+------------------+
|NumTags|   avg(BodyLength)|
+-------+------------------+
|      1|143.68776158175783|
|      2| 162.1539186134137|
|      3|181.26021064340088|
|      4|201.46530249110322|
|      5|227.64375266524522|
+-------+------------------+



In [26]:
# Before building the training data, all features must be placed together
# in a single vector-typed column. For now the only feature is BodyLength.
assembler = VectorAssembler(
    outputCol='LengthFeature',
    inputCols=['BodyLength'],
)
df = assembler.transform(df)
df.head(2)

[Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', Combined="How to check if an uploaded file is an image without mime type? <p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an

In [35]:
# Build the frame to fit on: the feature vector renamed to 'features'
# and the tag count renamed to 'label'.
data = df.select(
    df.LengthFeature.alias('features'),
    df.NumTags.alias('label'),
)
data.head(2)

[Row(features=DenseVector([96.0]), label=5),
 Row(features=DenseVector([83.0]), label=1)]

In [36]:
# Configure the linear regression estimator and fit it on `data`.
# NOTE(review): fitIntercept=False forces the fitted line through the origin
# (the intercept is reported as 0.0 below). Confirm this is intended, since
# it affects both the coefficient and how the reported r2 should be read.
lr = LinearRegression(
    maxIter=5,
    regParam=0.0,
    fitIntercept=False,
    solver='normal',
)
lrmodel = lr.fit(data)

In [31]:
# Fitted parameters: (intercept, coefficient vector)
(lrmodel.intercept, lrmodel.coefficients)

(0.0, DenseVector([0.0079]))

In [32]:
# R-squared reported by the fitted model's summary.
# NOTE(review): the model was fit with fitIntercept=False; confirm how r2 is
# defined for a no-intercept fit before interpreting this value.
lrmodel.summary.r2

0.4455149596308462

# Quick Quiz

Q1. How many times greater is the Description Length (the `BodyLength` column) of the longest question than
that of the shortest question (rounded to the nearest whole number)?


In [51]:
# Longest and shortest question, measured by BodyLength.
# Aggregate directly on df: max/min over the column equal max/min over its
# distinct values, so grouping by BodyLength and counting first only adds work.
df.agg(max(col('BodyLength')),min(col('BodyLength'))).show()

+---------------+---------------+
|max(BodyLength)|min(BodyLength)|
+---------------+---------------+
|           7532|             10|
+---------------+---------------+



Q2. What are the mean and standard deviation of the Description Length (the `BodyLength` column)?

In [46]:
# Mean and standard deviation of BodyLength.
# avg() and mean() both yield an avg(BodyLength) column (see the output),
# so the first two values in the resulting Row are the same statistic.
(
    df.agg(
        avg(col('BodyLength')),
        mean(col('BodyLength')),
        stddev(col('BodyLength')),
    )
    .collect()
)

[Row(avg(BodyLength)=180.28187, avg(BodyLength)=180.28187, stddev_samp(BodyLength)=192.10819533505023)]