# Screencast Code
O código a seguir é o mesmo usado no screencast "Recursos numéricos". Execute cada célula de código para ver como ele funciona.

In [57]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer, VectorAssembler, \
    Normalizer, StandardScaler, CountVectorizer, IDF, StringIndexer, PCA
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.functions import udf, avg, col 
from pyspark.sql.types import IntegerType

import re

In [2]:
# Build (or reuse) a local SparkSession -- this setup step was not shown in the screencast
spark = (
    SparkSession.builder
    .master("local")
    .appName("Word Count")
    .getOrCreate()
)

## Read in the Data Set

In [3]:
# Relative path to the JSON data set used throughout the notebook
stack_overflow_data = 'data/Train_onetag_small.json'

In [4]:
# Read the JSON file into a Spark DataFrame
df = spark.read.json(stack_overflow_data)

In [5]:
# Display the first row of the loaded DataFrame
df.head()

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php')

## Tokenization
A tokenização divide as strings em palavras separadas. Spark tem uma classe [Tokenizer](https://spark.apache.org/docs/latest/ml-features.html#tokenizer), bem como RegexTokenizer, que permite mais controle sobre o processo de tokenização.

In [6]:
# Tokenize the Body text: every non-word character acts as a separator

regexTokenizer = RegexTokenizer(
    pattern="\\W",
    inputCol="Body",
    outputCol="words",
)
df = regexTokenizer.transform(df)
df.head()

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'which

In [7]:
# BodyLength = how many tokens the tokenizer produced for each post body
body_length = udf(lambda tokens: len(tokens), IntegerType())
df = df.withColumn("BodyLength", body_length(col("words")))

In [8]:
# UDFs that count closing </p> and </a> tags in the raw Body HTML


def _count_occurrences(pattern):
    """Return an integer UDF counting the regex matches of `pattern` in a string."""
    return udf(lambda text: len(re.findall(pattern, text)), IntegerType())


number_of_paragraphs = _count_occurrences("</p>")
number_of_links = _count_occurrences("</a>")

In [9]:
# Append the paragraph and link counts computed from the Body column
df = (
    df.withColumn("NumParagraphs", number_of_paragraphs(col("Body")))
    .withColumn("NumLinks", number_of_links(col("Body")))
)

In [10]:
# Display the first two rows after adding the paragraph and link counts
df.head(2)

[Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'whic

## Assemble the Numeric Features into a Vector

In [11]:
# Combine the three numeric count columns into a single vector column
numeric_columns = ["BodyLength", "NumParagraphs", "NumLinks"]
assembler = VectorAssembler(inputCols=numeric_columns, outputCol="NumFeatures")
df = assembler.transform(df)

In [12]:
# Display the first two rows, which now include the assembled NumFeatures vector
df.head(2)

[Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'whic

## Scale the Vectors

In [13]:
# Fit a StandardScaler on the assembled numeric features and append the scaled vector
scaler2 = StandardScaler(
    withStd=True,
    inputCol="NumFeatures",
    outputCol="ScaleNumFeatures2",
)
scalerModel = scaler2.fit(df)
df = scalerModel.transform(df)

In [14]:
# Display the first two rows, which now include the ScaleNumFeatures2 column
df.head(2)

[Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'whic

## CountVectorizer

In [15]:
# Learn a vocabulary capped at 1000 terms from the token lists and add the "TF" column
cv = CountVectorizer(vocabSize=1000, inputCol="words", outputCol="TF")
cvmodel = cv.fit(df)
df = cvmodel.transform(df)
df.take(1)

[Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'whic

In [16]:
# Vocabulary learned by the fitted CountVectorizer model
cvmodel.vocabulary

['p',
 'the',
 'i',
 'to',
 'code',
 'a',
 'gt',
 'lt',
 'is',
 'and',
 'pre',
 'in',
 'this',
 'of',
 'it',
 'that',
 'for',
 '0',
 '1',
 'have',
 'my',
 'if',
 'on',
 'but',
 'with',
 'can',
 'not',
 'be',
 'as',
 't',
 'li',
 'from',
 '2',
 's',
 'http',
 'an',
 'm',
 'strong',
 'new',
 'how',
 'do',
 'com',
 'so',
 'or',
 'at',
 'using',
 'when',
 'am',
 'like',
 'class',
 'id',
 'there',
 'get',
 'are',
 'name',
 'what',
 'any',
 'file',
 'string',
 'data',
 'all',
 'which',
 'want',
 'would',
 'amp',
 'use',
 'java',
 'function',
 'public',
 'some',
 '3',
 'text',
 'error',
 'android',
 'value',
 'c',
 'x',
 'href',
 'you',
 'one',
 'by',
 'user',
 'me',
 'server',
 'type',
 'here',
 'way',
 'return',
 'int',
 'will',
 'div',
 'need',
 'then',
 'set',
 'e',
 'system',
 'has',
 'problem',
 'out',
 'php',
 'no',
 'just',
 '4',
 'org',
 'know',
 'html',
 'only',
 'where',
 'page',
 'application',
 '5',
 'thanks',
 'var',
 'br',
 'we',
 'd',
 'should',
 'does',
 'add',
 'n',
 'true',

In [17]:
# Last ten entries of the vocabulary list
cvmodel.vocabulary[-10:]

['customer',
 'desktop',
 'buttons',
 'previous',
 'math',
 'master',
 '000',
 'blog',
 'comes',
 'wordpress']

## Inverse Document Frequency

In [18]:
# Fit an IDF model on the TF vectors and append the re-weighted "TFIDF" column
idf = IDF(outputCol="TFIDF", inputCol="TF")
idfModel = idf.fit(df)
df = idfModel.transform(df)
df.head()

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'which

## String Indexer

In [19]:
# Map each oneTag string to a numeric index stored in the "label" column
indexer = StringIndexer(outputCol="label", inputCol="oneTag")
indexerModel = indexer.fit(df)
df = indexerModel.transform(df)

In [20]:
# Display the first row, which now includes the numeric label column
df.head()

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'which

## PCA

In [27]:
# Reduce the TFIDF vectors to their first 100 principal components ("pcaTFIDF")
pca = PCA(
    k=100,
    inputCol="TFIDF",
    outputCol="pcaTFIDF",
)
model = pca.fit(df)
df = model.transform(df)

In [28]:
# Display the first row, which now includes the pcaTFIDF column
df.head()

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'which

In [31]:
# NumTags = number of space-separated entries in the Tags string
number_of_tags = udf(lambda tags: len(tags.split(" ")), IntegerType())
df = df.withColumn("NumTags", number_of_tags(col("Tags")))

In [32]:
# Display the first row, which now includes the NumTags column
df.head()

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'which

In [33]:
# Number of posts for each tag count, listed in ascending NumTags order
tag_counts = df.groupBy("NumTags").count()
tag_counts.orderBy("NumTags").show()

+-------+-----+
|NumTags|count|
+-------+-----+
|      1|13858|
|      2|26540|
|      3|28769|
|      4|19108|
|      5|11725|
+-------+-----+



In [39]:
# Average BodyLength for each tag count, listed in ascending NumTags order
(
    df.groupBy("NumTags")
    .agg(avg("BodyLength"))
    .orderBy("NumTags")
    .show()
)

+-------+------------------+
|NumTags|   avg(BodyLength)|
+-------+------------------+
|      1|135.41311877615817|
|      2|153.82456669178598|
|      3|172.73704334526747|
|      4|192.67050450073268|
|      5|218.54251599147122|
+-------+------------------+



In [41]:
# Wrap BodyLength alone in a one-element vector column for use as a model feature
assembler = VectorAssembler(
    inputCols=["BodyLength"],
    outputCol="LengthFeature",
)
df = assembler.transform(df)
df.head()

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'which

## Regressão Linear

In [47]:
# Linear regression without an intercept term, no regularization, "normal" solver
lr = LinearRegression(
    solver="normal",
    fitIntercept=False,
    regParam=0.0,
    maxIter=5,
)

In [48]:
# Regression input: NumTags is the target, the body-length vector the sole feature
label_column = col("NumTags").alias("label")
features_column = col("LengthFeature").alias("features")
data = df.select(label_column, features_column)
data.head()

Row(label=5, features=DenseVector([83.0]))

In [49]:
# Fit the linear regression on the (label, features) DataFrame
lrModel = lr.fit(data)

Exception ignored in: <function JavaWrapper.__del__ at 0x7fa5a95579d0>
Traceback (most recent call last):
  File "/home/rterra/anaconda3/lib/python3.8/site-packages/pyspark/ml/wrapper.py", line 39, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'VectorAssembler' object has no attribute '_java_obj'


In [50]:
# Fitted coefficients; a single value here because the model has one feature
lrModel.coefficients

DenseVector([0.0079])

In [51]:
# Fitted intercept; 0.0 because the model was built with fitIntercept=False
lrModel.intercept

0.0

In [52]:
# Summary object of the fitted model, exposing metrics such as r2
lrModelSummary = lrModel.summary

In [53]:
# r2 metric reported by the model summary
lrModelSummary.r2

0.42481762576079996

## Logistic Regression

In [54]:
# Classification input: indexed tag as the label, TFIDF vector as the features
selected_columns = [
    col("label").alias("label"),
    col("TFIDF").alias("features"),
]
data2 = df.select(*selected_columns)
data2.head()

Row(label=3.0, features=SparseVector(1000, {0: 0.0026, 1: 0.7515, 2: 0.1374, 3: 0.3184, 5: 0.3823, 8: 1.0754, 9: 0.3344, 15: 0.5899, 21: 1.8551, 28: 1.1263, 31: 1.1113, 35: 3.3134, 36: 1.2545, 43: 2.3741, 45: 2.3753, 48: 1.2254, 51: 1.1879, 57: 11.0264, 61: 2.8957, 71: 2.1945, 78: 1.6947, 84: 6.5898, 86: 1.6136, 94: 2.3569, 97: 1.8218, 99: 2.6292, 100: 1.9206, 115: 2.3592, 147: 5.4841, 152: 2.1116, 169: 2.6328, 241: 2.5745, 283: 3.2325, 306: 3.2668, 350: 6.2367, 490: 3.8893, 578: 3.6182, 759: 3.7771, 832: 8.8964}))

In [58]:
# Logistic regression: at most 10 iterations, no regularization
lr2 = LogisticRegression(
    regParam=0.0,
    maxIter=10,
)

In [None]:
# Fit the logistic regression on the (label, features) DataFrame
lrModel2 = lr2.fit(data2)

In [None]:
# The label takes more than two distinct values, so the fitted model is multinomial.
# Spark exposes multinomial coefficients as a matrix (one row per class);
# `coefficients` is only available for binomial models and raises otherwise.
lrModel2.coefficientMatrix

In [None]:
# Intercepts of the fitted logistic regression model.
# Fixes the misspelled attribute `intercoptVector`, which raised AttributeError.
lrModel2.interceptVector

In [None]:
# Accuracy reported by the fitted model's summary
lrModel2.summary.accuracy