# Aim
Understanding the concept of a Spark ML Pipeline by implementing Logistic Regression.

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer,IDF,CountVectorizer,StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.pipeline import Pipeline

In [2]:
# Start a SparkSession on the 'local' master for this notebook
# (getOrCreate hands back an already-running session if there is one).
spark = (
    SparkSession.builder
    .master('local')
    .appName('Logistic Regression')
    .getOrCreate()
)

## Reading the JSON file into a Spark DataFrame

In [3]:
# Load the training data from the JSON file into a Spark DataFrame.
df=spark.read.json('../data/Train_onetag_small.json')
# Persist df because it is reused below, both to fit the pipeline and to transform with it.
# As the last expression in the cell, this call also displays the DataFrame's column summary.
df.persist()

DataFrame[Body: string, Id: bigint, Tags: string, Title: string, oneTag: string]

## Implementing Pipeline

In [5]:
# --- Feature stages -------------------------------------------------------
# Split the raw 'Body' text into tokens on every non-word character.
regexTokenizer = RegexTokenizer(
    inputCol='Body',
    outputCol='words',
    pattern="\\W",
)

# Term counts over a vocabulary capped at 1000 entries.
cv = CountVectorizer(
    inputCol='words',
    outputCol='TF',
    vocabSize=1000,
)

# Re-weight the term counts by inverse document frequency; the result is the
# 'features' column the classifier consumes.
idf = IDF(inputCol='TF', outputCol='features')

# Encode the string tag in 'oneTag' as a numeric 'label' column.
indexer = StringIndexer(inputCol='oneTag', outputCol='label')

# --- Estimator ------------------------------------------------------------
# Logistic regression with regularisation switched off (regParam=0.0).
lr = LogisticRegression(
    maxIter=10,
    regParam=0.0,
    elasticNetParam=0,
)

# --- Pipeline -------------------------------------------------------------
# Stages run in list order: tokenise -> count -> IDF -> index label -> classify.
pipeline = Pipeline(
    stages=[
        regexTokenizer,
        cv,
        idf,
        indexer,
        lr,
    ]
)


## Checking that the pipeline works

In [6]:
# Fit the whole pipeline (feature stages plus logistic regression) on df in a single call;
# plr_model is the fitted pipeline used for the transform step below.
plr_model=pipeline.fit(df)

In [8]:
# Apply the fitted pipeline to the same df it was trained on; df2 keeps the original
# columns and gains the columns produced by the stages (e.g. 'words', 'label', 'prediction').
df2=plr_model.transform(df)
# Display the first row to inspect the columns added by the pipeline.
df2.head()

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'which

In [10]:
# Count the rows where the predicted class index matches the indexed true label.
# df2 comes from transforming the training DataFrame, so this is a count of
# correct predictions on the training data.
is_correct = df2['label'] == df2['prediction']
df2.where(is_correct).count()

36740

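For better performance, we can increase the vocabulary size of our TF-IDF features. We can also improve performance through hyperparameter tuning; see the other notebook (.ipynb) for this. Note that the count above is computed on the same data the model was trained on, so it does not show how well the model generalizes to unseen data.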