

<h1><center> CSE 487/587 Assignment 3: Predictive Analytics with Spark</center></h1>

<h1><center> PART 1 - Basic Model</center></h1>

## Importing packages and checking the version

In [None]:
# Checking the JAVA version
!java -version

In [None]:
# Import findspark and pyspark
import os

import findspark

# Prefer an already-configured SPARK_HOME so the notebook is not tied to one
# machine's directory layout; fall back to the original install path otherwise.
findspark.init(os.environ.get('SPARK_HOME') or '/home/cse587/spark-2.4.0-bin-hadoop2.7')
import pyspark

In [None]:
pyspark

## Importing Libraries

In [None]:
import pandas as pd
from pyspark.sql import SparkSession 
from pyspark.sql.functions import col, lower, regexp_replace, split, size,array
from pyspark.sql.functions import udf
from pyspark.sql.functions import UserDefinedFunction
import pyspark.sql.functions as sf
from pyspark.sql.types import StringType,IntegerType
from pyspark.ml import Pipeline
from pyspark.ml.feature import  Tokenizer, StopWordsRemover,HashingTF, IDF, Word2Vec
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier

## Creating Pyspark Session

In [None]:
# Setting the max memory to 8gb RAM
MAX_MEMORY = "8g"

# Creating the spark session (executor and driver both get MAX_MEMORY)
spark = (
    SparkSession.builder
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .getOrCreate()
)

from pyspark.sql import SQLContext
# SQLContext expects a SparkContext as its first argument and the session as the
# second; passing the SparkSession alone only worked because it happens to expose
# the private attributes the constructor reads.
sqlContext = SQLContext(spark.sparkContext, spark)

## Importing Train, Test and Mapping datasets

In [None]:
# Load the train and test sets with pandas; they are converted to Spark DataFrames further down.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
# Because `names=` is given, pandas treats every line of mapping.csv as a data row
# (including a header line, if the file has one) and labels the columns genre_id / genre.
mapping = pd.read_csv("mapping.csv",names=["genre_id", "genre"])

## Checking datasets

In [None]:
train.head()

In [None]:
test.head()

In [None]:
mapping.head()

## Converting pandas datasets to PySpark

In [None]:
# Turn the pandas frames into Spark DataFrames so the rest of the notebook runs on Spark.
spark_df = sqlContext.createDataFrame(train)
spark_test_df= sqlContext.createDataFrame(test)

In [None]:
spark_df.show(2)

In [None]:
spark_test_df.show(2)

## Selecting particular columns

In [None]:
spark_df.select("movie_id","movie_name","plot","genre").show() 

## Data Preprocessing

### Data Cleaning 

In [None]:
# Using pyspark.sql.functions regexp_replace to remove punctuations and special characters
def clean_text(c):
    """Return a Column expression that normalises the text in column ``c``.

    The text is lower-cased, a leading "rt " is stripped, http/https URLs are
    removed, and every character that is not a letter, digit or whitespace is
    dropped.

    Parameters
    ----------
    c : pyspark.sql.Column
        The string column to clean.

    Returns
    -------
    pyspark.sql.Column
        The cleaned column expression.
    """
    c = lower(c)
    # Raw strings keep the regex backslashes literal; the previous plain strings
    # relied on invalid escape sequences (`\:`, `\S`), which newer Python versions
    # warn about. The patterns handed to Spark are unchanged.
    c = regexp_replace(c, r"^rt ", "")
    c = regexp_replace(c, r"(https?\://)\S+", "")
    c = regexp_replace(c, r"[^a-zA-Z0-9\s]", "")
    return c

# Only the "plot" column is cleaned; the remaining columns are passed through as they are.
cleaned_plot = clean_text(col("plot")).alias("plot")

# Training data keeps its "genre" column next to the cleaned plot
clean_text_df = spark_df.select("movie_id", "movie_name", cleaned_plot, "genre")
# Test data gets the same cleaning; no "genre" column is selected
clean_test_df = spark_test_df.select("movie_id", "movie_name", cleaned_plot)

# Show the schema and the first five rows of each cleaned frame (train first, then test)
for cleaned_df in (clean_text_df, clean_test_df):
    cleaned_df.printSchema()
    cleaned_df.show(5)

## Creating the pipeline that defines the step-by-step feature engineering

In [None]:
# Stage 1 - tokenisation: split each plot into a list of words
tokenizer = Tokenizer(outputCol="words", inputCol="plot")

# Stage 2 - stop-word removal, using the stop-word list of a default StopWordsRemover
remover = StopWordsRemover()
stopwords = remover.getStopWords()
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=stopwords)

# Stage 3 - term frequencies of the remaining words, hashed into 100 buckets
hashing = HashingTF(numFeatures=100, inputCol="filtered", outputCol="rawFeatures")

# Stage 4 - chain the three stages into a single pipeline
feature_stages = [tokenizer, stopwordsRemover, hashing]
tfmodel_pipeline = Pipeline(stages=feature_stages)

## Preparing Feature data for Train data

In [None]:
# Fit the feature pipeline on the cleaned training data
model1 = tfmodel_pipeline.fit(clean_text_df) 
# Apply the fitted pipeline to the training data; the result adds the
# "words", "filtered" and "rawFeatures" columns produced by the pipeline stages
featurizedData = model1.transform(clean_text_df)
featurizedData.show(5)

## Preparing Feature data for Test data

In [None]:
# Reuse the pipeline model fitted on the TRAINING data so the test set goes through
# exactly the same feature transformation. Fitting a second pipeline on the test set
# would leak test statistics as soon as a stage that learns from data (e.g. IDF) is
# added to the pipeline.
model2 = model1  # alias kept so any later reference to `model2` still resolves
# Transform the test data into its own feature dataframe
testData = model2.transform(clean_test_df)
testData.show(5)

In [None]:
# Selecting only necessary columns
# Keep the identifiers, the raw text, the genre and the hashed features; the intermediate
# "words" and "filtered" columns are left out of the selection.
featurizedData= featurizedData.select("movie_id","movie_name","plot","genre","rawFeatures")

In [None]:
featurizedData.show()

## Label Processing

In [None]:
# mapping.csv supplies the ordered genre vocabulary. Each movie's genres are later encoded
# as a string of 0/1 flags, one flag per genre, in this order.

# Skip the first row through a new variable instead of rebinding `mapping`, so that
# re-running this cell does not throw away one more genre every time.
# NOTE(review): the first row is presumably the CSV header that was read as data - confirm.
genre_mapping = mapping[1:]
mapping_spark = sqlContext.createDataFrame(genre_mapping)

# Collect the genre column to the driver once; the rows are reused below and when
# the label string is split into columns.
mapping_genre_list = mapping_spark.select("genre").collect()

# Genre names in file order (no extra count() job is needed to iterate the collected rows)
genreList = [row[0] for row in mapping_genre_list]

In [None]:
genreList

In [None]:
# This will convert the genre to the binary like string(1 and 0) specifying multilabels of the given movie_id
def oneHotEncoding(x, genres=None):
    """Encode the genres of one movie as space-separated 0/1 flags.

    One flag is produced per entry of ``genres``, in order: "1" when the genre
    is present in ``x`` and "0" otherwise, e.g. ``"1 0 1"``.

    Parameters
    ----------
    x : str, list, tuple, set or None
        Genre value of a single movie. When it is a string holding a Python-style
        list literal (e.g. ``"['Drama', 'Comedy']"``) it is parsed and genres are
        matched exactly, so "Action" is not reported for a movie that only has
        "Action/Adventure". Any other string falls back to substring matching.
        ``None`` is treated as "no genres".
    genres : sequence of str, optional
        Ordered genre vocabulary. Defaults to the notebook-level ``genreList``.

    Returns
    -------
    str
        The flags joined by single spaces; an empty string for an empty vocabulary.
    """
    import ast

    if genres is None:
        genres = genreList
    if x is None:
        # A missing genre value previously raised TypeError inside the UDF
        return " ".join("0" for _ in genres)

    present = x  # default: membership test directly on the value (substring test for strings)
    if isinstance(x, str):
        try:
            parsed = ast.literal_eval(x)
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, (list, tuple, set)):
            # Exact membership on the parsed genre names avoids substring false positives
            present = list(parsed)

    return " ".join("1" if genre in present else "0" for genre in genres)

## Mapping Label with feature vectors

In [None]:
# Wrap the encoder as a Spark UDF that returns a string
myfunction = UserDefinedFunction(lambda x: oneHotEncoding(x), StringType())

# Rebuild the projection column by column: "genre" is replaced by its encoded form
# under the name "labels"; every other column is kept unchanged and in place.
projection = []
for column_name in featurizedData.columns:
    if column_name == "genre":
        projection.append(myfunction(column_name).alias("labels"))
    else:
        projection.append(column_name)
featurizedData = featurizedData.select(*projection)
featurizedData.show()

In [None]:
# Keep only the movie id and its encoded label string; the string is split into
# one column per genre in the next step.
label=featurizedData.select("movie_id","labels")

In [None]:
label.show()

## Splitting Label columns

In [None]:
from pyspark.sql.types import StringType,IntegerType
# Split the space-separated label string into one integer column per genre
# (label0, label1, ...), following the order of the collected genre rows.
split_col = sf.split(label['labels'], ' ')
for genre_idx, _ in enumerate(mapping_genre_list):
    label_flag = split_col.getItem(genre_idx).cast(IntegerType())
    label = label.withColumn('label%d' % genre_idx, label_flag)

In [None]:
label.show()

## Joining Feature vector column with label columns

In [None]:
# `label` already carries the encoded "labels" string, so drop it from the feature
# frame to leave a single copy after the join.
featurizedData = featurizedData.drop('labels')
# Join on the column NAME rather than on `label.movie_id == featurizedData.movie_id`:
# both frames derive from the same DataFrame, so those two column references are
# ambiguous. A name-based join also yields exactly one movie_id column, which makes
# the follow-up drop() unnecessary. Columns are accessed by name downstream, so the
# resulting column order does not matter.
df = featurizedData.join(label, on='movie_id', how='inner')

In [None]:
df.show()

## Function for Random Forest Classifier

In [None]:
def fit_and_classify(df, labelcol, featurescol, num_trees=3, max_depth=2, seed=None):
    """Fit a Random Forest classifier, wrapped in a one-stage Pipeline, on ``df``.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Training data containing the label and feature columns.
    labelcol : str
        Name of the label column.
    featurescol : str
        Name of the feature-vector column.
    num_trees : int, optional
        Number of trees in the forest (default 3, the previously hard-coded value).
    max_depth : int, optional
        Maximum depth of each tree (default 2, the previously hard-coded value).
    seed : int, optional
        Random seed for the forest. When omitted, the classifier's own default is used.

    Returns
    -------
    pyspark.ml.PipelineModel
        The fitted pipeline; call ``.transform`` on it to obtain predictions.
    """
    classifier_kwargs = dict(
        labelCol=labelcol,
        featuresCol=featurescol,
        numTrees=num_trees,
        maxDepth=max_depth,
    )
    # Only forward the seed when one was supplied, so the default call configures
    # the classifier exactly as before.
    if seed is not None:
        classifier_kwargs['seed'] = seed
    RandomForest_classifier = RandomForestClassifier(**classifier_kwargs)
    RandomForest_classifier_pipeline = Pipeline(stages=[RandomForest_classifier])
    RandomForest_model = RandomForest_classifier_pipeline.fit(df)
    return RandomForest_model

## Training the Model

In [None]:
# One binary Random Forest is trained per label column and used to predict the test set.
# Before training, the larger class of each label is undersampled (without replacement)
# down to roughly the size of the smaller class, so the classifier sees balanced data.

NUM_LABELS = 20      # must match the prediction0..prediction19 columns used in the cells below
SAMPLING_SEED = 42   # fixed seed so the undersampling is reproducible across runs

label_to_append = []
for i in range(NUM_LABELS):
    label_name = 'label{}'.format(i)
    filter_df = df.select('movie_id', 'rawFeatures', label_name)
    positive_df = filter_df.filter(filter_df[label_name] == 1)  # rows that carry this label
    negative_df = filter_df.filter(filter_df[label_name] == 0)  # rows that do not

    # Every count() launches a Spark job, so compute each one exactly once
    positive_count = positive_df.count()
    negative_count = negative_df.count()
    if positive_count == 0 or negative_count == 0:
        # Previously this surfaced as an opaque ZeroDivisionError
        raise ValueError(
            "Cannot balance {}: {} positive and {} negative rows".format(
                label_name, positive_count, negative_count))

    if negative_count > positive_count:
        larger_df, smaller_df = negative_df, positive_df
        smaller_count, larger_count = positive_count, negative_count
    else:
        larger_df, smaller_df = positive_df, negative_df
        smaller_count, larger_count = negative_count, positive_count

    # float() keeps this a true division even under Python 2
    fraction = float(smaller_count) / larger_count
    sampled_df = larger_df.sample(False, fraction, SAMPLING_SEED)
    final_df = sampled_df.union(smaller_df)  # balanced training frame for this label

    RandomForest_model = fit_and_classify(final_df, labelcol=label_name, featurescol='rawFeatures')
    predictions = RandomForest_model.transform(testData)  # predictions for the test set
    label_to_append.append(predictions.select('movie_id', 'prediction'))  # keep id + prediction only

In [None]:
# Give every per-label prediction frame a uniquely named prediction column
# (prediction0, prediction1, ...) so the frames can be joined side by side.

label_to_append[:] = [
    predictions_df.withColumnRenamed('prediction', 'prediction%d' % idx)
    for idx, predictions_df in enumerate(label_to_append)
]

In [None]:
# Fold all per-label prediction frames into one wide frame keyed by movie_id

joined_df, remaining_predictions = label_to_append[0], label_to_append[1:]
for next_predictions in remaining_predictions:
    joined_df = joined_df.join(next_predictions, how='inner', on=['movie_id'])

## Assembling the predictions

In [None]:
from pyspark.ml.feature import VectorAssembler

# Derive the input column names from the number of prediction frames instead of
# spelling out all of them by hand.
prediction_cols = ['prediction{}'.format(i) for i in range(len(label_to_append))]
vecAssembler = VectorAssembler(inputCols=prediction_cols, outputCol='Predictions')
# Dropping an earlier 'Predictions' column (a no-op on the first run) keeps this cell
# re-runnable; the assembler refuses to write to an output column that already exists.
joined_df = vecAssembler.transform(joined_df.drop('Predictions'))

In [None]:
joined_df.show(5)

In [None]:
# Build one space-separated string out of the 20 per-label predictions: each value is
# cast to an integer and consecutive values are separated by a single space.
concat_parts = []
for idx in range(20):
    if idx > 0:
        concat_parts.append(sf.lit(' '))
    concat_parts.append(sf.col('prediction{}'.format(idx)).cast(IntegerType()))
result_df = joined_df.withColumn('predictions', sf.concat(*concat_parts))


In [None]:
# Keep only the movie id and the combined prediction string for the output file
result_df=result_df.select("movie_id","predictions")

In [None]:
result_df.show(5)

## Saving the result dataframe into CSV file

In [None]:
# coalesce(1) collapses the result into a single partition before writing.
# mode("overwrite") lets the notebook be re-run from the top: the default save mode
# raises an error once the output directory exists from a previous run.
result_df.coalesce(1).write.format("csv").option("header", "true").mode("overwrite").save("DIC_Assignment_3_Part1")