## Part 3 : Using Word2vec for feature engineering and model was built using Random Forest Algorithm

In [1]:
import findspark

In [2]:
findspark.init('/home/cse587/spark-2.4.0-bin-hadoop2.7')

In [3]:
import pyspark

In [4]:
from pyspark.sql import *

# Build the SparkSession
spark = SparkSession.builder \
   .appName("Assignment3p3") \
   .config("spark.some.config.option", "8gb") \
   .getOrCreate()
   


In [5]:
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [6]:
from pyspark.sql import SQLContext
sqlcontext=SQLContext(sc)

## Loading necessary libraries for this part

In [7]:
import numpy as np
import pandas as pd
import re
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
import re
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
import numpy as np
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.sql.functions import udf, col, lower, regexp_replace
from pyspark.sql import functions as F
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, VectorIndexer, IndexToString
from pyspark.ml.feature import Word2Vec

## Loading training dataset

In [8]:

train_data = pd.read_csv("/home/cse587/Project3/train.csv",sep=",", header = 0)

In [9]:
train_data.head()

Unnamed: 0,movie_id,movie_name,plot,genre
0,23890098,Taxi Blues,"Shlykov, a hard-working taxi driver and Lyosha...","['World cinema', 'Drama']"
1,31186339,The Hunger Games,The nation of Panem consists of a wealthy Capi...,"['Action/Adventure', 'Action', 'Science Fictio..."
2,20663735,Narasimham,Poovalli Induchoodan is sentenced for six yea...,"['Musical', 'Action', 'Drama']"
3,2231378,The Lemon Drop Kid,"The Lemon Drop Kid , a New York City swindler,...",['Comedy']
4,595909,A Cry in the Dark,Seventh-day Adventist Church pastor Michael Ch...,"['Crime Fiction', 'World cinema', 'Drama']"


# Converting pandas dataframe to pyspark dataframe

In [10]:
training_df = spark.createDataFrame(train_data)
# training_df.head(1)

## Cleaning the data

In [11]:

regexTokenizer = RegexTokenizer(inputCol="plot", outputCol="words", pattern="\\W")

wordsData = regexTokenizer.transform(training_df)



In [13]:

wordsData.show(1)

+--------+----------+--------------------+--------------------+--------------------+
|movie_id|movie_name|                plot|               genre|               words|
+--------+----------+--------------------+--------------------+--------------------+
|23890098|Taxi Blues|Shlykov, a hard-w...|['World cinema', ...|[shlykov, a, hard...|
+--------+----------+--------------------+--------------------+--------------------+
only showing top 1 row



## Fitting and transforming Word2vec to the modified data

In [12]:
word2vec = Word2Vec(vectorSize = 100, minCount = 5, inputCol = 'words', outputCol = 'features')
model = word2vec.fit(wordsData)
word2vec_data = model.transform(wordsData)

In [20]:
word2vec_data.head(1)

[Row(movie_id=23890098, movie_name='Taxi Blues', plot="Shlykov, a hard-working taxi driver and Lyosha, a saxophonist, develop a bizarre love-hate relationship, and despite their prejudices, realize they aren't so different after all.", genre="['World cinema', 'Drama']", words=['shlykov', 'a', 'hard', 'working', 'taxi', 'driver', 'and', 'lyosha', 'a', 'saxophonist', 'develop', 'a', 'bizarre', 'love', 'hate', 'relationship', 'and', 'despite', 'their', 'prejudices', 'realize', 'they', 'aren', 't', 'so', 'different', 'after', 'all'], result=DenseVector([-0.0093, -0.0276, 0.0352, -0.0737, 0.0013, -0.0538, 0.0286, 0.0399, 0.0213, 0.0282, 0.0882, -0.0535, 0.0088, 0.0018, -0.0203, 0.006, -0.011, -0.0338, 0.0361, -0.1029, 0.0324, -0.0044, 0.0606, 0.0576, 0.0389, 0.0091, -0.0722, 0.0204, 0.0166, 0.0138, -0.0051, 0.0088, 0.025, -0.0762, 0.1317, 0.0337, 0.0803, 0.0133, -0.0832, 0.0594, 0.044, 0.0837, -0.0601, -0.1051, 0.0032, -0.0801, -0.0658, -0.1003, 0.0344, -0.0654, 0.0645, 0.039, -0.0146, 0.04

Converting genre into labels using StringIndexer

In [13]:
label_stringIdx = StringIndexer(inputCol = "genre", outputCol = "label")
fitter = label_stringIdx.fit(word2vec_data)
rescaledData = fitter.transform(word2vec_data)
labels = fitter.labels

## Fitting Machine Learning model on the training dataset

In [14]:
(training_data,testing_data) = rescaledData.randomSplit([0.7,0.3], seed=4234)

In [15]:
## Fitting Random Forest model on the training data

from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=20, maxBins=100)
model = rf.fit(training_data)

## Saving the trained model

In [16]:
model.save("/home/cse587/Project3/randomforest_for_part3")

In [18]:
## fitting the obtained model on validation test data
predictions = model.transform(training_data)


In [19]:
## Evaluating the accuracy of predicted labels
evaluator = MulticlassClassificationEvaluator(labelCol = "label", predictionCol="prediction")
evaluator.evaluate(predictions)

0.046310743304631356

## Loading test data from local using Pandas and converted into pyspark dataframe

In [22]:

test_data = pd.read_csv("/home/cse587/Project3/test.csv",sep=",", header = 0)
testing_df = spark.createDataFrame(test_data)
# testing_df.head(1)

## pre-processing the test data

In [27]:
regexTokenizer = RegexTokenizer(inputCol="plot", outputCol="words", pattern="\\W")
word2vec = Word2Vec(vectorSize = 100, minCount = 5, inputCol = 'words', outputCol = 'features')

pipeline = Pipeline(stages=[regexTokenizer, word2vec])

idfModel = pipeline.fit(training_df)
test_data2 = idfModel.transform(testing_df)



In [30]:
test_result.show(1)

+--------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+----------------+
|movie_id|movie_name|                plot|               words|            features|       rawPrediction|         probability|prediction|originalCategory|
+--------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+----------------+
| 1335380|    Exodus|The film is based...|[the, film, is, b...|[0.03900388203832...|[2.29609556569081...|[0.11480477828454...|       0.0|       ['Drama']|
+--------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+----------------+
only showing top 1 row



## Predicting the test labels by fitting the trained model

In [29]:
test_pred2 = model.transform(test_data2)

converter = IndexToString(inputCol="prediction", outputCol="originalCategory", labels = labels)
test_result = converter.transform(test_pred2)
# test_result.show(2)

## Loading the mapping.csv into a dataframe

In [31]:
mapping = pd.read_csv('/home/cse587/Project3/mapping.csv')
colnames = ['label','name']
mapping.columns =colnames
mapping_df = sqlContext.createDataFrame(mapping)
# valuelist = [row.name for row in mapping_df.collect()]
mapping.head()


Unnamed: 0,number,name
0,0,Drama
1,1,Comedy
2,2,Romance Film
3,3,Thriller
4,4,Action


In [32]:
def convert(lines):
    lines = lines[1:-1].split(', ')
    temp=[0]*20
    for i in lines:   
        for k,v in mapping.iterrows():
            if i == ('\''+ v['name'] +'\''):
                temp[k] = 1
                continue
    return " ".join(map(str,temp))

converter = udf(convert)
test_result = test_result.withColumn("predictions",converter('originalCategory'))

## Extracting the final dataframe

In [33]:
part2_result = test_result.select("movie_id","predictions")


## Saving the predicted results in .csv file

In [34]:
part2_result.write.csv('part3_test_result_rf.csv', header = True)