In [1]:
import findspark
findspark.init('/home/cse587/spark-2.4.0-bin-hadoop2.7')
import pyspark
from pyspark.sql import *
from pyspark.ml.feature import StopWordsRemover,RegexTokenizer, Word2Vec, Word2VecModel 
import pyspark.sql.functions as f
from pyspark.sql.types import *

In [2]:
spark = SparkSession \
        .builder \
        .appName("PA3") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()

In [3]:
import pandas as pd
df = pd.read_csv('train.csv')
traindataframe = spark.createDataFrame(df)

df = pd.read_csv('test.csv')
testdataframe = spark.createDataFrame(df)

In [4]:
regexTokenizer = RegexTokenizer(inputCol="plot", outputCol="words", pattern="\\W")
traindataframe = regexTokenizer.transform(traindataframe)
testdataframe = regexTokenizer.transform(testdataframe)

swremover = StopWordsRemover(inputCol="words", outputCol="filteredwords")
traindataframe = swremover.transform(traindataframe)
testdataframe = swremover.transform(testdataframe)

In [5]:
# word2Vec = Word2Vec(vectorSize = 300, inputCol="filteredwords", outputCol="features")

# trainmodel = word2Vec.fit(traindataframe)
# trainmodel.save('./Models/Word2Vec/Part4/train')
# traindataframe = trainmodel.transform(traindataframe)

# testmodel = word2Vec.fit(testdataframe)
# testmodel.save('./Models/Word2Vec/Part4/test')
# testdataframe = testmodel.transform(testdataframe)

In [6]:
trainmodel = Word2VecModel.load('./Models/Word2Vec/Part4/train')
traindataframe = trainmodel.transform(traindataframe)
testmodel = Word2VecModel.load('./Models/Word2Vec/Part4/test')
testdataframe = testmodel.transform(testdataframe)

In [7]:
traindataframe.select('movie_id','features','genre').show()

+--------+--------------------+--------------------+
|movie_id|            features|               genre|
+--------+--------------------+--------------------+
|23890098|[0.01831449880538...|['World cinema', ...|
|31186339|[0.00456744958522...|['Action/Adventur...|
|20663735|[-0.0367420313129...|['Musical', 'Acti...|
| 2231378|[-0.0091700711571...|          ['Comedy']|
|  595909|[-0.0433986019272...|['Crime Fiction',...|
| 5272176|[-0.0178253222105...|['Action/Adventur...|
| 1952976|[0.01390415212053...|['Thriller', 'Dra...|
|24225279|[0.01318266900042...|           ['Drama']|
| 2462689|[-0.0130439571041...|['Black-and-white...|
|20532852|[0.03913155915136...|['Animation', 'Sh...|
|15401493|[0.01010896813653...|          ['Comedy']|
|18188932|[-0.0040025239122...|['Crime Fiction',...|
| 2940516|[0.01111899777863...|          ['Comedy']|
| 1480747|[0.01134374802789...|          ['Comedy']|
|24448645|[0.00608981265288...|          ['Horror']|
|15072401|[-0.0257594372184...|['Crime Fiction

In [8]:
testdataframe.select('movie_id','features').show()

+--------+--------------------+
|movie_id|            features|
+--------+--------------------+
| 1335380|[0.01829081308152...|
|29062594|[0.01365811865364...|
| 9252321|[0.02314284921792...|
|13455076|[0.02243307114765...|
|24165951|[-0.0095877460553...|
| 1925869|[0.01974596704884...|
|10799612|[0.02829631792485...|
|28238240|[0.02005233356196...|
|17124781|[0.01064345360005...|
|28207941|[-0.0139784252096...|
|19174305|[0.01719887974882...|
|18392317|[0.00911633149819...|
|34420857|[0.04950814312819...|
| 4039635|[0.01262030645992...|
| 8034072|[0.02704978911913...|
| 4016437|[0.02382601307799...|
| 1520023|[0.00884612712464...|
|24589422|[0.03553598551564...|
|35068740|[0.03245457818190...|
|21132951|[0.03732828488346...|
+--------+--------------------+
only showing top 20 rows



In [9]:
mapping = spark.read.load("./mapping.csv", format="csv",sep=",", inferschema="true", header="true")
genre_map = mapping.select("_c0", "0").rdd.collectAsMap()
genre_map

{0: 'Drama',
 1: 'Comedy',
 2: 'Romance Film',
 3: 'Thriller',
 4: 'Action',
 5: 'World cinema',
 6: 'Crime Fiction',
 7: 'Horror',
 8: 'Black-and-white',
 9: 'Indie',
 10: 'Action/Adventure',
 11: 'Adventure',
 12: 'Family Film',
 13: 'Short Film',
 14: 'Romantic drama',
 15: 'Animation',
 16: 'Musical',
 17: 'Science Fiction',
 18: 'Mystery',
 19: 'Romantic comedy'}

In [10]:
import pyspark.sql.functions as F

from pyspark.sql.types import *
def generateLabels(row, i): 
    label = 0
    for x in row[1:-1].split(","):
        if x.strip()[1:-1] == genre_map.get(i):
            label = 1
    return label
udfFunc = F.udf(generateLabels, IntegerType())
for i in range(len(genre_map)):    
    traindataframe = traindataframe.withColumn("label"+str(i), udfFunc("genre",F.lit(i)))
traindataframe.printSchema()

root
 |-- movie_id: long (nullable = true)
 |-- movie_name: string (nullable = true)
 |-- plot: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filteredwords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)
 |-- label0: integer (nullable = true)
 |-- label1: integer (nullable = true)
 |-- label2: integer (nullable = true)
 |-- label3: integer (nullable = true)
 |-- label4: integer (nullable = true)
 |-- label5: integer (nullable = true)
 |-- label6: integer (nullable = true)
 |-- label7: integer (nullable = true)
 |-- label8: integer (nullable = true)
 |-- label9: integer (nullable = true)
 |-- label10: integer (nullable = true)
 |-- label11: integer (nullable = true)
 |-- label12: integer (nullable = true)
 |-- label13: integer (nullable = true)
 |-- label14: integer (nullable = true)
 |-- label15: integer (nullable = tr

In [11]:
def generateLabelCol(): 
    return ""
udfFunc = F.udf(generateLabelCol, StringType())

testdataframe = testdataframe.withColumn("predictions", udfFunc())
testdataframe.printSchema()

root
 |-- movie_id: long (nullable = true)
 |-- movie_name: string (nullable = true)
 |-- plot: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filteredwords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)
 |-- predictions: string (nullable = true)



In [12]:
# from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
# from pyspark.mllib.regression import LabeledPoint
# from pyspark.mllib.linalg import Vector as MLLibVector, Vectors as MLLibVectors

# def parsePoint(line):
#     return LabeledPoint(line.label, MLLibVectors.fromML(line.features))
# for i in range(len(genre_map)):
#     print('classifying Genre '+str(i+1)+ " : "+genre_map.get(i))
#     parsedData = traindataframe.selectExpr("label"+str(i)+" as label", "features").rdd.map(parsePoint)
#     model = LogisticRegressionWithLBFGS.train(parsedData)
#     model.save(spark,'./Models/Part4/model'+str(i))
#     labelsAndPreds = testdataframe.rdd.map(lambda p: (p.movie_id, model.predict(MLLibVectors.fromML(p.features))))
#     label_map = labelsAndPreds.collectAsMap()
#     def addLabel(m_id, row): 
#         row= row + str(label_map.get(m_id))+" "
#         return row
#     udfFunc = F.udf(addLabel, StringType())
#     testdataframe = testdataframe.withColumn("predictions", udfFunc("movie_id","predictions"))
    

In [13]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vector as MLLibVector, Vectors as MLLibVectors

def parsePoint(line):
    return LabeledPoint(line.label, MLLibVectors.fromML(line.features))
for i in range(len(genre_map)):
    print('classifying Genre '+str(i+1)+ " : "+genre_map.get(i))
    model = LogisticRegressionModel.load(spark,'./Models/Part4/model'+str(i))
    labelsAndPreds = testdataframe.rdd.map(lambda p: (p.movie_id, model.predict(MLLibVectors.fromML(p.features))))
    label_map = labelsAndPreds.collectAsMap()
    def addLabel(m_id, row): 
        row= row + str(label_map.get(m_id))+" "
        return row
    udfFunc = F.udf(addLabel, StringType())
    testdataframe = testdataframe.withColumn("predictions", udfFunc("movie_id","predictions"))
    

classifying Genre 1 : Drama
classifying Genre 2 : Comedy
classifying Genre 3 : Romance Film
classifying Genre 4 : Thriller
classifying Genre 5 : Action
classifying Genre 6 : World cinema
classifying Genre 7 : Crime Fiction
classifying Genre 8 : Horror
classifying Genre 9 : Black-and-white
classifying Genre 10 : Indie
classifying Genre 11 : Action/Adventure
classifying Genre 12 : Adventure
classifying Genre 13 : Family Film
classifying Genre 14 : Short Film
classifying Genre 15 : Romantic drama
classifying Genre 16 : Animation
classifying Genre 17 : Musical
classifying Genre 18 : Science Fiction
classifying Genre 19 : Mystery
classifying Genre 20 : Romantic comedy


In [14]:
testdataframe.select('movie_id','predictions').show(20, False)

+--------+----------------------------------------+
|movie_id|predictions                             |
+--------+----------------------------------------+
|1335380 |1 1 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 |
|29062594|1 1 0 0 0 0 1 0 1 1 0 0 0 1 0 1 1 1 1 0 |
|9252321 |1 1 0 0 0 1 0 0 1 0 0 0 1 1 0 1 0 0 1 0 |
|13455076|1 1 0 0 0 1 0 0 1 1 0 0 0 1 0 1 1 1 1 1 |
|24165951|1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 |
|1925869 |1 1 0 0 0 1 0 0 1 1 0 0 0 1 0 1 0 0 0 0 |
|10799612|1 1 0 0 0 1 0 0 1 1 0 0 0 1 0 1 1 0 0 0 |
|28238240|1 1 0 0 0 1 0 0 1 1 0 0 0 1 0 1 0 0 0 0 |
|17124781|1 1 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 |
|28207941|1 0 1 1 1 0 1 0 0 1 1 1 0 1 1 1 1 1 1 0 |
|19174305|1 1 0 0 0 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 |
|18392317|1 1 0 0 0 1 0 0 0 1 0 0 0 1 0 1 0 1 1 1 |
|34420857|1 1 0 0 0 1 0 0 1 1 0 0 1 1 0 1 1 0 0 0 |
|4039635 |1 1 0 0 0 1 0 0 1 1 0 0 0 1 0 1 0 1 0 0 |
|8034072 |1 1 0 0 0 1 0 0 1 1 0 0 1 1 0 1 0 0 0 0 |
|4016437 |1 1 0 0 0 1 0 0 1 0 0 0 1 1 0 1 0 1 0 0 |
|1520023 |1 

In [15]:
testdataframe.select('movie_id','predictions').write.format("csv").option("header", "true").mode("append").save("outputs/output4")