In [1]:
import findspark
findspark.init('/home/cse587/spark-2.4.0-bin-hadoop2.7')
import pyspark
from pyspark.sql import *
from pyspark.ml.feature import StopWordsRemover,RegexTokenizer, StringIndexer, CountVectorizer, HashingTF, IDF
import pyspark.sql.functions as f
from pyspark.sql.types import *

In [2]:
# Create the notebook's SparkSession, or reuse one that is already running.
# NOTE(review): "spark.some.config.option" looks like a placeholder setting
# rather than a real Spark option — confirm whether it is needed.
spark = (
    SparkSession.builder
    .appName("PA3")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [3]:
import pandas as pd
# Each CSV is parsed with pandas first and the resulting pandas frame is then
# converted into a Spark DataFrame.
df = pd.read_csv('train.csv')
traindataframe = spark.createDataFrame(df)

# `df` is rebound here, so after this cell it refers to the test data only.
df = pd.read_csv('test.csv')
testdataframe = spark.createDataFrame(df)

In [4]:
# Stateless text stages, applied in order to both splits:
#   plot -> words -> filteredwords -> rawFeatures
regexTokenizer = RegexTokenizer(inputCol="plot", outputCol="words", pattern="\\W")
swremover = StopWordsRemover(inputCol="words", outputCol="filteredwords")
hashingTF = HashingTF(inputCol="filteredwords", outputCol="rawFeatures")

for stage in (regexTokenizer, swremover, hashingTF):
    traindataframe = stage.transform(traindataframe)
    testdataframe = stage.transform(testdataframe)

# The IDF model is fitted on the training split only; the same fitted model is
# then used to rescale both the training and the test term frequencies.
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(traindataframe)
traindataframe = idfModel.transform(traindataframe)
testdataframe = idfModel.transform(testdataframe)

In [5]:
# Preview the id, TF-IDF vector and raw genre string of the first 20 training rows.
traindataframe.select(['movie_id', 'features', 'genre']).show(20)

+--------+--------------------+--------------------+
|movie_id|            features|               genre|
+--------+--------------------+--------------------+
|23890098|(262144,[2437,127...|['World cinema', ...|
|31186339|(262144,[991,1739...|['Action/Adventur...|
|20663735|(262144,[119,571,...|['Musical', 'Acti...|
| 2231378|(262144,[619,1998...|          ['Comedy']|
|  595909|(262144,[1911,243...|['Crime Fiction',...|
| 5272176|(262144,[571,4977...|['Action/Adventur...|
| 1952976|(262144,[343,3294...|['Thriller', 'Dra...|
|24225279|(262144,[2437,251...|           ['Drama']|
| 2462689|(262144,[1156,433...|['Black-and-white...|
|20532852|(262144,[929,3924...|['Animation', 'Sh...|
|15401493|(262144,[1493,409...|          ['Comedy']|
|18188932|(262144,[4211,118...|['Crime Fiction',...|
| 2940516|(262144,[3617,138...|          ['Comedy']|
| 1480747|(262144,[14,382,1...|          ['Comedy']|
|24448645|(262144,[5463,859...|          ['Horror']|
|15072401|(262144,[6183,636...|['Crime Fiction

In [6]:
# Preview the id and TF-IDF vector of the first 20 test rows.
testdataframe.select(['movie_id', 'features']).show(20)

+--------+--------------------+
|movie_id|            features|
+--------+--------------------+
| 1335380|(262144,[1728,261...|
|29062594|(262144,[6068,191...|
| 9252321|(262144,[1598,208...|
|13455076|(262144,[3294,618...|
|24165951|(262144,[4098,644...|
| 1925869|(262144,[535,3294...|
|10799612|(262144,[5053,538...|
|28238240|(262144,[23060,30...|
|17124781|(262144,[5232,733...|
|28207941|(262144,[9726,626...|
|19174305|(262144,[2710,392...|
|18392317|(262144,[5213,606...|
|34420857|(262144,[11275,13...|
| 4039635|(262144,[571,1640...|
| 8034072|(262144,[991,4200...|
| 4016437|(262144,[5595,783...|
| 1520023|(262144,[14,535,5...|
|24589422|(262144,[1998,249...|
|35068740|(262144,[2710,484...|
|21132951|(262144,[1841,392...|
+--------+--------------------+
only showing top 20 rows



In [7]:
# Read the mapping CSV (with header and inferred column types) and turn its
# "_c0" / "0" columns into a plain Python dict on the driver.
mapping = (
    spark.read.format("csv")
    .options(sep=",", inferschema="true", header="true")
    .load("./mapping.csv")
)
genre_map = {row["_c0"]: row["0"] for row in mapping.select("_c0", "0").collect()}
genre_map

{0: 'Drama',
 1: 'Comedy',
 2: 'Romance Film',
 3: 'Thriller',
 4: 'Action',
 5: 'World cinema',
 6: 'Crime Fiction',
 7: 'Horror',
 8: 'Black-and-white',
 9: 'Indie',
 10: 'Action/Adventure',
 11: 'Adventure',
 12: 'Family Film',
 13: 'Short Film',
 14: 'Romantic drama',
 15: 'Animation',
 16: 'Musical',
 17: 'Science Fiction',
 18: 'Mystery',
 19: 'Romantic comedy'}

In [8]:
import pyspark.sql.functions as F

from pyspark.sql.types import *
def generateLabels(row, i):
    """Return 1 if the genre stored under key ``i`` in ``genre_map`` occurs in ``row``, else 0.

    ``row`` is the text form of a list of quoted names, e.g. "['Drama', 'Comedy']".
    Its first and last characters (the brackets) are dropped, the rest is split
    on commas, and every piece is stripped of surrounding whitespace and of its
    first and last character (the quotes) before being compared.
    """
    wanted = genre_map.get(i)
    names = (piece.strip()[1:-1] for piece in row[1:-1].split(","))
    return 1 if any(name == wanted for name in names) else 0
# Expose generateLabels to Spark as a UDF that returns an integer flag.
udfFunc = F.udf(generateLabels, IntegerType())

# Add one column per entry of genre_map: label0, label1, ...
# withColumn replaces a column of the same name, so re-running this loop
# overwrites the label columns instead of duplicating them.
for i in range(len(genre_map)):
    label_column = udfFunc("genre", F.lit(i))
    traindataframe = traindataframe.withColumn("label" + str(i), label_column)

traindataframe.printSchema()

root
 |-- movie_id: long (nullable = true)
 |-- movie_name: string (nullable = true)
 |-- plot: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filteredwords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rawFeatures: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- label0: integer (nullable = true)
 |-- label1: integer (nullable = true)
 |-- label2: integer (nullable = true)
 |-- label3: integer (nullable = true)
 |-- label4: integer (nullable = true)
 |-- label5: integer (nullable = true)
 |-- label6: integer (nullable = true)
 |-- label7: integer (nullable = true)
 |-- label8: integer (nullable = true)
 |-- label9: integer (nullable = true)
 |-- label10: integer (nullable = true)
 |-- label11: integer (nullable = true)
 |-- label12: integer (nullable = true)
 |-- label13: integer (nullable = true)
 |-- label14: integer (nullable =

In [9]:
def generateLabelCol():
    """Return the initial value of the ``predictions`` column: an empty string."""
    return ""


# A constant column does not need a Python UDF. F.lit puts the value straight
# into the query plan, so no rows are sent to Python workers just to produce "".
# Side effect on the schema: a literal column is reported as non-nullable.
testdataframe = testdataframe.withColumn("predictions", F.lit(generateLabelCol()))
testdataframe.printSchema()

root
 |-- movie_id: long (nullable = true)
 |-- movie_name: string (nullable = true)
 |-- plot: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filteredwords: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- rawFeatures: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- predictions: string (nullable = true)



In [10]:
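# Training loop, kept commented out: it fits one logistic-regression model per
# genre and saves it under ./Models/Part2/, the directory the models are loaded
# from further down in this notebook.
# from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel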
# from pyspark.mllib.regression import LabeledPoint
# from pyspark.mllib.linalg import Vector as MLLibVector, Vectors as MLLibVectors

# def parsePoint(line):
#     return LabeledPoint(line.label, MLLibVectors.fromML(line.features))
# for i in range(len(genre_map)):
#     print('classifying Genre '+str(i+1)+ " : "+genre_map.get(i))
#     parsedData = traindataframe.selectExpr("label"+str(i)+" as label", "features").rdd.map(parsePoint)
#     model = LogisticRegressionWithLBFGS.train(parsedData)
#     model.save(spark,'./Models/Part2/model'+str(i))
#     labelsAndPreds = testdataframe.rdd.map(lambda p: (p.movie_id, model.predict(MLLibVectors.fromML(p.features))))
#     label_map = labelsAndPreds.collectAsMap()
#     def addLabel(m_id, row): 
#         row= row + str(label_map.get(m_id))+" "
#         return row
#     udfFunc = F.udf(addLabel, StringType())
#     testdataframe = testdataframe.withColumn("predictions", udfFunc("movie_id","predictions"))
    

In [11]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vector as MLLibVector, Vectors as MLLibVectors

def parsePoint(line):
    """Convert a row with ``label`` and ``features`` fields into an mllib LabeledPoint."""
    return LabeledPoint(line.label, MLLibVectors.fromML(line.features))


# Convert the test features to mllib vectors once and cache them. Only the two
# needed columns are selected, so the "predictions" column is not part of this
# RDD and every model scores the same cached data instead of re-evaluating the
# whole DataFrame on each iteration.
test_points = (
    testdataframe.select("movie_id", "features")
    .rdd.map(lambda r: (r.movie_id, MLLibVectors.fromML(r.features)))
    .cache()
)

# movie_id -> list of per-genre predictions (as strings), in genre-index order.
genre_predictions = {}
for i in range(len(genre_map)):
    print('classifying Genre '+str(i+1)+ " : "+genre_map.get(i))
    # The mllib loader is documented to take the SparkContext, not the SparkSession.
    model = LogisticRegressionModel.load(spark.sparkContext, './Models/Part2/model'+str(i))
    # Bind the current model as a default argument so the closure is tied to
    # this iteration's model.
    label_map = test_points.mapValues(lambda v, model=model: model.predict(v)).collectAsMap()
    for m_id, pred in label_map.items():
        genre_predictions.setdefault(m_id, []).append(str(pred))
test_points.unpersist()

# Same text format as before: each prediction followed by a single space.
prediction_strings = {
    m_id: "".join(p + " " for p in preds)
    for m_id, preds in genre_predictions.items()
}


def addLabel(m_id, row):
    """Append the movie's space-separated genre predictions to the existing text ``row``."""
    return row + prediction_strings.get(m_id, "")


# One UDF adds all predictions at once instead of stacking one UDF per genre.
udfFunc = F.udf(addLabel, StringType())
testdataframe = testdataframe.withColumn("predictions", udfFunc("movie_id", "predictions"))
    

classifying Genre 1 : Drama
classifying Genre 2 : Comedy
classifying Genre 3 : Romance Film
classifying Genre 4 : Thriller
classifying Genre 5 : Action
classifying Genre 6 : World cinema
classifying Genre 7 : Crime Fiction
classifying Genre 8 : Horror
classifying Genre 9 : Black-and-white
classifying Genre 10 : Indie
classifying Genre 11 : Action/Adventure
classifying Genre 12 : Adventure
classifying Genre 13 : Family Film
classifying Genre 14 : Short Film
classifying Genre 15 : Romantic drama
classifying Genre 16 : Animation
classifying Genre 17 : Musical
classifying Genre 18 : Science Fiction
classifying Genre 19 : Mystery
classifying Genre 20 : Romantic comedy


In [12]:
# Show the first 20 prediction strings in full, without truncating the column.
testdataframe.select(['movie_id', 'predictions']).show(n=20, truncate=False)

+--------+----------------------------------------+
|movie_id|predictions                             |
+--------+----------------------------------------+
|1335380 |1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 |
|29062594|0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 |
|9252321 |1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 |
|13455076|0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 |
|24165951|0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 |
|1925869 |1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 |
|10799612|1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 |
|28238240|0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 |
|17124781|0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 |
|28207941|0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 |
|19174305|0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 |
|18392317|0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 |
|34420857|0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 |
|4039635 |0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 |
|8034072 |1 0 1 0 0 1 1 0 0 0 1 0 0 0 0 0 1 0 0 0 |
|4016437 |1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 |
|1520023 |1 

In [13]:
# Write the ids and prediction strings as CSV with a header row.
# NOTE(review): mode "append" adds new files to outputs/output2 on every run,
# so re-running this cell accumulates repeated rows — confirm this is intended.
(
    testdataframe.select('movie_id', 'predictions')
    .write.format("csv")
    .option("header", "true")
    .mode("append")
    .save("outputs/output2")
)