In [1]:
import findspark

In [2]:
# Point findspark at the local Spark distribution so that `pyspark` becomes importable.
# The path is relative, so the notebook must be started from the directory containing it.
findspark.init('spark-2.4.0-bin-hadoop2.7')

In [3]:
import pyspark

# This is the first SparkContext of the notebook, so it is what launches the
# driver JVM. Driver memory can only be chosen at JVM start-up; setting it later
# on the SparkSession builder is silently ignored because the builder reuses this
# already-running context. Pass it here so the 15g setting actually applies.
sc = pyspark.SparkContext(conf = pyspark.SparkConf().set("spark.driver.memory", "15g"))

In [4]:
from pyspark.sql import *

# Build the SparkSession used by every later cell; getOrCreate() returns the
# existing session/context if one is already running.
spark = (
    SparkSession.builder
    .appName('Assignment 3')
    .config('spark.some.config.option', 'some-value')
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

# Loading datasets

In [5]:
import pandas as pd

# Read the raw train/test CSVs with pandas; they are converted to Spark
# DataFrames in the next cell.
df1, df2 = pd.read_csv('./train.csv'), pd.read_csv('./test.csv')

In [6]:
# Promote both pandas frames to Spark DataFrames (train first, then test).
train, test = spark.createDataFrame(df1), spark.createDataFrame(df2)

# Removing all non-alphabetic characters and tokenizing the plots

In [7]:
from pyspark.ml.feature import RegexTokenizer

# Tokenize the `plot` text into `clean_plot`, using every non-letter character
# as a delimiter so that only alphabetic tokens remain.
regexTokenizer = RegexTokenizer(
    inputCol = 'plot',
    outputCol = 'clean_plot',
    pattern = "[^A-Za-z]",
)
# The same tokenizer is applied to both splits so they share one token definition.
regexTokenized_train, regexTokenized_test = (
    regexTokenizer.transform(train),
    regexTokenizer.transform(test),
)


# Removing stop-words

In [8]:
from pyspark.ml.feature import StopWordsRemover

# Drop stop words from the token arrays: `clean_plot` -> `filtered_plot`.
remover = StopWordsRemover(
    inputCol = 'clean_plot',
    outputCol = 'filtered_plot',
)
filtered_train, filtered_test = (
    remover.transform(regexTokenized_train),
    remover.transform(regexTokenized_test),
)

# Creating features: tf using CountVectorizer, tf-idf using HashingTF and IDF, and Word2Vec embeddings (for part 3)

In [9]:
from pyspark.ml.feature import CountVectorizer, Word2Vec, IDF, HashingTF, CountVectorizerModel, IDFModel, Word2VecModel

loadModel = True # set this to False to train new models instead of loading the saved ones

# One path for both branches: a retrained model is saved exactly where the
# load branch will look for it on the next run.
tf_model_path = 'models/tf_model'

if loadModel:
    tfModel = CountVectorizerModel.load(tf_model_path)
else:
    cv = CountVectorizer(inputCol = 'filtered_plot', outputCol = 'features_tf', minDF = 5.0, vocabSize = 10000)
    tfModel = cv.fit(filtered_train)
    # A plain save() raises if the directory already exists; overwrite so that
    # retraining can be repeated.
    tfModel.write().overwrite().save(tf_model_path)

# Term-frequency vectors are written to the `features_tf` column.
vectorized_train = tfModel.transform(filtered_train)
vectorized_test = tfModel.transform(filtered_test)

print('done tf')

# Hashed term frequencies (`features_hash`) are the input of the IDF stage.
hashingTF = HashingTF(inputCol = "filtered_plot", outputCol = "features_hash")
vectorized_train = hashingTF.transform(vectorized_train)
vectorized_test = hashingTF.transform(vectorized_test)

# One path for both branches so a retrained model is the one loaded next time.
idf_model_path = 'models/idf_model'

if loadModel:
    idfModel = IDFModel.load(idf_model_path)
else:
    idf = IDF(minDocFreq = 5, inputCol = "features_hash", outputCol = "features_tf-idf")
    idfModel = idf.fit(vectorized_train)
    # A plain save() raises if the directory already exists; overwrite so that
    # retraining can be repeated.
    idfModel.write().overwrite().save(idf_model_path)

# Adds the `features_tf-idf` column to both splits.
vectorized_train = idfModel.transform(vectorized_train)
vectorized_test = idfModel.transform(vectorized_test)

print('done tf-idf')

# One path for both branches so a retrained model is the one loaded next time.
word2vec_model_path = 'models/word2vec_model'

if loadModel:
    word2vecModel = Word2VecModel.load(word2vec_model_path)
else:
    word2Vec = Word2Vec(inputCol = "filtered_plot", outputCol = "features_word2vec", minCount = 20, vectorSize = 350)
    word2vecModel = word2Vec.fit(vectorized_train)
    # A plain save() raises if the directory already exists; overwrite so that
    # retraining can be repeated.
    word2vecModel.write().overwrite().save(word2vec_model_path)

# Adds the `features_word2vec` column to both splits.
vectorized_train = word2vecModel.transform(vectorized_train)
vectorized_test = word2vecModel.transform(vectorized_test)
print('done word2vec')


done tf
done tf-idf
done word2vec


In [10]:
# Inspect the columns produced by the tokenizing and feature-extraction stages.
vectorized_train.printSchema()

root
 |-- movie_id: long (nullable = true)
 |-- movie_name: string (nullable = true)
 |-- plot: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- clean_plot: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered_plot: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features_tf: vector (nullable = true)
 |-- features_hash: vector (nullable = true)
 |-- features_tf-idf: vector (nullable = true)
 |-- features_word2vec: vector (nullable = true)



# Converting the genre column from string to array of strings

In [11]:
from pyspark.sql.functions import col, regexp_replace, split

# `genre` holds a stringified list such as "['World cinema', 'Drama']".
# Remove the leading "[", the trailing "]" and every single quote, then split
# on ", " to obtain an array of genre names in the new `genre1` column.
genre_without_brackets = regexp_replace(col("genre"), r"(^\[)|(\]$)|(')", "")
vectorized_train = vectorized_train.withColumn("genre1", split(genre_without_brackets, ", "))

In [12]:
# Peek at the first five parsed genre arrays; truncate = False keeps long lists fully visible.
vectorized_train.select('genre1').show(5, truncate = False)

+--------------------------------------------------+
|genre1                                            |
+--------------------------------------------------+
|[World cinema, Drama]                             |
|[Action/Adventure, Action, Science Fiction, Drama]|
|[Musical, Action, Drama]                          |
|[Comedy]                                          |
|[Crime Fiction, World cinema, Drama]              |
+--------------------------------------------------+
only showing top 5 rows



# 20 UDFs, one per genre, each used to create that genre's 0/1 indicator column

In [13]:
from pyspark.sql.functions import udf

def _make_genre_flag(genre_name):
    """Build a Spark UDF that flags whether ``genre_name`` is in a row's genre array.

    Args:
        genre_name: Exact genre string to look for (case-sensitive element match).

    Returns:
        A UDF with return type ``double`` that takes the genre array column and
        yields 1.0 when ``genre_name`` is one of its elements, otherwise 0.0.
        A null genre array is treated as "no genres" and yields 0.0 instead of
        raising ``TypeError`` inside the executor.
    """
    def has_genre(genres):
        return 1.0 if genres is not None and genre_name in genres else 0.0

    return udf(has_genre, 'double')


# One indicator UDF per target genre. The Python names are unchanged, so they
# can still be called as e.g. Drama("genre1").
Drama = _make_genre_flag('Drama')
Comedy = _make_genre_flag('Comedy')
RomanceFilm = _make_genre_flag('Romance Film')
Thriller = _make_genre_flag('Thriller')
Action = _make_genre_flag('Action')
WorldCinema = _make_genre_flag('World cinema')
CrimeFiction = _make_genre_flag('Crime Fiction')
Horror = _make_genre_flag('Horror')
BlackWhite = _make_genre_flag('Black-and-white')
Indie = _make_genre_flag('Indie')
ActionAdventure = _make_genre_flag('Action/Adventure')
Adventure = _make_genre_flag('Adventure')
FamilyFilm = _make_genre_flag('Family Film')
ShortFilm = _make_genre_flag('Short Film')
RomanticDrama = _make_genre_flag('Romantic drama')
Animation = _make_genre_flag('Animation')
Musical = _make_genre_flag('Musical')
ScienceFiction = _make_genre_flag('Science Fiction')
Mystery = _make_genre_flag('Mystery')
RomanticComedy = _make_genre_flag('Romantic comedy')


# Creating 20 new columns for 20 genres

In [14]:
# (label column name, indicator UDF) pairs, in the order the columns are appended.
genre_flag_udfs = [
    ("Drama", Drama),
    ("Comedy", Comedy),
    ("RomanceFilm", RomanceFilm),
    ("Thriller", Thriller),
    ("Action", Action),
    ("WorldCinema", WorldCinema),
    ("CrimeFiction", CrimeFiction),
    ("Horror", Horror),
    ("BlackWhite", BlackWhite),
    ("Indie", Indie),
    ("ActionAdventure", ActionAdventure),
    ("Adventure", Adventure),
    ("FamilyFilm", FamilyFilm),
    ("ShortFilm", ShortFilm),
    ("RomanticDrama", RomanticDrama),
    ("Animation", Animation),
    ("Musical", Musical),
    ("ScienceFiction", ScienceFiction),
    ("Mystery", Mystery),
    ("RomanticComedy", RomanticComedy),
]

# `a` is the training frame extended with one 0.0/1.0 label column per genre,
# each computed from the parsed `genre1` array.
a = vectorized_train
for flag_name, flag_udf in genre_flag_udfs:
    a = a.withColumn(flag_name, flag_udf("genre1"))

In [15]:
# Confirm that the per-genre label columns were appended to the training frame.
a.printSchema()

root
 |-- movie_id: long (nullable = true)
 |-- movie_name: string (nullable = true)
 |-- plot: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- clean_plot: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered_plot: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features_tf: vector (nullable = true)
 |-- features_hash: vector (nullable = true)
 |-- features_tf-idf: vector (nullable = true)
 |-- features_word2vec: vector (nullable = true)
 |-- genre1: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- Drama: double (nullable = true)
 |-- Comedy: double (nullable = true)
 |-- RomanceFilm: double (nullable = true)
 |-- Thriller: double (nullable = true)
 |-- Action: double (nullable = true)
 |-- WorldCinema: double (nullable = true)
 |-- CrimeFiction: double (nullable = true)
 |-- Horror: double (nullable = true)
 |-- BlackWhite: double (nullable = true)
 |-- Indie: double (nullable 

# Part 1: Logistic regression using tf as features

In [16]:
from pyspark.ml.classification import LogisticRegression, LinearSVC, LogisticRegressionModel, LinearSVCModel

# Index -> human-readable genre name, as it appears in the data and in log messages.
genre_dict = dict(enumerate([
    'Drama',
    'Comedy',
    'Romance Film',
    'Thriller',
    'Action',
    'World cinema',
    'Crime Fiction',
    'Horror',
    'Black-and-white',
    'Indie',
    'Action/Adventure',
    'Adventure',
    'Family Film',
    'Short Film',
    'Romantic drama',
    'Animation',
    'Musical',
    'Science Fiction',
    'Mystery',
    'Romantic comedy',
]))

# Index -> label column name (same order as genre_dict); also used as the
# suffix of the saved model names and of the `pred_<label>` columns.
label_dict = dict(enumerate([
    'Drama',
    'Comedy',
    'RomanceFilm',
    'Thriller',
    'Action',
    'WorldCinema',
    'CrimeFiction',
    'Horror',
    'BlackWhite',
    'Indie',
    'ActionAdventure',
    'Adventure',
    'FamilyFilm',
    'ShortFilm',
    'RomanticDrama',
    'Animation',
    'Musical',
    'ScienceFiction',
    'Mystery',
    'RomanticComedy',
]))

# Helper columns added by every model's transform(); they are dropped before
# the next model runs so that its own copies of them do not collide.
columns_to_drop = ['rawPrediction', 'probability']
testData_tf = vectorized_test

# One binary logistic regression per genre on the tf features; each model
# appends its own `pred_<label>` column to the test frame.
for i in range(20):
    # One path for both branches so a retrained model is the one loaded next time.
    model_path = 'models/lr_tf/lrModel_tf_' + label_dict[i]
    if loadModel:
        lrModel_tf = LogisticRegressionModel.load(model_path)
    else:
        lr_tf = LogisticRegression(featuresCol = 'features_tf', labelCol = label_dict[i], predictionCol = 'pred_' + label_dict[i], maxIter = 20, regParam = 0.3)
        lrModel_tf = lr_tf.fit(a)
        # A plain save() raises if the directory already exists.
        lrModel_tf.write().overwrite().save(model_path)

    testData_tf = testData_tf.drop(*columns_to_drop)
    testData_tf = lrModel_tf.transform(testData_tf)

    print('Prediction done for %s genre' %(genre_dict[i]))

Prediction done for Drama genre
Prediction done for Comedy genre
Prediction done for Romance Film genre
Prediction done for Thriller genre
Prediction done for Action genre
Prediction done for World cinema genre
Prediction done for Crime Fiction genre
Prediction done for Horror genre
Prediction done for Black-and-white genre
Prediction done for Indie genre
Prediction done for Action/Adventure genre
Prediction done for Adventure genre
Prediction done for Family Film genre
Prediction done for Short Film genre
Prediction done for Romantic drama genre
Prediction done for Animation genre
Prediction done for Musical genre
Prediction done for Science Fiction genre
Prediction done for Mystery genre
Prediction done for Romantic comedy genre


# UDF for combining 20 predicted labels to 1 column

In [17]:
@udf('string')
def combineLables(*labels):
    """Join per-genre predictions into one space-separated string.

    Each incoming value is truncated to an int (1.0 -> "1", 0.0 -> "0") and the
    results are joined with single spaces, in argument order.
    """
    return ' '.join(str(int(label)) for label in labels)

In [18]:
# Per-genre prediction columns, in the order they appear in the output string.
pred_columns = [
    'pred_Drama', 'pred_Comedy', 'pred_RomanceFilm', 'pred_Thriller',
    'pred_Action', 'pred_WorldCinema', 'pred_CrimeFiction', 'pred_Horror',
    'pred_BlackWhite', 'pred_Indie', 'pred_ActionAdventure', 'pred_Adventure',
    'pred_FamilyFilm', 'pred_ShortFilm', 'pred_RomanticDrama', 'pred_Animation',
    'pred_Musical', 'pred_ScienceFiction', 'pred_Mystery', 'pred_RomanticComedy',
]
# Collapse the 20 columns into a single space-separated `predictions` string.
testData_tf = testData_tf.withColumn("predictions", combineLables(*pred_columns))
testData_tf.select('predictions').head(10)

[Row(predictions='1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0'),
 Row(predictions='0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0')]

In [19]:
# Check the test frame after prediction: feature columns plus one pred_<label> column per genre.
testData_tf.printSchema()

root
 |-- movie_id: long (nullable = true)
 |-- movie_name: string (nullable = true)
 |-- plot: string (nullable = true)
 |-- clean_plot: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered_plot: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features_tf: vector (nullable = true)
 |-- features_hash: vector (nullable = true)
 |-- features_tf-idf: vector (nullable = true)
 |-- features_word2vec: vector (nullable = true)
 |-- pred_Drama: double (nullable = false)
 |-- pred_Comedy: double (nullable = false)
 |-- pred_RomanceFilm: double (nullable = false)
 |-- pred_Thriller: double (nullable = false)
 |-- pred_Action: double (nullable = false)
 |-- pred_WorldCinema: double (nullable = false)
 |-- pred_CrimeFiction: double (nullable = false)
 |-- pred_Horror: double (nullable = false)
 |-- pred_BlackWhite: double (nullable = false)
 |-- pred_Indie: double (nullable = false)
 |-- pred_ActionAdventure: double (nullable = false)
 

In [20]:
# Keep only the id and the combined prediction string for the submission file.
out_tf = testData_tf.select('movie_id', 'predictions')
# The default save mode raises if 'out_tf' already exists, which breaks every
# re-run of the notebook; overwrite the previous output instead.
out_tf.write.csv('out_tf', header = True, mode = 'overwrite')

# Part 2: Logistic regression using tf-idf as features

In [21]:
testData_idf = vectorized_test
# One binary logistic regression per genre on the tf-idf features; each model
# appends its own `pred_<label>` column to the test frame.
for i in range(20):
    # One path for both branches so a retrained model is the one loaded next time.
    model_path = 'models/lr_idf/lrModel_idf_' + label_dict[i]
    if loadModel:
        lrModel_idf = LogisticRegressionModel.load(model_path)
    else:
        lr_idf = LogisticRegression(featuresCol = 'features_tf-idf', labelCol = label_dict[i], predictionCol = 'pred_' + label_dict[i], maxIter = 20, regParam = 0.3)
        lrModel_idf = lr_idf.fit(a)
        # A plain save() raises if the directory already exists.
        lrModel_idf.write().overwrite().save(model_path)

    # Remove the previous model's helper columns before the next transform().
    testData_idf = testData_idf.drop(*columns_to_drop)
    testData_idf = lrModel_idf.transform(testData_idf)

    print('Prediction done for %s genre' %(genre_dict[i]))

Prediction done for Drama genre
Prediction done for Comedy genre
Prediction done for Romance Film genre
Prediction done for Thriller genre
Prediction done for Action genre
Prediction done for World cinema genre
Prediction done for Crime Fiction genre
Prediction done for Horror genre
Prediction done for Black-and-white genre
Prediction done for Indie genre
Prediction done for Action/Adventure genre
Prediction done for Adventure genre
Prediction done for Family Film genre
Prediction done for Short Film genre
Prediction done for Romantic drama genre
Prediction done for Animation genre
Prediction done for Musical genre
Prediction done for Science Fiction genre
Prediction done for Mystery genre
Prediction done for Romantic comedy genre


In [22]:
# Per-genre prediction columns, in the order they appear in the output string.
pred_columns = [
    'pred_Drama', 'pred_Comedy', 'pred_RomanceFilm', 'pred_Thriller',
    'pred_Action', 'pred_WorldCinema', 'pred_CrimeFiction', 'pred_Horror',
    'pred_BlackWhite', 'pred_Indie', 'pred_ActionAdventure', 'pred_Adventure',
    'pred_FamilyFilm', 'pred_ShortFilm', 'pred_RomanticDrama', 'pred_Animation',
    'pred_Musical', 'pred_ScienceFiction', 'pred_Mystery', 'pred_RomanticComedy',
]
# Collapse the 20 columns into a single space-separated `predictions` string.
testData_idf = testData_idf.withColumn("predictions", combineLables(*pred_columns))
testData_idf.select('predictions').head(10)

[Row(predictions='1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0'),
 Row(predictions='0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0')]

In [23]:
# Keep only the id and the combined prediction string for the submission file.
out_idf = testData_idf.select('movie_id', 'predictions')
# The default save mode raises if 'out_idf' already exists, which breaks every
# re-run of the notebook; overwrite the previous output instead.
out_idf.write.csv('out_idf', header = True, mode = 'overwrite')

# Part 3: Logistic regression using word2vec as features

In [24]:
testData_word2vec = vectorized_test
# One binary logistic regression per genre on the Word2Vec features; each model
# appends its own `pred_<label>` column to the test frame.
for i in range(20):
    # One path for both branches so a retrained model is the one loaded next time.
    model_path = 'models/lr_word2vec/lrModel_word2vec_' + label_dict[i]
    if loadModel:
        lrModel_word2vec = LogisticRegressionModel.load(model_path)
    else:
        lr_word2vec = LogisticRegression(featuresCol = 'features_word2vec', labelCol = label_dict[i], predictionCol = 'pred_' + label_dict[i], maxIter = 20, regParam = 0.3)
        lrModel_word2vec = lr_word2vec.fit(a)
        # A plain save() raises if the directory already exists.
        lrModel_word2vec.write().overwrite().save(model_path)

    # Remove the previous model's helper columns before the next transform().
    testData_word2vec = testData_word2vec.drop(*columns_to_drop)
    testData_word2vec = lrModel_word2vec.transform(testData_word2vec)

    print('Prediction done for %s genre' %(genre_dict[i]))

Prediction done for Drama genre
Prediction done for Comedy genre
Prediction done for Romance Film genre
Prediction done for Thriller genre
Prediction done for Action genre
Prediction done for World cinema genre
Prediction done for Crime Fiction genre
Prediction done for Horror genre
Prediction done for Black-and-white genre
Prediction done for Indie genre
Prediction done for Action/Adventure genre
Prediction done for Adventure genre
Prediction done for Family Film genre
Prediction done for Short Film genre
Prediction done for Romantic drama genre
Prediction done for Animation genre
Prediction done for Musical genre
Prediction done for Science Fiction genre
Prediction done for Mystery genre
Prediction done for Romantic comedy genre


In [25]:
# Per-genre prediction columns, in the order they appear in the output string.
pred_columns = [
    'pred_Drama', 'pred_Comedy', 'pred_RomanceFilm', 'pred_Thriller',
    'pred_Action', 'pred_WorldCinema', 'pred_CrimeFiction', 'pred_Horror',
    'pred_BlackWhite', 'pred_Indie', 'pred_ActionAdventure', 'pred_Adventure',
    'pred_FamilyFilm', 'pred_ShortFilm', 'pred_RomanticDrama', 'pred_Animation',
    'pred_Musical', 'pred_ScienceFiction', 'pred_Mystery', 'pred_RomanticComedy',
]
# Collapse the 20 columns into a single space-separated `predictions` string.
testData_word2vec = testData_word2vec.withColumn("predictions", combineLables(*pred_columns))
testData_word2vec.select('predictions').head(10)

[Row(predictions='1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0'),
 Row(predictions='0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0')]

In [26]:
# Keep only the id and the combined prediction string for the submission file.
out_word2vec = testData_word2vec.select('movie_id', 'predictions')
# The default save mode raises if 'out_word2vec' already exists, which breaks
# every re-run of the notebook; overwrite the previous output instead.
out_word2vec.write.csv('out_word2vec', header = True, mode = 'overwrite')

# Part 4: SVM using tf-idf as features

In [27]:
testData_svm = vectorized_test
# One linear SVM per genre on the tf-idf features; each model appends its own
# `pred_<label>` column to the test frame.
for i in range(20):
    # One path for both branches so a retrained model is the one loaded next time.
    model_path = 'models/svm_tf-idf/svmModel_' + label_dict[i]
    if loadModel:
        svmModel = LinearSVCModel.load(model_path)
    else:
        svm = LinearSVC(featuresCol = 'features_tf-idf', labelCol = label_dict[i], predictionCol = 'pred_' + label_dict[i], maxIter = 20, regParam = 0.3)
        svmModel = svm.fit(a)
        # A plain save() raises if the directory already exists.
        svmModel.write().overwrite().save(model_path)

    # Remove the previous model's helper columns before the next transform().
    testData_svm = testData_svm.drop(*columns_to_drop)
    testData_svm = svmModel.transform(testData_svm)

    print('Prediction done for %s genre' %(genre_dict[i]))

Prediction done for Drama genre
Prediction done for Comedy genre
Prediction done for Romance Film genre
Prediction done for Thriller genre
Prediction done for Action genre
Prediction done for World cinema genre
Prediction done for Crime Fiction genre
Prediction done for Horror genre
Prediction done for Black-and-white genre
Prediction done for Indie genre
Prediction done for Action/Adventure genre
Prediction done for Adventure genre
Prediction done for Family Film genre
Prediction done for Short Film genre
Prediction done for Romantic drama genre
Prediction done for Animation genre
Prediction done for Musical genre
Prediction done for Science Fiction genre
Prediction done for Mystery genre
Prediction done for Romantic comedy genre


In [28]:
# Per-genre prediction columns, in the order they appear in the output string.
pred_columns = [
    'pred_Drama', 'pred_Comedy', 'pred_RomanceFilm', 'pred_Thriller',
    'pred_Action', 'pred_WorldCinema', 'pred_CrimeFiction', 'pred_Horror',
    'pred_BlackWhite', 'pred_Indie', 'pred_ActionAdventure', 'pred_Adventure',
    'pred_FamilyFilm', 'pred_ShortFilm', 'pred_RomanticDrama', 'pred_Animation',
    'pred_Musical', 'pred_ScienceFiction', 'pred_Mystery', 'pred_RomanticComedy',
]
# Collapse the 20 columns into a single space-separated `predictions` string.
testData_svm = testData_svm.withColumn("predictions", combineLables(*pred_columns))
testData_svm.select('predictions').head(10)

[Row(predictions='1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'),
 Row(predictions='0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0'),
 Row(predictions='0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0')]

In [29]:
# Keep only the id and the combined prediction string for the submission file.
out_svm = testData_svm.select('movie_id', 'predictions')
# The default save mode raises if 'out_svm' already exists, which breaks every
# re-run of the notebook; overwrite the previous output instead.
out_svm.write.csv('out_svm', header = True, mode = 'overwrite')