In [2]:
import findspark
import numpy as np
from nltk.stem.snowball import SnowballStemmer

In [3]:
# Register the local Spark 2.4.0 installation with findspark; this must run
# before the `import pyspark` in the next cell.
findspark.init('/home/cse587/spark-2.4.0-bin-hadoop2.7')

In [4]:
import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
import pyspark.sql.types as tp
from pyspark.sql.functions import *
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, CountVectorizer

# Create (or reuse) the Spark session shared by every cell below.
spark = (
    SparkSession.builder
    .appName("Data preprocessing")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Training set: rows lacking a genre, plot or movie_id are dropped right away.
dataframe = spark.read.csv(
    "/home/cse587/Downloads/diccsvs/train.csv",
    escape="\"",
    inferSchema=True,
    header=True,
).na.drop(subset=["genre", "plot", "movie_id"])
dataframe.printSchema()

# Mapping file, read with the same CSV options as the training set.
df_mapping = spark.read.csv(
    "/home/cse587/Downloads/diccsvs/mapping.csv",
    escape="\"",
    inferSchema=True,
    header=True,
)


root
 |-- movie_id: string (nullable = true)
 |-- movie_name: string (nullable = true)
 |-- plot: string (nullable = true)
 |-- genre: string (nullable = true)



In [5]:
from pyspark.ml.feature import IDF
from pyspark.ml import Pipeline

In [6]:
# Clean text.
# plot : remove every character that is not an ASCII letter or whitespace.
# genre: same, but additionally keep '-', '/' and ',' because they occur in
#        genre names ('black-and-white', 'action/adventure') and separate labels.
# Both columns are lower-cased afterwards.
# Raw strings are used so the regex backslashes reach Spark unchanged; the
# previous "\-" inside a normal string literal was an invalid Python escape.
plot_clean_col = lower(regexp_replace('plot', r"[^a-zA-Z\s]", "")).alias('plot')
genre_clean_col = lower(regexp_replace("genre", r"[^a-zA-Z\-/,\s]", "")).alias("genre")
df_clean = dataframe.select('movie_id', 'movie_name', plot_clean_col, genre_clean_col)

In [7]:
# Genre name (already lower-cased) -> numeric label index.
_GENRE_TO_INDEX = {
    'drama': 0,
    'comedy': 1,
    'romance film': 2,
    'thriller': 3,
    'action': 4,
    'world cinema': 5,
    'crime fiction': 6,
    'horror': 7,
    'black-and-white': 8,
    'indie': 9,
    'action/adventure': 10,
    'adventure': 11,
    'family film': 12,
    'short film': 13,
    'romantic drama': 14,
    'animation': 15,
    'musical': 16,
    'science fiction': 17,
    'mystery': 18,
    'romantic comedy': 19,
}


def replacelabel(x):
    """Convert a comma-separated genre string into a list of label indices.

    Parameters
    ----------
    x : str or None
        Lower-cased genre names separated by commas, e.g.
        ``"world cinema, drama"``. Whitespace around each name is ignored.

    Returns
    -------
    list of int
        The index of every recognised genre, in the order the genres appear
        in ``x``. Unknown genres are skipped. ``None`` or an empty string
        yields an empty list.
    """
    if x is None:
        # A null genre would otherwise raise AttributeError inside the UDF.
        return []
    num_label = []
    for label in x.split(","):
        # strip() makes the lookup tolerant of "a,b", "a,  b" and stray
        # leading/trailing blanks, which previously dropped labels silently.
        index = _GENRE_TO_INDEX.get(label.strip())
        if index is not None:
            num_label.append(index)
    return num_label

In [8]:
# Wrap replacelabel as a Spark UDF returning array<int>, then attach its
# result for the cleaned genre string as the new column `genre_value`.
genre_indices_type = ArrayType(IntegerType())
label_udf = udf(replacelabel, genre_indices_type)
df_clean = df_clean.withColumn('genre_value', label_udf(df_clean['genre']))

In [9]:
# Print the schema to inspect the columns after `genre_value` was added.
df_clean.printSchema()

root
 |-- movie_id: string (nullable = true)
 |-- movie_name: string (nullable = true)
 |-- plot: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- genre_value: array (nullable = true)
 |    |-- element: integer (containsNull = true)



In [10]:
# Tokenize plot text: `plot` (string) -> `plot_token` (array of words).
# The same `tokenizer` object is reused later for the test set.
tokenizer = Tokenizer(inputCol='plot', outputCol='plot_token')
df_tokenized = tokenizer.transform(df_clean)
df_words_token = df_tokenized.select(
    "movie_id",
    "movie_name",
    "plot_token",
    "genre",
    "genre_value",
)

In [11]:
# Remove stop words: `plot_token` -> `plot_clean`.
# The same `remover` object is reused later for the test set.
remover = StopWordsRemover(inputCol='plot_token', outputCol='plot_clean')
df_without_stopwords = remover.transform(df_words_token)
df_words_token_rem_stopwor = df_without_stopwords.select(
    "movie_id",
    "movie_name",
    "plot_clean",
    "genre",
    "genre_value",
)

In [12]:
# Text stemming
stemmer = SnowballStemmer(language='english')


def _stem_tokens(tokens):
    """Return the English Snowball stem of every token, order preserved."""
    return [stemmer.stem(word) for word in tokens]


stem_udf = udf(_stem_tokens, ArrayType(StringType()))

# Stem `plot_clean`, keep only the columns needed for training, and expose
# the stemmed words as `plot` and the numeric genre list as `label`.
df_stemmed = (
    df_words_token_rem_stopwor
    .withColumn("words_stemmed", stem_udf("plot_clean"))
    .select('movie_id', "words_stemmed", "genre", "genre_value")
    .withColumnRenamed("words_stemmed", "plot")
    .withColumnRenamed("genre_value", "label")
)

In [13]:
# Add one integer indicator column per genre (col_0 .. col_19), all
# initialised to 0; they are switched to 1 where applicable further down.
for genre_idx in range(20):
    df_stemmed = df_stemmed.withColumn("col_" + str(genre_idx), lit(0))

In [14]:
# Explicit schema used when the pandas frame is converted back to Spark:
# the four data columns followed by the 20 integer genre indicators.
_data_fields = [
    tp.StructField(name='movie_id', dataType=tp.StringType(), nullable=True),
    tp.StructField(name='plot', dataType=tp.ArrayType(StringType()), nullable=True),
    tp.StructField(name='genre', dataType=tp.StringType(), nullable=True),
    tp.StructField(name='label', dataType=tp.ArrayType(IntegerType()), nullable=True),
]
_indicator_fields = [
    tp.StructField(name='col_' + str(genre_idx), dataType=tp.IntegerType(), nullable=True)
    for genre_idx in range(20)
]
my_schema = tp.StructType(_data_fields + _indicator_fields)

In [15]:
# Bring the stemmed training rows to the driver as a pandas DataFrame.
result_df = df_stemmed.select("*").toPandas()

# One-hot encode `label`: col_<k> becomes 1 when k is in the row's label list
# and 0 otherwise. Each column is computed in a single pass instead of writing
# individual cells with `.loc` inside `iterrows()`, which is very slow on
# large frames. A null label list is treated as "no genres".
for genre_idx in range(20):
    result_df["col_%d" % genre_idx] = (
        result_df["label"]
        .apply(
            # genre_idx is bound as a default so each lambda keeps its own value.
            lambda labels, genre_idx=genre_idx: int(
                labels is not None and genre_idx in labels
            )
        )
        # Keep the 32-bit integer type the columns were created with.
        .astype("int32")
    )

In [16]:
# Convert the pandas frame (now holding the filled-in col_* indicators) back
# into a Spark DataFrame, applying the explicit column types from my_schema.
df_stemmed = spark.createDataFrame(result_df, schema = my_schema)

In [17]:
# Show the first row as a quick check of the rebuilt DataFrame.
df_stemmed.head(1)

[Row(movie_id='23890098', plot=['shlykov', 'hardwork', 'taxi', 'driver', 'lyosha', 'saxophonist', 'develop', 'bizarr', 'loveh', 'relationship', 'despit', 'prejudic', 'realiz', 'arent', 'differ'], genre='world cinema, drama', label=[5, 0], col_0=1, col_1=0, col_2=0, col_3=0, col_4=0, col_5=1, col_6=0, col_7=0, col_8=0, col_9=0, col_10=0, col_11=0, col_12=0, col_13=0, col_14=0, col_15=0, col_16=0, col_17=0, col_18=0, col_19=0)]

In [18]:
def _drop_empty_tokens(tokens):
    """Keep only the tokens whose length is greater than 0."""
    return [token for token in tokens if len(token) > 0]


# Keep words whose length is greater than 0. Despite its name, the
# `plot_length` column holds the filtered word list, not a count.
filter_length_udf = udf(_drop_empty_tokens, ArrayType(StringType()))
df_stemmed = df_stemmed.withColumn('plot_length', filter_length_udf(col('plot')))
data = df_stemmed.select("*")
data.head(1)

[Row(movie_id='23890098', plot=['shlykov', 'hardwork', 'taxi', 'driver', 'lyosha', 'saxophonist', 'develop', 'bizarr', 'loveh', 'relationship', 'despit', 'prejudic', 'realiz', 'arent', 'differ'], genre='world cinema, drama', label=[5, 0], col_0=1, col_1=0, col_2=0, col_3=0, col_4=0, col_5=1, col_6=0, col_7=0, col_8=0, col_9=0, col_10=0, col_11=0, col_12=0, col_13=0, col_14=0, col_15=0, col_16=0, col_17=0, col_18=0, col_19=0, plot_length=['shlykov', 'hardwork', 'taxi', 'driver', 'lyosha', 'saxophonist', 'develop', 'bizarr', 'loveh', 'relationship', 'despit', 'prejudic', 'realiz', 'arent', 'differ'])]

In [19]:
# TF-IDF features: hash the filtered tokens into term-frequency vectors, then
# rescale them with IDF. minDocFreq=1200 is the document-frequency threshold
# handed to IDF.
TF = HashingTF(inputCol='plot_length', outputCol="rawFeatures")
idf = IDF(inputCol=TF.getOutputCol(), outputCol="features", minDocFreq=1200)
pipeline = Pipeline(stages=[TF, idf])

# Fit on the training rows; `pipefit` is reused unchanged on the test set.
pipefit = pipeline.fit(data)
trainingData = pipefit.transform(data)

In [20]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, OneVsRest
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [21]:
def _fit_genre_classifier(label_col):
    """Fit one binary logistic regression for a single genre column.

    Trains on `trainingData` using `features` as input and `label_col` as the
    0/1 target, prints the accuracy reported by the model's training summary,
    and returns the pair ``(estimator, fitted_model)``.
    """
    estimator = LogisticRegression(
        maxIter=20,
        elasticNetParam=0.1,
        featuresCol='features',
        labelCol=label_col,
        fitIntercept=True,
    )
    fitted = estimator.fit(trainingData)
    print(fitted.summary.accuracy)
    return estimator, fitted


# One independent classifier per genre indicator column, trained in order.
lr, lrModel = _fit_genre_classifier('col_0')
lr1, lrModel1 = _fit_genre_classifier('col_1')
lr2, lrModel2 = _fit_genre_classifier('col_2')
lr3, lrModel3 = _fit_genre_classifier('col_3')
lr4, lrModel4 = _fit_genre_classifier('col_4')
lr5, lrModel5 = _fit_genre_classifier('col_5')
lr6, lrModel6 = _fit_genre_classifier('col_6')
lr7, lrModel7 = _fit_genre_classifier('col_7')
lr8, lrModel8 = _fit_genre_classifier('col_8')
lr9, lrModel9 = _fit_genre_classifier('col_9')
lr10, lrModel10 = _fit_genre_classifier('col_10')
lr11, lrModel11 = _fit_genre_classifier('col_11')
lr12, lrModel12 = _fit_genre_classifier('col_12')
lr13, lrModel13 = _fit_genre_classifier('col_13')
lr14, lrModel14 = _fit_genre_classifier('col_14')
lr15, lrModel15 = _fit_genre_classifier('col_15')
lr16, lrModel16 = _fit_genre_classifier('col_16')
lr17, lrModel17 = _fit_genre_classifier('col_17')
lr18, lrModel18 = _fit_genre_classifier('col_18')
lr19, lrModel19 = _fit_genre_classifier('col_19')

0.7009547076408756
0.7749525860683404
0.8517792278761773
0.8576296248674017
0.8732842585746889
0.8690411135041306
0.9029219839917708
0.9279951139541611
0.9053971519495966
0.906136487833103
0.9182230222765116
0.9272557780706548
0.9306631521424669
0.9390530071683436
0.9350991674435051
0.950110900382526
0.9422675110096757
0.9544504805683243
0.9511716866501655
0.9508823813044457


In [22]:
# Load the test set and drop rows without a plot or movie_id.
test_df = spark.read.csv(
    "/home/cse587/Downloads/diccsvs/test.csv",
    escape="\"",
    inferSchema=True,
    header=True,
).na.drop(subset=["plot", "movie_id"])
test_df.printSchema()

# Clean text: same plot cleaning as for the training set.
test_plot_col = lower(regexp_replace('plot', "[^a-zA-Z\\s]", "")).alias('plot')
test_df_clean = test_df.select('movie_id', 'movie_name', test_plot_col)

# Tokenize plot text with the tokenizer defined for the training set.
test_df_tokenized = tokenizer.transform(test_df_clean)
test_df_words_token = test_df_tokenized.select("movie_id", "movie_name", "plot_token")

# Remove stop words with the remover defined for the training set.
test_df_without_stopwords = remover.transform(test_df_words_token)
test_df_words_token_rem_stopwor = test_df_without_stopwords.select(
    "movie_id",
    "movie_name",
    "plot_clean",
)

root
 |-- movie_id: integer (nullable = true)
 |-- movie_name: string (nullable = true)
 |-- plot: string (nullable = true)



In [23]:

# Text stemming: reuse the training stem_udf and expose the result as `plot`.
test_df_stemmed = (
    test_df_words_token_rem_stopwor
    .withColumn("words_stemmed", stem_udf("plot_clean"))
    .select('movie_id', "words_stemmed")
    .withColumnRenamed("words_stemmed", "plot")
)

# Drop empty tokens, exactly as was done for the training rows.
test_df_stemmed = test_df_stemmed.withColumn('plot_length', filter_length_udf(col('plot')))
# The selection used to be assigned twice with identical arguments; once is enough.
test_data = test_df_stemmed.select("movie_id", "plot", "plot_length")

# TF-IDF: apply the pipeline fitted on the training data (no re-fitting).
test_cv_result = pipefit.transform(test_data)

In [24]:
# Show the first transformed test row as a quick check.
test_cv_result.head(1)

[Row(movie_id=1335380, plot=['film', 'base', 'event', 'happen', 'ship', 'exodus', '', 'well', 'event', 'deal', 'found', 'state', 'israel', '', 'nurs', 'katherin', 'kitti', 'fremont', '', 'american', 'volunt', 'karaolo', 'intern', 'camp', 'cyprus', 'thousand', 'jew', '', 'holocaust', 'survivor', '', 'held', 'british', 'wont', 'let', 'go', 'palestin', 'anxious', 'wait', 'day', 'liber', 'ari', 'ben', 'canaan', '', 'hagannah', 'rebel', 'previous', 'captain', 'jewish', 'brigad', 'british', 'armi', 'second', 'world', 'war', 'obtain', 'cargo', 'ship', 'smuggl', '', 'jewish', 'inmat', 'camp', 'illeg', 'voyag', 'mandat', 'palestin', 'discov', 'militari', 'author', 'british', 'find', 'refuge', 'ship', 'harbor', 'famagusta', 'blockad', 'refuge', 'stage', 'hunger', 'strike', 'camp', 'doctor', 'die', 'ari', 'threaten', 'blow', 'ship', 'refuge', 'british', 'relent', 'allow', 'exodus', 'safe', 'passag', 'meanwhil', 'kitti', 'grown', 'fond', 'karen', 'hansen', '', 'young', 'danishjewish', 'girl', 'sea

In [None]:
# >>>>>>MODEL LOADING<<<<<<
# Disabled: the code below is wrapped in a string literal and never executes.
# NOTE(review): before re-enabling, reconcile it with the save cell at the
# end of the notebook:
#   * it reads from './Part2/LGModel2-N', while the save cell writes to
#     '/home/cse587/DICNEWFinal/LGModel2-N/';
#   * it never defines lrModel1 and loads suffix '-2' into lrModel2, whereas
#     the save cell stores lrModel1 under '-2', lrModel2 under '-3', and so on
#     (and suffix '-20' is never loaded).
'''from pyspark.ml.classification import LogisticRegressionModel

savePath = './Part2/LGModel2'
lrModel = LogisticRegressionModel.load(savePath + '-1')
lrModel2 = LogisticRegressionModel.load(savePath + '-2')
lrModel3 = LogisticRegressionModel.load(savePath + '-3')
lrModel4 = LogisticRegressionModel.load(savePath + '-4')
lrModel5 = LogisticRegressionModel.load(savePath + '-5')
lrModel6 = LogisticRegressionModel.load(savePath + '-6')
lrModel7 = LogisticRegressionModel.load(savePath + '-7')
lrModel8 = LogisticRegressionModel.load(savePath + '-8')
lrModel9 = LogisticRegressionModel.load(savePath + '-9')
lrModel10 = LogisticRegressionModel.load(savePath + '-10')
lrModel11 = LogisticRegressionModel.load(savePath + '-11')
lrModel12 = LogisticRegressionModel.load(savePath + '-12')
lrModel13 = LogisticRegressionModel.load(savePath + '-13')
lrModel14 = LogisticRegressionModel.load(savePath + '-14')
lrModel15 = LogisticRegressionModel.load(savePath + '-15')
lrModel16 = LogisticRegressionModel.load(savePath + '-16')
lrModel17 = LogisticRegressionModel.load(savePath + '-17')
lrModel18 = LogisticRegressionModel.load(savePath + '-18')
lrModel19 = LogisticRegressionModel.load(savePath + '-19')'''

In [25]:
# Score the TF-IDF test features with each of the 20 per-genre models.
# predictions<k> holds the output of lrModel<k> (no suffix = genre 0).
_genre_models = (
    lrModel, lrModel1, lrModel2, lrModel3, lrModel4,
    lrModel5, lrModel6, lrModel7, lrModel8, lrModel9,
    lrModel10, lrModel11, lrModel12, lrModel13, lrModel14,
    lrModel15, lrModel16, lrModel17, lrModel18, lrModel19,
)
(
    predictions, predictions1, predictions2, predictions3, predictions4,
    predictions5, predictions6, predictions7, predictions8, predictions9,
    predictions10, predictions11, predictions12, predictions13, predictions14,
    predictions15, predictions16, predictions17, predictions18, predictions19,
) = [genre_model.transform(test_cv_result) for genre_model in _genre_models]

In [26]:
# Peek at the first predicted value produced by lrModel12.
predictions12.select('prediction').head(1)

[Row(prediction=0.0)]

In [27]:
from pyspark.sql import functions as F

In [28]:
def _collect_column(frame, column_name):
    """Collect every value of `column_name` from `frame` into one Python list."""
    return frame.select(F.collect_list(column_name)).first()[0]


# NOTE(review): `dict` shadows the Python builtin of the same name; it is kept
# because the output-writing cell below stores its results in it.
dict = {}

# NOTE(review): ids and predictions are collected by separate Spark jobs and
# later matched by list position - this presumes every job returns rows in
# the same order; confirm.
movie_id = _collect_column(predictions, 'movie_id')

# pred<k> is the prediction list of genre k-1 (pred1 <- predictions, ...).
pred1, pred2, pred3, pred4, pred5, pred6, pred7 = [
    _collect_column(frame, 'prediction')
    for frame in (
        predictions, predictions1, predictions2, predictions3,
        predictions4, predictions5, predictions6,
    )
]
print("intermediate testing")
(
    pred8, pred9, pred10, pred11, pred12, pred13, pred14,
    pred15, pred16, pred17, pred18, pred19, pred20,
) = [
    _collect_column(frame, 'prediction')
    for frame in (
        predictions7, predictions8, predictions9, predictions10,
        predictions11, predictions12, predictions13, predictions14,
        predictions15, predictions16, predictions17, predictions18,
        predictions19,
    )
]

intermediate testing


In [29]:
from csv import writer

In [30]:
def append_list_as_row(filename, elements):
    """Append `elements` as one CSV row to the end of `filename`.

    The file is opened in append mode (created if it does not exist) with
    newline translation disabled, so the csv writer controls line endings.
    """
    with open(filename, 'a+', newline='') as csv_file:
        writer(csv_file).writerow(elements)

In [31]:
# Prediction lists in genre order; position k holds the 0/1 flags for genre k.
_per_genre_predictions = (
    pred1, pred2, pred3, pred4, pred5, pred6, pred7, pred8, pred9, pred10,
    pred11, pred12, pred13, pred14, pred15, pred16, pred17, pred18, pred19, pred20,
)

# For every test movie build the space-separated string of its 20 flags,
# remember it in `dict` and append "<movie_id>,<flags>" to the output CSV.
for i in range(0, len(pred1)):
    p = " ".join(str(int(genre_flags[i])) for genre_flags in _per_genre_predictions)
    dict[movie_id[i]] = p
    row_contents = [movie_id[i], p]
    append_list_as_row("/home/cse587/DICNEWFinal/output2.csv", row_contents)

In [32]:
# Persist every per-genre model. Directory suffixes are 1-based, so the model
# for genre k (lrModel, lrModel1, ...) is written to LGModel2-<k+1>/.
_models_to_save = (
    lrModel, lrModel1, lrModel2, lrModel3, lrModel4,
    lrModel5, lrModel6, lrModel7, lrModel8, lrModel9,
    lrModel10, lrModel11, lrModel12, lrModel13, lrModel14,
    lrModel15, lrModel16, lrModel17, lrModel18, lrModel19,
)
for dir_suffix, genre_model in enumerate(_models_to_save, start=1):
    genre_model.save("/home/cse587/DICNEWFinal/LGModel2-" + str(dir_suffix) + "/")