In [1]:
import findspark
import numpy as np
import pandas as pd
from nltk.stem.snowball import SnowballStemmer

In [2]:
# Initialise findspark with the Spark 2.4.0 installation directory before
# pyspark is imported in the next cell.
# NOTE(review): this is a hardcoded, machine-specific absolute path; consider
# reading it from an environment variable / config cell instead.
findspark.init('/home/cse587/spark-2.4.0-bin-hadoop2.7')

In [3]:
import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
import pyspark.sql.types as tp
from pyspark.sql.functions import *
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, CountVectorizer

# Create (or reuse) the Spark session shared by all later cells.
spark = (
    SparkSession.builder
    .appName("Data preprocessing")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Training set: rows with a missing genre, plot or movie_id are dropped
# immediately after the read.
dataframe = spark.read.csv(
    "/home/cse587/Downloads/diccsvs/train.csv",
    escape="\"",
    inferSchema=True,
    header=True,
).na.drop(subset=["genre", "plot", "movie_id"])
dataframe.printSchema()

# mapping.csv is read with the same CSV options as the training set.
df_mapping = spark.read.csv(
    "/home/cse587/Downloads/diccsvs/mapping.csv",
    escape="\"",
    inferSchema=True,
    header=True,
)


root
 |-- movie_id: string (nullable = true)
 |-- movie_name: string (nullable = true)
 |-- plot: string (nullable = true)
 |-- genre: string (nullable = true)



In [38]:
from pyspark.ml.feature import IDF
from pyspark.ml import Pipeline

In [5]:
#clean text
# plot : lower-cased, every character that is not an ASCII letter or
#        whitespace is deleted (not replaced by a space).
# genre: lower-cased, keeps letters, '-', '/', ',' and whitespace so that
#        multi-genre strings such as "world cinema, drama" stay splittable.
# The hyphen is written as "\\-" so Python passes the two characters `\-` to
# the regex engine without relying on an invalid string escape sequence.
df_clean = dataframe.select(
    'movie_id',
    'movie_name',
    lower(regexp_replace('plot', "[^a-zA-Z\\s]", "")).alias('plot'),
    lower(regexp_replace("genre", "[^a-zA-Z\\-/,\\s]", "")).alias("genre"),
)

In [6]:
# Lower-cased genre name -> integer label index (0..19).
_GENRE_TO_INDEX = {
    'drama': 0,
    'comedy': 1,
    'romance film': 2,
    'thriller': 3,
    'action': 4,
    'world cinema': 5,
    'crime fiction': 6,
    'horror': 7,
    'black-and-white': 8,
    'indie': 9,
    'action/adventure': 10,
    'adventure': 11,
    'family film': 12,
    'short film': 13,
    'romantic drama': 14,
    'animation': 15,
    'musical': 16,
    'science fiction': 17,
    'mystery': 18,
    'romantic comedy': 19,
}


def replacelabel(x):
    """Convert a comma-separated genre string into a list of label indices.

    Parameters
    ----------
    x : str or None
        Genre names separated by commas, e.g. ``"world cinema, drama"``.
        Whitespace around each name is ignored.

    Returns
    -------
    list of int
        One index per recognised genre, in the order the genres appear in
        ``x``. Unknown genres are skipped; ``None`` or an empty string gives
        an empty list.
    """
    if not x:
        return []
    num_label = []
    # Split on the bare comma and strip, so "a, b" and "a,b" behave alike.
    for raw_label in x.split(","):
        index = _GENRE_TO_INDEX.get(raw_label.strip())
        if index is not None:
            num_label.append(index)
    return num_label

In [7]:
# Expose replacelabel as a Spark UDF returning array<int>, then store its
# result for the cleaned genre string in a new `genre_value` column.
label_udf = udf(replacelabel, ArrayType(IntegerType()))
df_clean = df_clean.withColumn('genre_value', label_udf(col('genre')))

In [8]:
# Sanity check: the schema should now include the array<int> `genre_value` column.
df_clean.printSchema()

root
 |-- movie_id: string (nullable = true)
 |-- movie_name: string (nullable = true)
 |-- plot: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- genre_value: array (nullable = true)
 |    |-- element: integer (containsNull = true)



In [9]:
#Tokenize Plot Text
# `tokenizer` is reused later for the test set, so it stays a top-level name.
tokenizer = Tokenizer(inputCol='plot', outputCol='plot_token')
df_words_token = (
    tokenizer
    .transform(df_clean)
    .select("movie_id", "movie_name", "plot_token", "genre", "genre_value")
)

In [10]:
#Remove StopWords
# `remover` is reused later for the test set, so it stays a top-level name.
remover = StopWordsRemover(inputCol='plot_token', outputCol='plot_clean')
df_words_token_rem_stopwor = (
    remover
    .transform(df_words_token)
    .select("movie_id", "movie_name", "plot_clean", "genre", "genre_value")
)

In [11]:
#Text Stemming
stemmer = SnowballStemmer(language='english')


def _stem_tokens(tokens):
    """Return the Snowball (English) stem of every token in `tokens`."""
    return [stemmer.stem(token) for token in tokens]


stem_udf = udf(_stem_tokens, ArrayType(StringType()))

# Stem the stop-word-free tokens, keep only the columns needed for modelling,
# then rename: words_stemmed -> plot, genre_value -> label.
df_stemmed = (
    df_words_token_rem_stopwor
    .withColumn("words_stemmed", stem_udf("plot_clean"))
    .select('movie_id', "words_stemmed", "genre", "genre_value")
    .withColumnRenamed("words_stemmed", "plot")
    .withColumnRenamed("genre_value", "label")
)

In [12]:
# Add one integer indicator column per genre index (col_0 .. col_19), each
# initialised to the literal 0.
for genre_idx in range(20):
    df_stemmed = df_stemmed.withColumn("col_{}".format(genre_idx), lit(0))

In [13]:
# Explicit Spark schema for the modelling frame: four descriptive columns
# followed by the 20 integer indicator columns col_0 .. col_19.
# All types are taken from the `tp` alias for consistency.
my_schema = tp.StructType(
    [
        tp.StructField(name='movie_id', dataType=tp.StringType(), nullable=True),
        tp.StructField(name='plot', dataType=tp.ArrayType(tp.StringType()), nullable=True),
        tp.StructField(name='genre', dataType=tp.StringType(), nullable=True),
        tp.StructField(name='label', dataType=tp.ArrayType(tp.IntegerType()), nullable=True),
    ]
    + [
        tp.StructField(name='col_{}'.format(genre_idx), dataType=tp.IntegerType(), nullable=True)
        for genre_idx in range(20)
    ]
)

In [14]:
# Pull the frame to the driver and one-hot encode `label`:
# col_k becomes 1 when k is in the row's label list, otherwise 0.
# NOTE: toPandas() materialises the whole dataset in driver memory.
result_df = df_stemmed.select("*").toPandas()
for genre_idx in range(20):
    result_df["col_{}".format(genre_idx)] = (
        result_df["label"]
        # k=genre_idx binds the current index; a missing label list counts as "no genres".
        .apply(lambda labels, k=genre_idx: int(labels is not None and k in labels))
        # keep a 32-bit integer column to match the IntegerType schema used afterwards
        .astype("int32")
    )

In [15]:
# Convert the one-hot encoded pandas frame back into a Spark DataFrame,
# enforcing the explicit column types declared in `my_schema`.
df_stemmed = spark.createDataFrame(result_df, schema = my_schema)

In [16]:
# Inspect the first row to confirm the col_* indicators agree with `label`.
df_stemmed.head(1)

[Row(movie_id='23890098', plot=['shlykov', 'hardwork', 'taxi', 'driver', 'lyosha', 'saxophonist', 'develop', 'bizarr', 'loveh', 'relationship', 'despit', 'prejudic', 'realiz', 'arent', 'differ'], genre='world cinema, drama', label=[5, 0], col_0=1, col_1=0, col_2=0, col_3=0, col_4=0, col_5=1, col_6=0, col_7=0, col_8=0, col_9=0, col_10=0, col_11=0, col_12=0, col_13=0, col_14=0, col_15=0, col_16=0, col_17=0, col_18=0, col_19=0)]

In [55]:
#filter words whose length is greater than 3
def _drop_short_tokens(tokens):
    """Return only the tokens that are longer than three characters."""
    return [token for token in tokens if len(token) > 3]


filter_length_udf = udf(_drop_short_tokens, ArrayType(StringType()))

# `plot_length` holds the length-filtered token list (not a numeric length).
df_stemmed = df_stemmed.withColumn('plot_length', filter_length_udf(col('plot')))
data = df_stemmed.select("*")
data.head(1)

[Row(movie_id='23890098', plot=['shlykov', 'hardwork', 'taxi', 'driver', 'lyosha', 'saxophonist', 'develop', 'bizarr', 'loveh', 'relationship', 'despit', 'prejudic', 'realiz', 'arent', 'differ'], genre='world cinema, drama', label=[5, 0], col_0=1, col_1=0, col_2=0, col_3=0, col_4=0, col_5=1, col_6=0, col_7=0, col_8=0, col_9=0, col_10=0, col_11=0, col_12=0, col_13=0, col_14=0, col_15=0, col_16=0, col_17=0, col_18=0, col_19=0, plot_length=['shlykov', 'hardwork', 'taxi', 'driver', 'lyosha', 'saxophonist', 'develop', 'bizarr', 'loveh', 'relationship', 'despit', 'prejudic', 'realiz', 'arent', 'differ'])]

In [None]:
# Bag-of-words features: fit the vocabulary on the filtered tokens, using only
# terms that occur in at least two documents (minDF=2.0).
cv = CountVectorizer(
    inputCol="plot_length",
    outputCol="features",
    minDF=2.0,
)
cv_model = cv.fit(data)
cv_result = cv_model.transform(data)
cv_result.show(truncate=False)

In [87]:
# Show the schema of the vectorised training frame.
cv_result.printSchema()

root
 |-- movie_id: string (nullable = true)
 |-- plot: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- genre: string (nullable = true)
 |-- label: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- col_0: integer (nullable = true)
 |-- col_1: integer (nullable = true)
 |-- col_2: integer (nullable = true)
 |-- col_3: integer (nullable = true)
 |-- col_4: integer (nullable = true)
 |-- col_5: integer (nullable = true)
 |-- col_6: integer (nullable = true)
 |-- col_7: integer (nullable = true)
 |-- col_8: integer (nullable = true)
 |-- col_9: integer (nullable = true)
 |-- col_10: integer (nullable = true)
 |-- col_11: integer (nullable = true)
 |-- col_12: integer (nullable = true)
 |-- col_13: integer (nullable = true)
 |-- col_14: integer (nullable = true)
 |-- col_15: integer (nullable = true)
 |-- col_16: integer (nullable = true)
 |-- col_17: integer (nullable = true)
 |-- col_18: integer (nullable = true)
 |-- col_19: intege

In [None]:
# Peek at a single vectorised row.
cv_result.head(1)

In [68]:
# Number of distinct terms kept by the fitted CountVectorizer.
print(len(cv_model.vocabulary))

42783


In [19]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, OneVsRest
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [24]:
# Train one independent binary logistic-regression model per genre indicator
# column (col_0 .. col_19) on the CountVectorizer features in `cv_result`.
#
# NOTE: `summary.accuracy` is computed on the training data itself, so the
# printed numbers are not an estimate of held-out performance.
# NOTE(review): regParam is never set here; per the Spark docs elasticNetParam
# only controls the L1/L2 mix of that penalty - confirm that training without
# regularisation is intended.
# NOTE(review): col_3 is the only model that was configured with
# elasticNetParam = 0 instead of 0.1; that is preserved below - confirm it is
# deliberate.
elastic_net_overrides = {3: 0}

lr_estimators = [
    LogisticRegression(
        maxIter=20,
        elasticNetParam=elastic_net_overrides.get(genre_idx, 0.1),
        featuresCol='features',
        labelCol='col_{}'.format(genre_idx),
        fitIntercept=True,
    )
    for genre_idx in range(20)
]

lr_models = []
for estimator in lr_estimators:
    fitted_model = estimator.fit(cv_result)
    print(fitted_model.summary.accuracy)
    lr_models.append(fitted_model)

# Later cells refer to the estimators and models by these individual names.
(lr, lr1, lr2, lr3, lr4, lr5, lr6, lr7, lr8, lr9,
 lr10, lr11, lr12, lr13, lr14, lr15, lr16, lr17, lr18, lr19) = lr_estimators
(lrModel, lrModel1, lrModel2, lrModel3, lrModel4,
 lrModel5, lrModel6, lrModel7, lrModel8, lrModel9,
 lrModel10, lrModel11, lrModel12, lrModel13, lrModel14,
 lrModel15, lrModel16, lrModel17, lrModel18, lrModel19) = lr_models

0.998296312964094
0.9990356488476003
0.9995821145006268
0.9997749847311067
0.9998392748079334
0.9997749847311067
0.9997428396926934
0.9999357099231734
0.9998392748079334
0.9999035648847601
1.0
0.9999035648847601
0.9998714198463468
0.9996785496158668
0.9999357099231734
1.0
0.9999357099231734
1.0
1.0
1.0


In [None]:
"""paramGrid = (ParamGridBuilder().addGrid(lr.regParam,[0.1,0.3,0.5])\
             .addGrid(lr.elasticNetParam, [0.0,0.1,0.2])\
             .addGrid(lr.labelCol, 'col_0')
            .addGrid(lr.maxIter, [10,20,50]).build())
evaluator = MulticlassClassificationEvaluator(predictionCol = 'prediction')
cv = CrossValidator(estimator=lr, evaluator=evaluator, estimatorParamMaps = paramGrid, numFolds=5)
cvModel = cv.fit(cv_result)"""

In [80]:
"""idf = IDF(inputCol="rawfeatures", outputCol = "features")
idfModel = idf.fit(cv_result)
rescaledData = idfModel.transform(cv_result)
idfModel = lr.fit(rescaledData)
print(idfModel.summary.accuracy)"""

0.9984891831945739


In [81]:
"""idfModel1 = lr1.fit(rescaledData)
print(idfModel1.summary.accuracy)"""

0.9994856793853869


In [None]:
"""rf = RandomForestClassifier(featuresCol = 'features', labelCol='col_0', numTrees = 10, maxDepth = 4)
rfModel = rf.fit(cv_result)
print(rfModel.summary.accuracy)"""

In [133]:
# Test set: same CSV options as the training read; rows without a plot or
# movie_id are dropped (there is no genre column to check here).
test_df = spark.read.csv(
    "/home/cse587/Downloads/diccsvs/test.csv",
    escape="\"",
    inferSchema=True,
    header=True,
).na.drop(subset=["plot", "movie_id"])
test_df.printSchema()

#clean text
test_df_clean = test_df.select(
    'movie_id',
    'movie_name',
    lower(regexp_replace('plot', "[^a-zA-Z\\s]", "")).alias('plot'),
)

#Tokenize Plot Text
test_df_words_token = (
    tokenizer
    .transform(test_df_clean)
    .select("movie_id", "movie_name", "plot_token")
)

#Remove StopWords
test_df_words_token_rem_stopwor = (
    remover
    .transform(test_df_words_token)
    .select("movie_id", "movie_name", "plot_clean")
)

root
 |-- movie_id: integer (nullable = true)
 |-- movie_name: string (nullable = true)
 |-- plot: string (nullable = true)



In [134]:

#Text Stemming
test_df_stemmed = (
    test_df_words_token_rem_stopwor
    .withColumn("words_stemmed", stem_udf("plot_clean"))
    .select('movie_id', "words_stemmed")
    .withColumnRenamed("words_stemmed", "plot")
)

# Same token-length filter that was applied to the training data.
test_df_stemmed = test_df_stemmed.withColumn('plot_length', filter_length_udf(col('plot')))
test_data = test_df_stemmed.select("movie_id", "plot", "plot_length")

#Count Vectorizer
# Use the CountVectorizer model fitted on the training data: the lrModel*
# classifiers available at this point were trained on those features.
# `pipefit` (HashingTF + IDF) is only defined in a later cell, so referring to
# it here raises NameError when the notebook is run top to bottom.
test_cv_result = cv_model.transform(test_data)

In [None]:
# Peek at a single featurised test row.
test_cv_result.head(1)

In [135]:
# Score the featurised test set with each per-genre model. transform() is
# lazy, so nothing is computed until the predictions are collected.
genre_models = [
    lrModel, lrModel1, lrModel2, lrModel3, lrModel4,
    lrModel5, lrModel6, lrModel7, lrModel8, lrModel9,
    lrModel10, lrModel11, lrModel12, lrModel13, lrModel14,
    lrModel15, lrModel16, lrModel17, lrModel18, lrModel19,
]
genre_predictions = [model.transform(test_cv_result) for model in genre_models]

# Later cells refer to the prediction frames by these individual names.
(predictions, predictions1, predictions2, predictions3, predictions4,
 predictions5, predictions6, predictions7, predictions8, predictions9,
 predictions10, predictions11, predictions12, predictions13, predictions14,
 predictions15, predictions16, predictions17, predictions18, predictions19) = genre_predictions

In [126]:
# Spot-check one prediction from the col_12 model.
predictions12.select('prediction').head(1)

[Row(prediction=0.0)]

In [127]:
from pyspark.sql import functions as F

In [136]:
# NOTE: `dict` shadows the built-in type; the name is kept only because later
# cells store into and print this variable.
dict = {}

prediction_frames = [
    predictions, predictions1, predictions2, predictions3, predictions4,
    predictions5, predictions6, predictions7, predictions8, predictions9,
    predictions10, predictions11, predictions12, predictions13, predictions14,
    predictions15, predictions16, predictions17, predictions18, predictions19,
]

# Collect movie_id together with each prediction instead of running separate
# collect_list aggregations: the element order of collect_list is not
# guaranteed, which could silently pair a movie with another movie's output.
movie_id = [row['movie_id'] for row in predictions.select('movie_id').collect()]

collected_predictions = []
for frame_idx, frame in enumerate(prediction_frames):
    rows = frame.select('movie_id', 'prediction').collect()
    if [row['movie_id'] for row in rows] != movie_id:
        raise ValueError(
            "prediction frame {} is not aligned with the movie_id order".format(frame_idx)
        )
    collected_predictions.append([row['prediction'] for row in rows])

# pred1 .. pred20 hold the predictions of the col_0 .. col_19 models.
(pred1, pred2, pred3, pred4, pred5, pred6, pred7, pred8, pred9, pred10,
 pred11, pred12, pred13, pred14, pred15, pred16, pred17, pred18, pred19,
 pred20) = collected_predictions

enna


In [49]:
#evaluator = MulticlassClassificationEvaluator(predictionCol = "prediction")
#evaluator.evaluate(predictions)

In [137]:
# Number of predictions collected for the last genre model.
print(len(pred20))

7777


In [138]:
from csv import writer

In [139]:
def append_list_as_row(filename, elements):
    """Append `elements` as a single CSV record to the end of `filename`.

    The file is opened in append mode on every call, so existing content is
    kept and repeated calls add further rows.
    """
    with open(filename, 'a+', newline='') as out_file:
        writer(out_file).writerow(elements)

In [140]:
# For every test movie build the space-separated 0/1 string of its 20 genre
# predictions (col_0 first), remember it in `dict` and append it to final.csv.
# NOTE: rows are appended, so re-running this cell duplicates the file content.
pred_columns = [
    pred1, pred2, pred3, pred4, pred5, pred6, pred7, pred8, pred9, pred10,
    pred11, pred12, pred13, pred14, pred15, pred16, pred17, pred18, pred19, pred20,
]
for i in range(0, len(pred1)):
    p = " ".join(str(int(column[i])) for column in pred_columns)
    dict[movie_id[i]] = p
    row_contents = [movie_id[i], p]
    append_list_as_row("/home/cse587/Downloads/diccsvs/final.csv", row_contents)

In [None]:
# Dump the complete movie_id -> prediction-string mapping.
# NOTE(review): this prints one entry per test movie; consider showing only a sample.
print(dict)

In [141]:
# TF-IDF features: hash the filtered tokens into term frequencies, then apply
# an IDF weighting with minDocFreq=30.
TF = HashingTF(inputCol='plot_length', outputCol="rawFeatures")
idf = IDF(
    inputCol='rawFeatures',
    outputCol="features",
    minDocFreq=30,
)
pipeline = Pipeline(stages=[TF, idf])
pipefit = pipeline.fit(data)
trainingData = pipefit.transform(data)

In [142]:
# Re-train the 20 per-genre binary logistic-regression models, this time on
# the TF-IDF features in `trainingData`. This rebinds lr*/lrModel*, replacing
# the models fitted earlier on the CountVectorizer features.
#
# NOTE: `summary.accuracy` is computed on the training data itself, so the
# printed numbers are not an estimate of held-out performance.
# NOTE(review): regParam is never set here; per the Spark docs elasticNetParam
# only controls the L1/L2 mix of that penalty - confirm that training without
# regularisation is intended.
lr_estimators = [
    LogisticRegression(
        maxIter=20,
        elasticNetParam=0.1,
        featuresCol='features',
        labelCol='col_{}'.format(genre_idx),
        fitIntercept=True,
    )
    for genre_idx in range(20)
]

lr_models = []
for estimator in lr_estimators:
    fitted_model = estimator.fit(trainingData)
    print(fitted_model.summary.accuracy)
    lr_models.append(fitted_model)

# Keep the individual names used elsewhere in the notebook.
(lr, lr1, lr2, lr3, lr4, lr5, lr6, lr7, lr8, lr9,
 lr10, lr11, lr12, lr13, lr14, lr15, lr16, lr17, lr18, lr19) = lr_estimators
(lrModel, lrModel1, lrModel2, lrModel3, lrModel4,
 lrModel5, lrModel6, lrModel7, lrModel8, lrModel9,
 lrModel10, lrModel11, lrModel12, lrModel13, lrModel14,
 lrModel15, lrModel16, lrModel17, lrModel18, lrModel19) = lr_models

0.8493040599183516
0.9086759458677554
0.9467678163875406
0.960236587482722
0.9721945417724774
0.964640457745347
0.9883956411327911
0.9993892442701469
0.9886206564016844
0.9738339387315568
0.9906457938217236
0.9951782442380018
0.9973641068501077
0.9994856793853869
0.9950175190459353
0.9993892442701469
0.9976212671574143
0.9997106946542801
0.9990999389244271
0.9996785496158668


In [143]:
from pyspark.ml.feature import Word2Vec

In [None]:
# Word2Vec estimator producing 100-dimensional vectors, ignoring tokens seen
# fewer than 3 times.
# NOTE(review): inputCol/outputCol are empty placeholder strings and the
# estimator is never fitted in the visible cells - fill in real column names
# (presumably a token-array column such as `plot_length`; confirm) before use.
word2vec = Word2Vec(vectorSize = 100, minCount = 3, inputCol="", outputCol = "" )