In [1]:
import findspark
findspark.init('/home/cse587/spark-2.4.0-bin-hadoop2.7')

In [2]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
from pyspark.ml.feature import HashingTF, IDF
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vector as MLLibVector, Vectors as MLLibVectors
from pyspark.ml.feature  import RegexTokenizer,StopWordsRemover,CountVectorizer
import pandas as pd
import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
import pyspark.sql.functions as foo

In [3]:
sc=pyspark.SparkContext()

In [4]:
spark= SparkSession \
       .builder \
       .appName("Assignment3") \
       .config("spark.some.config.option","some-value") \
       .getOrCreate() 

In [5]:
df_train = pd.read_csv('train.csv')
train=spark.createDataFrame(df_train)
train.show()

+--------+--------------------+--------------------+--------------------+
|movie_id|          movie_name|                plot|               genre|
+--------+--------------------+--------------------+--------------------+
|23890098|          Taxi Blues|Shlykov, a hard-w...|['World cinema', ...|
|31186339|    The Hunger Games|The nation of Pan...|['Action/Adventur...|
|20663735|          Narasimham|Poovalli Induchoo...|['Musical', 'Acti...|
| 2231378|  The Lemon Drop Kid|The Lemon Drop Ki...|          ['Comedy']|
|  595909|   A Cry in the Dark|Seventh-day Adven...|['Crime Fiction',...|
| 5272176|            End Game|The president is ...|['Action/Adventur...|
| 1952976|          Dark Water|{{plot}} The film...|['Thriller', 'Dra...|
|24225279|                Sing|The story begins ...|           ['Drama']|
| 2462689|       Meet John Doe|Infuriated at bei...|['Black-and-white...|
|20532852|Destination Meatball|A line of people ...|['Animation', 'Sh...|
|15401493|    Husband for Hire|Lola  a

In [6]:
df_map = pd.read_csv('mapping.csv')
mapping=spark.createDataFrame(df_map,['id','genre'])
totalgenre=mapping.count()

In [7]:
df_test = pd.read_csv('test.csv')
test=spark.createDataFrame(df_test)
test.show()

+--------+--------------------+--------------------+
|movie_id|          movie_name|                plot|
+--------+--------------------+--------------------+
| 1335380|              Exodus|The film is based...|
|29062594|A la salida nos v...|A group of teenag...|
| 9252321|   Come Back, Africa|This story of a Z...|
|13455076|       A Merry Mixup|The Stooges play ...|
|24165951|        Getting Even|A soldier-of-fort...|
| 1925869|  River of No Return|Set in the Northw...|
|10799612|          Amici miei|Like in many othe...|
|28238240|Mickey's Big Game...|Mickey and the Sc...|
|17124781|The Good, the Bad...|In the desert wil...|
|28207941|    The Dancing Fool|Bimbo and Koko ar...|
|19174305|              Tahaan|Tahaan  lives wit...|
|18392317|     Mysterious Mose|Betty is startled...|
|34420857|Kelviyum Naane Pa...|Nirmal ([[Karthik...|
| 4039635|   First on the Moon|A group of journa...|
| 8034072|  Journey of a Woman|Vaibhavari Sahay,...|
| 4016437|     Sophie's Choice|In 1947, the mo

In [8]:
rt = RegexTokenizer(inputCol="plot", outputCol="regex",pattern="\\w+", gaps=False)
train=rt.transform(train)
swr = StopWordsRemover(inputCol="regex", outputCol="filtered")
train=swr.transform(train)
train.select('movie_id','filtered').show()

+--------+--------------------+
|movie_id|            filtered|
+--------+--------------------+
|23890098|[shlykov, hard, w...|
|31186339|[nation, panem, c...|
|20663735|[poovalli, induch...|
| 2231378|[lemon, drop, kid...|
|  595909|[seventh, day, ad...|
| 5272176|[president, way, ...|
| 1952976|[plot, film, open...|
|24225279|[story, begins, h...|
| 2462689|[infuriated, told...|
|20532852|[line, people, dr...|
|15401493|[lola, attempts, ...|
|18188932|[milan, goran, tw...|
| 2940516|[bumbling, pirate...|
| 1480747|[plot, following,...|
|24448645|[despite, lucy, r...|
|15072401|[alan, colby, hei...|
| 4018288|[debbie, favorite...|
| 4596602|[ashes, ashes, se...|
|15224586|[film, follows, e...|
|15585766|[three, friends, ...|
+--------+--------------------+
only showing top 20 rows



In [9]:
htf = HashingTF(inputCol="filtered", outputCol="HTFfeatures")
train= htf.transform(train)
idf = IDF(inputCol="HTFfeatures", outputCol="features")
idfModel= idf.fit(train)
train = idfModel.transform(train)
train.show()

+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|movie_id|          movie_name|                plot|               genre|               regex|            filtered|         HTFfeatures|            features|
+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|23890098|          Taxi Blues|Shlykov, a hard-w...|['World cinema', ...|[shlykov, a, hard...|[shlykov, hard, w...|(262144,[2437,127...|(262144,[2437,127...|
|31186339|    The Hunger Games|The nation of Pan...|['Action/Adventur...|[the, nation, of,...|[nation, panem, c...|(262144,[991,1739...|(262144,[991,1739...|
|20663735|          Narasimham|Poovalli Induchoo...|['Musical', 'Acti...|[poovalli, induch...|[poovalli, induch...|(262144,[119,571,...|(262144,[119,571,...|
| 2231378|  The Lemon Drop Kid|The Lemon Drop Ki...|

In [10]:
test=rt.transform(test)
test=swr.transform(test)
test= htf.transform(test)
idf1 = IDF(inputCol="HTFfeatures", outputCol="features")
idfModel1= idf1.fit(test)
test = idfModel1.transform(test)
test.select("movie_id","features").show()

+--------+--------------------+
|movie_id|            features|
+--------+--------------------+
| 1335380|(262144,[1728,261...|
|29062594|(262144,[6068,191...|
| 9252321|(262144,[1598,208...|
|13455076|(262144,[3294,618...|
|24165951|(262144,[4098,644...|
| 1925869|(262144,[535,3294...|
|10799612|(262144,[5053,538...|
|28238240|(262144,[23060,30...|
|17124781|(262144,[5232,733...|
|28207941|(262144,[9726,626...|
|19174305|(262144,[2710,392...|
|18392317|(262144,[5213,606...|
|34420857|(262144,[11275,13...|
| 4039635|(262144,[571,1640...|
| 8034072|(262144,[991,4200...|
| 4016437|(262144,[5595,783...|
| 1520023|(262144,[14,535,5...|
|24589422|(262144,[1998,249...|
|35068740|(262144,[2710,484...|
|21132951|(262144,[1841,392...|
+--------+--------------------+
only showing top 20 rows



In [11]:
genre_map=mapping.select("genre","id").rdd.collectAsMap()
for key in genre_map:
    print(key,genre_map[key])

Drama 0
Comedy 1
Romance Film 2
Thriller 3
Action 4
World cinema 5
Crime Fiction 6
Horror 7
Black-and-white 8
Indie 9
Action/Adventure 10
Adventure 11
Family Film 12
Short Film 13
Romantic drama 14
Animation 15
Musical 16
Science Fiction 17
Mystery 18
Romantic comedy 19


In [12]:
def mapgenre(m):
    temp=[]
    for element in m[1:-1].split(","):
        temp.append(genre_map.get(element.strip()[1:-1]))
    temp.sort()
    return temp
udf1=foo.udf(mapgenre,ArrayType(IntegerType()))

train=train.withColumn("mapped",udf1("genre"))

In [13]:
train.select("mapped").show()

+----------------+
|          mapped|
+----------------+
|          [0, 5]|
|  [0, 4, 10, 17]|
|      [0, 4, 16]|
|             [1]|
|       [0, 5, 6]|
|   [0, 3, 4, 10]|
|       [0, 3, 7]|
|             [0]|
|[0, 1, 2, 8, 19]|
|    [12, 13, 15]|
|             [1]|
|    [0, 1, 5, 6]|
|             [1]|
|             [1]|
|             [7]|
|   [3, 6, 7, 18]|
|             [0]|
| [2, 3, 4, 6, 9]|
|          [0, 9]|
|             [0]|
+----------------+
only showing top 20 rows



In [14]:
def finalmap(m):
    temp=[]
    for i in range(0,totalgenre):
        temp.append(0)
    for i in m:
        temp[i]=1
    return temp
udf2=foo.udf(finalmap,ArrayType(IntegerType()))
train=train.withColumn("label",udf2("mapped"))

In [15]:
train.select("label").show()

+--------------------+
|               label|
+--------------------+
|[1, 0, 0, 0, 0, 1...|
|[1, 0, 0, 0, 1, 0...|
|[1, 0, 0, 0, 1, 0...|
|[0, 1, 0, 0, 0, 0...|
|[1, 0, 0, 0, 0, 1...|
|[1, 0, 0, 1, 1, 0...|
|[1, 0, 0, 1, 0, 0...|
|[1, 0, 0, 0, 0, 0...|
|[1, 1, 1, 0, 0, 0...|
|[0, 0, 0, 0, 0, 0...|
|[0, 1, 0, 0, 0, 0...|
|[1, 1, 0, 0, 0, 1...|
|[0, 1, 0, 0, 0, 0...|
|[0, 1, 0, 0, 0, 0...|
|[0, 0, 0, 0, 0, 0...|
|[0, 0, 0, 1, 0, 0...|
|[1, 0, 0, 0, 0, 0...|
|[0, 0, 1, 1, 1, 0...|
|[1, 0, 0, 0, 0, 0...|
|[1, 0, 0, 0, 0, 0...|
+--------------------+
only showing top 20 rows



In [16]:
out=[]
for i in range(0,totalgenre):
    
    def parse(row):
    
        return LabeledPoint(row.label[i], MLLibVectors.fromML(row.features))
    parsedOutput = train.rdd.map(parse)
    modelpred = LogisticRegressionWithLBFGS.train(parsedOutput)
    pred = test.rdd.map(lambda t: (t.movie_id, modelpred.predict(MLLibVectors.fromML(t.features))))
    out.append(pred.collect())

In [20]:
output={}
for element in out:
    for temp in element:
        if temp[0] not in output:
            output[temp[0]]=[]
            output[temp[0]].append(int(temp[1]))
        else:
            output[temp[0]].append(int(temp[1]))


for key in output:
    print(key,output[key])

1335380 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
29062594 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
9252321 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
13455076 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
24165951 [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
1925869 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
10799612 [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
28238240 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
17124781 [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
28207941 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0]
19174305 [0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
18392317 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0]
34420857 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
4039635 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
8034072 [1, 0, 1, 0, 0, 

16953189 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
5148040 [0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0]
76313 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
3370462 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
12231217 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
27214582 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
12228807 [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0]
9034707 [1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
11684047 [1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
9168916 [1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
4119854 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
24542564 [1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
31288540 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0]
31212825 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
706878 [0, 0, 0, 1, 0, 0, 1,

26822699 [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
32735343 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
4073660 [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
33598983 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
24050844 [1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
23420352 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2523915 [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
14733483 [1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0]
2385928 [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
13720693 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
4198067 [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0]
20951471 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]
33165833 [1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
5650338 [1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
5374123 [1, 0, 0, 0, 0, 0

31339377 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
22706056 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
33845451 [1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
15698879 [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
2451503 [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
27856465 [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
3826340 [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
3177256 [0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
4471177 [1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
2350468 [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
19299521 [0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0]
80923 [1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
31979208 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
3860172 [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
68082 [0, 1, 0, 0, 0, 0, 0, 0

In [None]:
# df = pd.DataFrame.from_dict(output,orient='index')
# df['predictions'] = df[df.columns[:]].apply(lambda x: ' '.join(x.dropna().astype(str)),axis=1)
# df.to_csv('testingsaar.csv',columns = ['predictions'])