## 1. Import and initialize PySpark environment

- Import the Spark environment using findspark
- Initialize the Spark environment
- Create the SparkContext


In [1]:
# List the files present in the input directory (shell command run from the notebook).
!ls ./data/input/

movies.txt.gz


In [2]:
from os import path

# Directory layout used by the notebook; every path is derived from ROOT_DIR.
ROOT_DIR = "./"
DATA_DIR = path.join(ROOT_DIR, 'data')
# DATA_DIR already contains ROOT_DIR, so build on DATA_DIR alone. Joining
# ROOT_DIR a second time yields a wrong path for any relative root other
# than "./" (e.g. "../proj" -> "../proj/../proj/data/input").
INPUT_DATA_PATH = path.join(DATA_DIR, 'input')

## 2. Read data

Load every line of the text file(s) in the input directory into an RDD. The lines are split into fields with `map` and `filter` in the next section.


In [3]:
# Read the contents of the input directory as an RDD with one element per text line.
data = sc.textFile(name=INPUT_DATA_PATH)

## 3. Data Exploration

In [4]:
"""
Prepare the data:
    ` Split lines into 2 part: the Key and the Value
    ` Remove blank lines
"""

data = data.map(lambda line: line.split(': ', 1)) \
    .filter(lambda line: len(line)>1)

In [5]:
# Preview the first 100 [key, value] pairs.
data.take(num=100)

[[u'product/productId', u'B003AI2VGA'],
 [u'review/userId', u'A141HP4LYPWMSR'],
 [u'review/profileName', u'Brian E. Erland "Rainbow Sphinx"'],
 [u'review/helpfulness', u'7/7'],
 [u'review/score', u'3.0'],
 [u'review/time', u'1182729600'],
 [u'review/summary',
  u'"There Is So Much Darkness Now ~ Come For The Miracle"'],
 [u'review/text',
  u'Synopsis: On the daily trek from Juarez, Mexico to El Paso, Texas an ever increasing number of female workers are found raped and murdered in the surrounding desert. Investigative reporter Karina Danes (Minnie Driver) arrives from Los Angeles to pursue the story and angers both the local police and the factory owners who employee the undocumented aliens with her pointed questions and relentless quest for the truth.<br /><br />Her story goes nationwide when a young girl named Mariela (Ana Claudia Talancon) survives a vicious attack and walks out of the desert crediting the Blessed Virgin for her rescue. Her story is further enhanced when the "Wounds

In [6]:
"""
Count the data
To make sure there is no data incompliance 
"""

counter = data.countByKey()
counter

defaultdict(int,
            {u"\t infects a dog, and in a wonderfully gory scene, literally busts out of the dog's body (if you're an animal lover, don't watch that).  The Thing then begins to take the form of members of the team of scientists, and this is where the suspense really builds.  The people are trapped, no way out, with each other, and one of them is the Thing.  At first, when I saw the end of the movie, I thought, &quot;What the hell kind of ending was that?&quot;  But it is really cool, actually.  It just goes back to the question that is asked throughout the whole movie": 1,
             u'product/productId': 7911684,
             u'review/helpfulness': 7911684,
             u'review/profileName': 7911684,
             u'review/score': 7911684,
             u'review/summary': 7911684,
             u'review/text': 7911684,
             u'review/time': 7911684,
             u'review/userId': 7911684})

In [54]:
"""
Apply some countByValue to findout:

    ` Number of reviews from each user 
    ` Number of reviews each movie get
"""

nreview_by_user = data\
    .map(lambda line: (line[0],line[1])) \
    .filter(lambda (k, v): k == 'review/userId') \
    .map(lambda (k, v): v)\
    .countByValue()

nreview_for_movie = data \
    .map(lambda line: (line[0],line[1])) \
    .filter(lambda (k, v): k=='product/productId') \
    .map(lambda (k, v): v)\
    .countByValue()

nreview_by_user, nreview_for_movie

(defaultdict(int,
             {u'A19WNZKV6F2LPY': 39,
              u'A3QE2SBOC3TT9E': 2,
              u'A1Y3YXKTRLVFYU': 12,
              u'A3VHU3SJIKXCRB': 5,
              u'A3JO2U6LCNLN25': 29,
              u'A21TGXU3IAAZYG': 7,
              u'A2H2CQQ2DKESTH': 2,
              u'AXFI5GC8L7T3J': 3,
              u'A3DIE2TCLHJ32I': 1,
              u'A2Z62OYF83O8UO': 1,
              u'A1W35N82WPQ086': 2,
              u'A15R8PE0TZXGHA': 1,
              u'A2J23N050TQP7A': 6,
              u'A2CIOZ1DRT2VU6': 4,
              u'A2XK4BFJBDU9H': 2,
              u'A2PJQE1X21XWB1': 4,
              u'A2AMNN5K9XEI85': 4,
              u'A21BS8RZ80IIQE': 2,
              u'A1Y8FOSGJDA0M6': 21,
              u'AWTIQ41SH2RCQ': 3,
              u'A2JKML8WO0LSV9': 5,
              u'A3QMYGIY6EISC8': 115,
              u'A15DN5UWZAMHQI': 5,
              u'A15CX39KS2KRK4': 2,
              u'AZ1Y7X5KTRMZQ': 2,
              u'A46JZ6NL7KM2F': 1,
              u'A26UK7IG6HKZBK': 3,
         

In [55]:
"""
Apply aggregation to findout: 
    ` Who are the most active users?
    ` Wh bat is the most common movies?
"""

num = 10

most_active = sc.parallelize(nreview_by_user.iteritems()) \
    .takeOrdered(num, key=lambda v: -v[1])

most_common = sc.parallelize(nreview_for_movie.iteritems()) \
    .takeOrdered(num, key=lambda v: -v[1])

print("Some of the most active users: ", most_active)
print("Some of the most common movies: ", most_common)


# Need to verify

('Some of the most active users: ', [(u'A16CZRQL23NOIW', 10793), (u'A3LZGLA88K0LA0', 10304), (u'ANCOMAI0I7LVG', 9790), (u'A2NJO6YE954DBH', 9713), (u'ABO2ZI2Y5DQ9T', 8020), (u'A35ZK3M8L9JUPX', 7695), (u'A39CX0EE4BZCZC', 7659), (u'A10ODC971MDHV8', 7548), (u'A328S9RN3U5M68', 7448), (u'AJKWF4W7QD4NS', 7144)])
('Some of the most common movies: ', [(u'B002QZ1RS6', 957), (u'B007FQDPL8', 956), (u'B001KZG99A', 925), (u'B001GAPC1K', 925), (u'B001I1NGHY', 925), (u'B001C08RHA', 925), (u'B001FD5KJM', 925), (u'B00005JPS8', 925), (u'B001FZ9AAU', 924), (u'B001HUHBE0', 924)])


## 4. Visualization

## 5. Data transformation and save

We want to build a recommendation system that suggests movies users might be interested in, based on their historical reviews. The rating score is the signal we will use, so in this part we extract the relevant fields and transform them into a ratings table.

In [None]:
# Fields that make up one rating, in the column order of the output rows.
RATING_FIELDS = ('product/productId', 'review/userId', 'review/score')


def _to_rating_row(indexed_group):
    """Turn (review_index, iterable of (field, value)) into one flat row.

    Values are looked up by field name because the order of values inside a
    groupByKey group is not guaranteed; positional unpacking could silently
    swap product, user and score.

    Raises:
        KeyError: if a group is missing one of RATING_FIELDS.
    """
    review_index, fields = indexed_group
    by_name = dict(fields)
    return (review_index,) + tuple(by_name[name] for name in RATING_FIELDS)


# Keep only the rating fields, number them consecutively, and use
# index // 3 as the review id so that the three fields of one review share a
# key. This assumes every review contributes exactly these three fields,
# consecutively.
# Index-based lambdas replace the Python-2-only `lambda (k, v): ...` form.
ratings = (
    data
    .filter(lambda pair: pair[0] in RATING_FIELDS)
    .zipWithIndex()
    .map(lambda item: (item[1] // len(RATING_FIELDS), tuple(item[0])))
    .groupByKey()
    .map(_to_rating_row)
    .toDF(['ID', 'movieId', 'userId', 'score'])
)

ratings.show()

In [57]:
""" Store the ratings data as zipped csv """

target = os.path.join(DATA_DIR, 'ratings')

# ratings.collumns = ['ID', 'movieId', 'userId', 'score']
ratings.write.csv(target, mode='overwrite', compression='gzip')