In [1]:
import os
import numpy as np
import pandas as pd
import math
import xlrd
import random
import matplotlib.pyplot as plt
from scipy.stats import linregress as lreg
# Show floats with 10 decimals. The fully qualified key is used because option
# names are matched as patterns: the short form 'precision' is ambiguous on
# pandas versions that also define 'styler.format.precision' and raises OptionError.
pd.set_option('display.precision', 10)
import colorsys

In [2]:
# Test 1: load the 100k dataset into one flat frame, `movierated`
# Users ('|'-separated, no header row in the file, so names are supplied here)
user_columns = ['user_id', 'age', 'gender', 'occupation', 'zipcode']
users = pd.read_csv('ml-100k/u.user', names=user_columns, sep='|',
                    encoding='latin-1')

# Movie ratings (tab-separated)
rating_columns = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', names=rating_columns, sep='\t',
                      encoding='latin-1')

# Movie genres: only the first '|'-separated field of each line is kept
genres = pd.read_csv('ml-100k/u.genre', names=['genre'], usecols=[0], sep='|')

# Movie information: five descriptive fields followed by one column per
# genre name read above
movie_columns = ['movie_id', 'mtitle', 'release date', 'video release date',
                 'IMDb URL']
movie_columns = movie_columns + genres['genre'].tolist()
movies = pd.read_csv('ml-100k/u.item', names=movie_columns, sep='|',
                     encoding='latin-1')

# Merge DataFrames. No key is given, so each merge joins on the columns the
# two frames share: user_id for users/ratings; presumably only movie_id for
# the second merge (holds as long as no genre name collides with a column).
users = users.merge(ratings)
movierated = users.merge(movies)

# Drop the intermediates; only `movierated` (plus the column lists and
# `genres`) remains defined after this cell.
del movies, users, ratings

In [104]:
# Test 2: load 20M data
# links
links = pd.read_csv('ml-20m/links.csv')

# tags
tags = pd.read_csv('ml-20m/tags.csv')

# ratings.csv
ratings = pd.read_csv('ml-20m/ratings.csv')

# movies: the first two columns go into `movies`, the third one into `moviesg`
# NOTE(review): files are decoded as latin-1 here; confirm this matches the
# encoding the dataset is actually distributed in.
movies = pd.read_csv('ml-20m/movies.csv', encoding='latin-1', usecols=[0, 1])
# `tupleize_cols` is no longer passed: it only affected multi-level headers
# and is not accepted by read_csv in current pandas.
moviesg = pd.read_csv('ml-20m/movies.csv', encoding='latin-1', usecols=[2])

# The genres column is held by `moviesg`; `movies` was read without it, so
# indexing movies['genres'] raised KeyError.
temp = moviesg['genres'].str.split("|", expand=False)
# Flatten the per-movie genre lists into a single list (one entry per
# movie/genre pair, duplicates kept).
glist = [genre for row in temp for genre in row]

In [104]:
# Plot the relative frequency of each rating value 1..n as a bar chart and
# annotate every bar with its absolute count. Requires `movierated` with a
# numeric `rating` column; writes the figure to image1.png.
n = int(movierated.rating.max())
colors = np.array(['#E50029', '#E94E04', '#EEC708', '#A5F30D', '#62F610']) # 1, 2, 3, 4, and 5 stars respectively
labels = [i + 1 for i in range(n)]
indc = np.arange(n)

sizefig = plt.figure(figsize=(12, 8))
# Order the counts by rating value (1..n), not by count, so that bar i, tick
# label i+1 and colour i all describe the same rating. Rating values that
# never occur get an explicit 0 instead of shortening the series.
abs_freq = movierated.rating.value_counts().reindex(labels, fill_value=0)
rel_freq = np.array(abs_freq) / float(abs_freq.sum())

# align='edge' makes bar i span [i, i+1], which is what the tick positions
# indc+0.5 below assume (newer matplotlib centres bars by default).
rects = plt.bar(indc, rel_freq, width=1, align='edge', color=colors, alpha=0.7)
for (idx, rect) in enumerate(rects):
    # Positional lookup: row idx of abs_freq belongs to bar idx.
    plt.gca().text(rect.get_x() + rect.get_width() / 2.5,
                   1.05 * rect.get_height(),
                   '%d' % int(abs_freq.iloc[idx]))

plt.xticks(indc + 0.5, labels)
plt.xlabel('Reviews')
plt.ylabel('Relative Frequency')
plt.ylim([0, 1])
# NOTE(review): len(movierated) counts rows of the merged frame (one per
# rating), so the title's "Movies" is presumably the number of reviews.
plt.title('Reviews Distribution for {0} Movies'.format(len(movierated)))
plt.savefig('image1.png', bbox_inches='tight')
# plt.show()

In [106]:
# Distribution of genres, one figure per rating value 1..5.
# For each rating r the rows of `movierated` with that rating are selected and
# the mean of every column from position 12 onwards is plotted as a bar. Those
# columns are the per-genre columns appended after the 12 user/rating/movie
# fields; assuming they are 0/1 flags, the mean is the share of rows with
# rating r that carry the genre. Each figure is saved as image2_<r>.png.

# Everything below depends only on the column layout, not on r, so it is
# computed once instead of on every loop pass.
lst = [str(t) for t in movierated.columns[12:]]
n = len(lst)
HSV_tuples = [(x * 1.0 / n, 0.5, 0.5) for x in range(n)]
# A real list (not a lazy map object): it is passed to plt.bar on every pass
# of the loop and must not be exhausted after the first one.
RGB_tuples = [colorsys.hsv_to_rgb(*hsv) for hsv in HSV_tuples]
colors = RGB_tuples
labels = lst
indc = np.arange(n)

for r in range(1, 6):
    mv = movierated[movierated['rating'] == r]
    # Positional slice; .iloc replaces the removed .ix indexer.
    ratingg = mv.iloc[:, 12:]
    st = ratingg.describe()
    # Pick the 'mean' row by label rather than by its position in describe().
    wt = np.array(st.loc['mean'])
    rel_freq = wt

    sizefig = plt.figure(figsize=(12, 8))
    # align='edge' makes bar i span [i, i+1], matching the ticks at indc+0.5.
    rects = plt.bar(indc, rel_freq, width=1, align='edge', color=colors, alpha=0.7)

    plt.xticks(indc + 0.5, lst, rotation='vertical')
    plt.xlabel('Genre of movies')
    plt.ylabel('Relative Frequency of movies with rating {0}'.format(r))
    plt.ylim([0, 1])
    # The count in the title is the size of the subset actually plotted
    # (rows with rating r), not the size of the whole frame.
    plt.title('Genre Distribution for {0} Movies with rating {1}'.format(len(mv), r))
    plt.savefig('image2_{:d}.png'.format(r), bbox_inches='tight')


In [84]:
# Scraping data; example: search IMDb for the title in the first row of
# `movierated` and display the year of the first hit.
from lxml import html
import requests
from imdb import IMDb
ia = IMDb()

# First row by position, independent of whatever labels the index carries.
# NOTE: despite its name, `url1` holds a movie title, not a URL.
url1 = movierated["mtitle"].iloc[0]
movie = ia.search_movie(url1)
# The search may return no hits and a hit may lack a 'year' entry; show None
# in those cases instead of raising IndexError/KeyError.
movie[0].get('year') if movie else None

1994