# Split and Dummy MovieTweetings with lambda functions

Data source: [MovieTweetings Data](https://github.com/sidooms/MovieTweetings/tree/master/recsyschallenge2014).  You can read more about this project and the dataset from the [publication here](http://crowdrec2013.noahlab.com.hk/papers/crowdrec2013_Dooms.pdf).

In [1]:
# load libraries

import numpy as np
import pandas as pd
import datetime

import EDA_functions as EDA
import cleaning_functions as cleaning

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()   # sns.set_style('whitegrid')
%matplotlib inline  

pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1) 

In [2]:
# import datasets

url_movies = 'https://raw.githubusercontent.com/sidooms/MovieTweetings/master/latest/movies.dat'
url_reviews = 'https://raw.githubusercontent.com/sidooms/MovieTweetings/master/latest/ratings.dat'

movies = pd.read_csv(url_movies, delimiter='::', header=None, 
                     names=['movie_id', 'movie', 'genre'], 
                     dtype={'movie_id': object}, encoding="utf8", engine='python')
reviews = pd.read_csv(url_reviews, delimiter='::', header=None, 
                      names=['user_id', 'movie_id', 'rating', 'timestamp'], 
                      dtype={'movie_id': object, 'user_id': object, 
                      'timestamp': object}, encoding="utf8", engine='python')

### Check Data Structure

In [3]:
movies.head()

Unnamed: 0,movie_id,movie,genre
0,8,Edison Kinetoscopic Record of a Sneeze (1894),Documentary|Short
1,10,La sortie des usines Lumière (1895),Documentary|Short
2,12,The Arrival of a Train (1896),Documentary|Short
3,25,The Oxford and Cambridge University Boat Race (1895),
4,91,Le manoir du diable (1896),Short|Horror


In [4]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32403 entries, 0 to 32402
Data columns (total 3 columns):
movie_id    32403 non-null object
movie       32403 non-null object
genre       32170 non-null object
dtypes: object(3)
memory usage: 759.5+ KB


In [5]:
# check for NaN

cleaning.list_NaN(movies)

Number of NaN per column:
genre: 233 (0.01%)


In [6]:
# check for duplicates

cleaning.list_duplicates(movies)

Number of column-wise duplicates per column:
movie: 48 unique duplicate values (99 total duplicates)
genre: 726 unique duplicate values (31923 total duplicates)


In [7]:
reviews.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,68646,10,1381620027
1,1,113277,10,1379466669
2,2,422720,8,1412178746
3,2,454876,8,1394818630
4,2,790636,7,1389963947


In [8]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 744556 entries, 0 to 744555
Data columns (total 4 columns):
user_id      744556 non-null object
movie_id     744556 non-null object
rating       744556 non-null int64
timestamp    744556 non-null object
dtypes: int64(1), object(3)
memory usage: 22.7+ MB


In [9]:
# check for NaN

cleaning.list_NaN(reviews)

Number of NaN per column:


In [10]:
# check for duplicates

cleaning.list_duplicates(movies)

Number of column-wise duplicates per column:
movie: 48 unique duplicate values (99 total duplicates)
genre: 726 unique duplicate values (31923 total duplicates)


### To Do:

#### Movies
* Pull the date from the title and create new column 'date'
* Dummy the date column with 1's and 0's for each century of a movie (1800's, 1900's, and 2000's)
* Dummy the genre column with 1's and 0's for each genre (split them)

#### Reviews
* Create a date out of time stamp in format '%Y-%m-%d %H:%M:%S'


## Clean Data

### Movies

In [11]:
# pull date from the title if it exists and create new column 'date'

#define and apply lambda function
create_date = lambda val: val[-5:-1] if val[-1] == ')' else np.nan
movies['date'] = movies['movie'].apply(create_date)

In [12]:
#check results

movies.head()

Unnamed: 0,movie_id,movie,genre,date
0,8,Edison Kinetoscopic Record of a Sneeze (1894),Documentary|Short,1894
1,10,La sortie des usines Lumière (1895),Documentary|Short,1895
2,12,The Arrival of a Train (1896),Documentary|Short,1896
3,25,The Oxford and Cambridge University Boat Race (1895),,1895
4,91,Le manoir du diable (1896),Short|Horror,1896


In [13]:
# Return century of movie as a dummy column

# define function
def add_movie_year(val):
    if val[:2] == yr:
        return 1
    else:
        return 0
        
# apply function
for yr in ['18', '19', '20']:
    movies[str(yr) + "00's"] = movies['date'].apply(add_movie_year)

In [14]:
#check results

movies.head()

Unnamed: 0,movie_id,movie,genre,date,1800's,1900's,2000's
0,8,Edison Kinetoscopic Record of a Sneeze (1894),Documentary|Short,1894,1,0,0
1,10,La sortie des usines Lumière (1895),Documentary|Short,1895,1,0,0
2,12,The Arrival of a Train (1896),Documentary|Short,1896,1,0,0
3,25,The Oxford and Cambridge University Boat Race (1895),,1895,1,0,0
4,91,Le manoir du diable (1896),Short|Horror,1896,1,0,0


In [15]:
# split genre column and return values for columns

# create set of unique genres
# the try, error block handles NaN values
genres = []
for row in movies['genre']:
    try:
        genres.extend(row.split("|"))
    
    except AttributeError:
        pass
        
genres = set(genres)
print("Genres: ", genres)
len(genres)

Genres:  {'Fantasy', 'Adult', 'Mystery', 'Horror', 'Western', 'Crime', 'History', 'Sci-Fi', 'Musical', 'Short', 'Film-Noir', 'Thriller', 'Music', 'Adventure', 'Game-Show', 'Documentary', 'Animation', 'Talk-Show', 'Comedy', 'Drama', 'Reality-TV', 'Family', 'Sport', 'War', 'Romance', 'News', 'Biography', 'Action'}


28

In [16]:
# define function to split genre column an return values for genre
def split_genres(row):
    try:
        if row.find(gen) >-1:  # str.find() looks for string within string and returns index if found and -1 otherwise.
            return 1
        else:
            return 0
    except AttributeError:
        return 0

# apply function for each genre
for gen in genres:        
    movies[gen] = movies['genre'].apply(split_genres)

In [17]:
# check results

movies.head() 

Unnamed: 0,movie_id,movie,genre,date,1800's,1900's,2000's,Fantasy,Adult,Mystery,Horror,Western,Crime,History,Sci-Fi,Musical,Short,Film-Noir,Thriller,Music,Adventure,Game-Show,Documentary,Animation,Talk-Show,Comedy,Drama,Reality-TV,Family,Sport,War,Romance,News,Biography,Action
0,8,Edison Kinetoscopic Record of a Sneeze (1894),Documentary|Short,1894,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,10,La sortie des usines Lumière (1895),Documentary|Short,1895,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,12,The Arrival of a Train (1896),Documentary|Short,1896,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,25,The Oxford and Cambridge University Boat Race (1895),,1895,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,91,Le manoir du diable (1896),Short|Horror,1896,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Reviews

In [21]:
# change timestamp to date, drop timestamp column

change_timestamp = lambda val: datetime.datetime.fromtimestamp(int(val)).strftime('%Y-%m-%d %H:%M:%S')
reviews['date'] = reviews['timestamp'].apply(change_timestamp)
reviews.drop('timestamp', axis=1, inplace=True)

In [22]:
# check results

reviews.head()

Unnamed: 0,user_id,movie_id,rating,date
0,1,68646,10,2013-10-13 01:20:27
1,1,113277,10,2013-09-18 03:11:09
2,2,422720,8,2014-10-01 17:52:26
3,2,454876,8,2014-03-14 18:37:10
4,2,790636,7,2014-01-17 14:05:47


---