In [22]:
import pandas as pd

# To shift lists
from collections import deque

# To create interactive plots
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
# Initialise plotly's offline notebook mode for the iplot(...) calls in this notebook.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook — confirm against the plotly documentation.
init_notebook_mode(connected=True)

## Movie Data

In [4]:
# Read the movie catalogue. No row of the CSV is treated as a header, so the
# three columns are named explicitly; the movie id then becomes the index.
movie_titles = pd.read_csv(
    'D:/스파르타 코딩클럽/netflix_data/movie_titles.csv',
    header=None,
    names=['Id', 'Year', 'Name'],
    encoding='ISO-8859-1',
)
movie_titles = movie_titles.set_index('Id')

# Report the frame size, then preview five randomly chosen rows
print('Shape Movie-Titles:\t{}'.format(movie_titles.shape))
movie_titles.sample(5)

Shape Movie-Titles:	(17770, 2)


Unnamed: 0_level_0,Year,Name
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
12250,1953.0,The Band Wagon
9949,1997.0,Jackie Chan's First Strike
15879,1998.0,Joni Mitchell: Painting with Words and Music
1054,1986.0,Odin: Photon Space Sailer Starlight
4024,1995.0,The Mystery of Rampo


In [14]:
# Order the catalogue from the latest to the earliest release year,
# then preview the first five rows (head() defaults to 5).
sort_movie = movie_titles.sort_values('Year', ascending=False)
sort_movie.head()

Unnamed: 0_level_0,Year,Name
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
12367,2005.0,Rize
5994,2005.0,LeapFrog: Learn to Read at the Storybook Factory
14659,2005.0,Pokemon: Destiny Deoxys
5892,2005.0,WWE: Wrestlemania 21
9747,2005.0,Warm Springs


In [23]:
# Number of titles per release year, ordered by year
data = movie_titles['Year'].value_counts().sort_index()

# Scatter trace: release year on the x axis, title count on the y axis
trace = go.Scatter(
    x=data.index,
    y=data.values,
    marker={'color': '#D941C5'},
)

# The figure title embeds the total number of rows in movie_titles
layout = {
    'title': '영화 개봉년도 ({})'.format(len(movie_titles)),
    'xaxis': {'title': 'Release Year'},
    'yaxis': {'title': 'Movies'},
}

# Build the figure and render it inline
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [29]:
# Count the missing values in each column of movie_titles.
# NOTE(review): the execution counts (In [29] here, In [28] on the dropna cell)
# suggest this check ran after the NaN rows were already dropped — confirm on a fresh run.
movie_titles.isna().sum(axis=0)

Year    0
Name    0
dtype: int64

In [28]:
# Discard every row that contains at least one missing value
# (dropna() defaults to how='any' on rows).
movie_titles = movie_titles.dropna()

## Rating Data

In [9]:
# Load single data-file.
# As the parsing below assumes, the file interleaves two kinds of line:
# a movie header ("<movie id>:" with no rating/date) followed by that movie's
# "<user>,<rating>,<date>" rows.
df_raw = pd.read_csv('D:/스파르타 코딩클럽/netflix_data/combined_data_1.txt',
                     header=None,
                     names=['User', 'Rating', 'Date'],
                     usecols=[0, 1, 2])

# Rows without a rating are the per-movie header rows
is_header = df_raw['Rating'].isna()
if not is_header.any():
    raise ValueError('No movie header rows found in the ratings file')

# Parse "<movie id>:" -> integer id, on the header rows only
header_ids = df_raw.loc[is_header, 'User'].str[:-1].astype('int64')

# Spread each header's id over the rows that follow it (up to the next header).
# This replaces slicing the frame once per movie and concatenating the pieces.
movie_per_row = header_ids.reindex(df_raw.index).ffill()

# Keep rating rows only; rows that precede the first header belong to no movie
# and are left out.
is_rating = ~is_header & movie_per_row.notna()
df = df_raw.loc[is_rating].copy()

# Create movie_id column (int64 on every platform)
df['Movie'] = movie_per_row.loc[is_rating].astype('int64')

# Release the large intermediates so they do not linger in kernel memory
del df_raw, is_header, header_ids, movie_per_row, is_rating
print('Shape User-Ratings:\t{}'.format(df.shape))
df.sample(5)

Shape User-Ratings:	(24053764, 4)


Unnamed: 0,User,Rating,Date,Movie
9148367,530789,4.0,2004-12-14,1803
18178020,2297673,2.0,2005-04-08,3457
19576194,2556957,4.0,2005-05-18,3730
2315009,2321082,5.0,2004-03-30,443
23229915,1186032,4.0,2003-08-19,4356


In [24]:
# Frequency of each rating value, highest rating first
data = df['Rating'].value_counts().sort_index(ascending=False)

# Bar trace; every bar is labelled with its share of all ratings in percent
trace = go.Bar(
    x=data.index,
    y=data.values,
    text=['{:.1f} %'.format(pct) for pct in (data.values / len(df) * 100)],
    textposition='auto',
    textfont={'color': '#FFFFFF'},
    marker={'color': '#D941C5'},
)

# The figure title embeds the total number of ratings
layout = {
    'title': '평점 분포 ({})'.format(len(df)),
    'xaxis': {'title': 'Rating'},
    'yaxis': {'title': 'Count'},
}

# Build the figure and render it inline
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

## MongoDB에 저장

In [31]:
# Serialise the movie table to JSON and persist it to disk.
# DataFrame.to_json returns None when it is handed a path, which would leave
# json_movie as None; build the string first so json_movie holds the JSON
# text, then write that same text to the file.
json_movie = movie_titles.to_json()
with open('movies_title.json', 'w', encoding='utf-8', newline='') as json_file:
    json_file.write(json_movie)