# Case Study: Movie Data Analysis

In [None]:
!ls -la ./movielens

In [None]:
!cat ./movielens/movies.csv

In [None]:
!cat ./movielens/movies.csv | wc -l

In [None]:
!head -5 ./movielens/movies.csv

In [None]:
!head -5 ./movielens/ratings.csv

# Use Pandas to read Movies

In [None]:
import pandas as pd

In [None]:
# Load the movie catalogue; ',' is already read_csv's default separator.
movie_data = pd.read_csv('./movielens/movies.csv')

In [None]:
# Preview the first rows of the movie table.
movie_data.head()

In [None]:
# Load the tags table from CSV and preview its first rows.
tags = pd.read_csv('./movielens/tags.csv')
tags.head()

In [None]:
# Load the ratings table from CSV and preview its first rows.
ratings = pd.read_csv('./movielens/ratings.csv')
ratings.head()

In [None]:
# Remove the 'timestamp' column from both frames, modifying them in place.
ratings.drop(columns='timestamp', inplace=True)
tags.drop(columns='timestamp', inplace=True)

# Data Structures

### Series

In [None]:
# Take the first row of tags by position; a single row comes back as a Series.
row_0 = tags.iloc[0]
row_0

In [None]:
# Look up one field of the row by its label.
row_0['userId']

In [None]:
# `in` on a Series checks its index labels, not its values.
'rating' in row_0

In [None]:
# Inspect the name attribute of the row Series.
row_0.name

In [None]:
# Give the Series a custom name.
row_0.name = 'first_row'

In [None]:
# Display the row again, now carrying the name assigned above.
row_0

# DataFrames

In [None]:
# Preview the first rows of tags.
tags.head()

In [None]:
# Row labels of the frame.
tags.index

In [None]:
# Column labels of the frame.
tags.columns

In [None]:
# Pick three rows by integer position (not by label).
tags.iloc[[0, 11, 2000]]

# Descriptive Statistics

In [None]:
# Summary statistics of the 'rating' column.
ratings['rating'].describe()

In [None]:
# Arithmetic mean of all ratings.
ratings['rating'].mean()

In [None]:
# Minimum, maximum and standard deviation of the ratings, shown as one tuple.
ratings['rating'].min(), ratings['rating'].max(), ratings['rating'].std()

In [None]:
# Most frequent rating value(s); mode() returns a Series because ties are possible.
ratings['rating'].mode()

In [None]:
# Pairwise correlation between the columns of ratings.
ratings.corr()

In [None]:
# Boolean mask: True where a rating is above 5.
filter_1 = ratings['rating'] > 5

In [None]:
# True if at least one rating is above 5.
filter_1.any()

In [None]:
# Boolean mask: True where a rating is above 0.
filter_2 = ratings['rating'] > 0

In [None]:
# True only if every rating is above 0.
filter_2.all()

# Data Cleaning: Handling missing data

In [None]:
# (rows, columns) of the movie table.
movie_data.shape

In [None]:
# Per column: does it contain at least one missing value?
movie_data.isnull().any()

In [None]:
# (rows, columns) of the ratings table.
ratings.shape

In [None]:
# Per column: does it contain at least one missing value?
ratings.isnull().any()

In [None]:
# (rows, columns) of the tags table before missing rows are dropped.
tags.shape

In [None]:
# Per column: does it contain at least one missing value?
tags.isnull().any()

In [None]:
# Boolean mask: True where the 'tag' value is missing.
null_ = tags['tag'].isna()
null_

In [None]:
# Keep only the rows whose tag is missing.
null_tags = tags.loc[null_]
null_tags

In [None]:
# Look up the movies whose tag rows have a missing tag.
# movie_data was read without an index column, so its row labels are plain
# positions, not movieIds. The ids therefore have to be matched against the
# 'movieId' column; re-indexing the frame with them would return unrelated
# rows (or all-NaN rows for ids beyond the row count).
null_tag_movies = null_tags['movieId']
movie_data[movie_data['movieId'].isin(null_tag_movies)]

In [None]:
# Drop every row that has at least one missing value.
tags = tags.dropna()

In [None]:
# Re-check for missing values after dropping incomplete rows.
tags.isnull().any()

In [None]:
# (rows, columns) after dropping incomplete rows.
tags.shape

# Data Visualization

In [None]:
%matplotlib inline

# Histogram of the 'rating' column.
ratings.hist(column='rating', figsize=(7.5, 5))


In [None]:
# Box plot of the 'rating' column.
ratings.boxplot(column='rating', figsize=(7.5, 5))

# Slicing out columns

In [None]:
# Select a single column (a Series) and preview it.
tags['tag'].head()

In [None]:
# Select several columns with a list of names; the result is a DataFrame.
movie_data[['title', 'genres']].head()

In [None]:
# Count how often each tag occurs (most frequent first) and show the top ten.
tag_counts = tags['tag'].value_counts()
tag_counts.head(10)

In [None]:
# First ten rows of ratings, by position.
ratings.head(10)

In [None]:
# Bar chart of the ten most frequent tags.
tag_counts.head(10).plot.bar(figsize=(7.5, 5))

# Filters for selected rows

In [None]:
# Boolean mask: True where an individual rating is 4.0 or higher.
is_highly_rated = ratings['rating'] >= 4.0

In [None]:
# Last five of the rows selected by the mask.
ratings.loc[is_highly_rated].tail(5)

In [None]:
# Boolean mask: True where the genres string contains 'Animation'.
is_animation = movie_data['genres'].str.contains('Animation')

In [None]:
# Preview the movies selected by the animation mask.
movie_data[is_animation].head()

# Group by and Aggregate

In [None]:
# Number of ratings given at each rating value (counted via the movieId column).
ratings_count = ratings.groupby('rating')[['movieId']].count()
ratings_count

In [None]:
# Mean rating per movie, indexed by movieId.
average_rating = ratings.groupby('movieId')[['rating']].mean()
average_rating.head()

In [None]:
# Number of ratings each movie received, indexed by movieId.
movie_count = ratings.groupby('movieId')[['rating']].count()
movie_count.head()

# Merge DataFrames

In [None]:
# Preview tags before merging.
tags.head()

In [None]:
# Preview the movie table before merging.
movie_data.head()

In [None]:
# Inner join (pd.merge's default) of movies and tags on their shared movieId.
merged_movie = pd.merge(movie_data, tags, on='movieId')
merged_movie

# Combine aggregation, merging and filters to get useful analytics

In [None]:
# Mean rating per movie, with movieId kept as a regular column.
# as_index must be the boolean False: the string 'False' is truthy, so it
# silently left movieId as the index. Selecting only the two needed columns
# first also removes the need to delete an averaged userId column afterwards.
avg_ratings = ratings[['movieId', 'rating']].groupby('movieId', as_index=False).mean()
avg_ratings.head()

In [None]:
# Attach each movie's mean rating; movies without any rating drop out of the inner join.
box_office = pd.merge(movie_data, avg_ratings, how='inner', on='movieId')
box_office.head()

In [None]:
# Mask of movies whose mean rating is at least 4.0, then the last five matches.
is_highly_rated = box_office['rating'].ge(4.0)
box_office.loc[is_highly_rated].tail(5)

In [None]:
# Mask of movies whose genres string contains 'Comedy', then the first five matches.
is_comedy = box_office['genres'].str.contains('Comedy')
box_office.loc[is_comedy].head(5)

In [None]:
# Combine both masks element-wise and show the last five matching movies.
box_office.loc[is_comedy & is_highly_rated].tail(5)

# Vectorized String Operations

In [None]:
# Preview the movie table before the string operations below.
movie_data.head()

### Split 'genres' into multiple columns

In [None]:
# Split the '|'-separated genres string; expand=True puts each piece in its own column.
movie_genres = movie_data['genres'].str.split('|', expand=True)

In [None]:
# First ten rows of the split genres.
movie_genres[:10]

### Add a new column for comedy genre flag

In [None]:
# Flag column: True where the original genres string contains 'Comedy'.
movie_genres['isComedy'] = movie_data['genres'].str.contains('Comedy')

In [None]:
# First ten rows again, now including the isComedy flag.
movie_genres[:10]

### Extract the year from the title, e.g. (1995)

In [None]:
# Extract the release year from titles such as "Toy Story (1995)".
# - Raw string: "\(" is not a valid escape in a normal string literal and
#   triggers a warning on recent Python versions.
# - \d{4} restricts the capture to a four-digit year, so other parenthesised
#   text (e.g. an alternative title) is never stored as the year.
# - The leading greedy ".*" keeps the last "(dddd)" group when a title has several.
# - expand=False returns a Series, which is what a single column assignment needs.
# Titles without a "(dddd)" group get NaN. Years are kept as strings.
movie_data['year'] = movie_data['title'].str.extract(r".*\((\d{4})\)", expand=False)

In [None]:
# Show the last rows, including the new 'year' column.
movie_data.tail()

# Parsing Timestamps

In [None]:
# Reload tags from disk so the 'timestamp' column is available again.
tags = pd.read_csv('./movielens/tags.csv')

In [None]:
# Data type of each column.
tags.dtypes

### Unix time (also called POSIX time or epoch time) records time as the number of seconds since midnight Coordinated Universal Time (UTC) on January 1, 1970

In [None]:
# Preview tags with the raw 'timestamp' column.
tags.head()

In [None]:
# Convert 'timestamp' to datetimes, reading the numbers as seconds (unit='s') since the epoch.
tags['parsed_time'] = pd.to_datetime(tags['timestamp'], unit='s')

In [None]:
# Preview tags with the new 'parsed_time' column.
tags.head()

In [None]:
# Data type of the parsed column.
tags['parsed_time'].dtypes

### Selecting rows based on timestamps

In [None]:
# Mask of tags created after 1 Feb 2015; pandas parses the date string for the comparison.
greater_than_t = tags['parsed_time'] > '2015-02-01'
# Keep only the rows selected by the mask.
selected_rows = tags.loc[greater_than_t]

In [None]:
# Preview the rows selected by date.
selected_rows.head()

In [None]:
# Compare the size of the full table with the date-filtered subset.
tags.shape, selected_rows.shape

### Sorting data using timestamps

In [None]:
# Ten earliest tags by parsed time (ascending sort is the default).
tags.sort_values('parsed_time').head(10)

In [None]:
# Ten earliest tags within the date-filtered subset.
selected_rows.sort_values('parsed_time').head(10)

# Average movie ratings over time

### Are movie ratings related to the year of release?

In [None]:
# Mean rating per movie, with movieId kept as a regular column (as_index=False).
average_rating = ratings.groupby('movieId', as_index=False)['rating'].mean()
average_rating.tail()

In [None]:
# Join each movie with its mean rating.
joined = movie_data.merge(average_rating, on='movieId', how='inner')
# Correlate the numeric columns only. 'title' and 'genres' are text and make
# corr() fail on recent pandas versions. 'year' is stored as a string, so it
# is converted to a number first (unparseable values become NaN); otherwise
# it would be left out and the correlation would say nothing about release year.
# The former mid-cell `joined.head()` was removed: only the last expression of
# a cell is displayed, so it had no effect.
joined.assign(
    year=pd.to_numeric(joined['year'], errors='coerce')
).select_dtypes(include='number').corr()

In [None]:
# Preview the joined movie/rating table.
joined.head()

In [None]:
# Average of the per-movie mean ratings for each year value; show the first ten groups.
yearly_average = joined.groupby('year', as_index=False)['rating'].mean()
yearly_average.head(10)

In [None]:
# Small demo frame: two float columns sharing the row labels 'apple' and 'orange'.
d = {
    label: pd.Series(values, index=['apple', 'orange'])
    for label, values in (('one', [100., 200.]), ('two', [111., 211]))
}
df = pd.DataFrame(data=d)

In [None]:
# Display the demo frame.
df

In [None]:
# Mask of movies whose mean rating is above 4.
rating_mask = joined.rating > 4
# Mask of movies released after 2010. 'year' holds strings, and comparing them
# with '2010' is lexicographic, so any non-year text starting with a letter
# would pass the filter. Compare numerically instead; values that are not a
# number become NaN and compare as False.
year_mask = pd.to_numeric(joined.year, errors='coerce') > 2010

In [None]:
# Keep rows passing both masks; reset_index() renumbers rows from 0 and keeps
# the old labels in an 'index' column.
test_joined = joined[rating_mask & year_mask].reset_index()
test_joined

In [None]:
import numpy as np
# Kept for compatibility: cells outside this view may still read `titles`.
titles = np.array(test_joined.title)
# Build one hashtag per title: the text before the first "(" with spaces
# removed, prefixed with "#". Vectorised string methods replace the former
# per-row loop, which wrote each cell through .loc using the 1-tuple indices
# produced by np.ndenumerate and relied on the row labels being 0..n-1.
test_joined['hashtags'] = "#" + (
    test_joined['title']
    .str.split("(")
    .str[0]
    .str.replace(" ", "", regex=False)
)
    

In [None]:
# Display the filtered table, now with the 'hashtags' column.
test_joined

In [None]:
# Sample title used to demonstrate the string split in the next cell.
movie_str = "Toy Story (1995)"

In [None]:
# Text before the first "(" (the trailing space is kept).
movie_str.partition("(")[0]

# Twitter API access

In [None]:
# Dependencies

import pickle
import os
from pprint import pprint
import config

In [None]:
# Cache the Twitter API credentials in a local pickle file: on the first run
# they are copied from the `config` module and written to disk; on later runs
# they are read back from that file.
if not os.path.exists('secret_twitter_credentials.pkl'):
    Twitter = {
        'Consumer Key': config.consumer_key,
        'Consumer Secret': config.consumer_secret,
        'Access Token': config.access_token,
        'Access Token Secret': config.access_token_secret,
    }
    with open('secret_twitter_credentials.pkl', 'wb') as f:
        pickle.dump(Twitter, f)
else:
    # Use a context manager so the file handle is closed after loading; the
    # previous bare open() call left it open.
    # NOTE(review): pickle.load can execute arbitrary code, so this file must
    # only ever be one this notebook wrote itself.
    with open('secret_twitter_credentials.pkl', 'rb') as f:
        Twitter = pickle.load(f)

In [None]:
import twitter

# Build the OAuth handler from the stored credentials and create the API client.
# NOTE(review): the arguments are positional, passed here in the order
# (access token, access token secret, consumer key, consumer secret) — confirm
# this matches the OAuth signature of the installed `twitter` package.
auth = twitter.oauth.OAuth(Twitter['Access Token'],
                           Twitter['Access Token Secret'],
                           Twitter['Consumer Key'],
                           Twitter['Consumer Secret'])

twitter_api = twitter.Twitter(auth=auth)

### Where On Earth ID (WOEID): a numeric identifier used to look up a location


In [None]:
# Where On Earth IDs for the three trend scopes queried below.
WORLD_WOE_ID, US_WOE_ID = 1, 23424977
LOCAL_WOE_ID = 2357024  # Atlanta WOEID

In [None]:
# Call the client's trends.place endpoint once per WOEID defined above.
# NOTE(review): presumably each call returns the trending topics for that
# location — confirm the response shape against the twitter package docs.
world_trends = twitter_api.trends.place(_id=WORLD_WOE_ID)
us_trends = twitter_api.trends.place(_id=US_WOE_ID)
local_trends = twitter_api.trends.place(_id=LOCAL_WOE_ID)

In [None]:
for hashtag in test_joined.hashtags:
    topic = hashtag
    number=1
    search_results = twitter_api.search.tweets(q=topic, count=number)
    statuses = search_results['statuses']