# Case Study: Movie Data Analysis

This notebook uses a dataset from the MovieLens website.

* Data Source: MovieLens web site (filename: ml-20m.zip)
* Location: https://grouplens.org/datasets/movielens


First, let's explore the folder to see what files we have in the dataset.

In [None]:
!ls -la ./movielens

# Use Pandas to read data

In this notebook, we will be using three CSV files:

* ratings.csv: userId, movieId, rating, timestamp
* tags.csv: userId, movieId, tag, timestamp
* movies.csv: movieId, title, genres

In [None]:
import pandas as pd

In [None]:
# Load the movie catalogue (movieId, title, genres); comma is read_csv's default separator.
movie_data = pd.read_csv('./movielens/movies.csv')

In [None]:
movie_data.head()

In [None]:
# Timestamps represent seconds since midnight Coordinated Universal Time (UTC)
# of January 1, 1970 (Unix epoch), as described in the MovieLens README.
tags = pd.read_csv('./movielens/tags.csv')
tags.head()

In [None]:
# Load the user ratings (userId, movieId, rating, timestamp); comma is the default separator.
ratings = pd.read_csv('./movielens/ratings.csv')
ratings.head()

### We do not need the timestamp column right now, but we will get back to it later on.

In [None]:
# Drop the timestamp columns. errors='ignore' keeps this cell re-runnable:
# `del frame['timestamp']` raises KeyError once the column is already gone.
ratings = ratings.drop(columns='timestamp', errors='ignore')
tags = tags.drop(columns='timestamp', errors='ignore')

# Descriptive Statistics

In [None]:
ratings['rating'].describe()

In [None]:
ratings['rating'].mean()

In [None]:
ratings['rating'].min(), ratings['rating'].max(), ratings['rating'].std()

In [None]:
ratings['rating'].mode()

## Quick sanity check

Here we verify that the ratings are valid by checking that no rating is greater than 5 and that every rating is greater than 0. This is unnecessary, since we already know the min and max rating values, but it shows an alternative way to check.

In [None]:
filter_1 = ratings['rating'] > 5

In [None]:
filter_1.any()

In [None]:
filter_2 = ratings['rating'] > 0

In [None]:
filter_2.all()

# Data Cleaning: Handling missing data

In [None]:
movie_data.shape

In [None]:
movie_data.isnull().any()

In [None]:
ratings.shape

In [None]:
ratings.isnull().any()

In [None]:
tags.shape

In [None]:
tags.isnull().any()

In [None]:
null_ = tags['tag'].isnull()
null_

In [None]:
null_tags = tags[null_]
null_tags

In [None]:
# Show the movies whose tag is missing, matched on the movieId column.
# Passing the ids as `index=` to pd.DataFrame would match them against the
# row labels of movie_data instead of against movieId, returning unrelated
# (or all-NaN) rows.
null_tag_movies = null_tags['movieId']
movie_data[movie_data['movieId'].isin(null_tag_movies)]

### Since we have null values for tags, let's drop them to have a cleaner dataset

In [None]:
tags = tags.dropna()

In [None]:
tags.isnull().any()

In [None]:
tags.shape

### Extract year from title, e.g. (1995)

In [None]:
# Capture only a four-digit year in parentheses at the end of the title, so
# other parenthesised text (alternate titles, notes) is never stored as a
# year; titles without such a suffix get NaN.
# The raw string avoids the invalid "\(" escape-sequence warning, and
# expand=False returns a Series, which is what a single column expects.
movie_data['year'] = movie_data['title'].str.extract(r"\((\d{4})\)\s*$", expand=False)

In [None]:
movie_data.tail()

# Merge Data - average movie ratings over time


In [None]:
# Mean rating per movie; as_index=False keeps movieId as a regular column.
average_rating = (
    ratings
    .groupby('movieId', as_index=False)['rating']
    .mean()
)
average_rating.tail()

In [None]:
# Inner join on movieId: only movies present in both frames are kept.
joined = pd.merge(movie_data, average_rating, how='inner', on='movieId')
joined.head()

In [None]:
joined.head()

## Specify Data to be used in Twitter API request

In [None]:
# Movies rated above 4 on average and released after 2010.
# `year` holds text extracted from the title, so compare it numerically:
# a string comparison against '2010' also lets through any non-numeric text
# that happens to sort after '2010'. Values that are not numbers become NaN
# and therefore fail the comparison.
rating_mask = joined.rating > 4
year_mask = pd.to_numeric(joined.year, errors='coerce') > 2010

In [None]:
# Keep the rows passing both masks and renumber them from 0.
final_data = joined.loc[rating_mask & year_mask].reset_index(drop=True)

In [None]:
import numpy as np
# Keep only the text before the first "(" (this removes the year and any
# parenthesised alternate title) and trim the space left before it.
# Vectorised .str methods replace the row-by-row loop, which wrote through
# .loc using the tuple positions produced by np.ndenumerate.
final_data['title'] = final_data.title.str.split("(").str[0].str.rstrip()
final_data


In [None]:
import numpy as np
# Skip titles that contain ", The", then renumber the rows from 0.
# regex=False: ", The" is meant as a literal substring.
has_trailing_article = final_data.title.str.contains(", The", regex=False)
final_data = final_data[~has_trailing_article].reset_index(drop=True)
# Build the hashtag by removing the spaces from the title. Assigning the whole
# column at once also creates `hashtags` when no rows are left, which the
# row-by-row loop did not do.
final_data['hashtags'] = "#" + final_data.title.str.replace(" ", "", regex=False)

In [None]:
final_data

# Twitter API access

In [None]:
# Dependencies

import os
import tweepy as tw
import json
from pprint import pprint
import config

In [None]:
# Build the tweepy client from the four credentials exposed by the imported
# `config` module (nothing is hard-coded in the notebook).
auth = tw.OAuthHandler(config.consumer_key, config.consumer_secret)
auth.set_access_token(config.access_token, config.access_token_secret)
# `api` is the module-level client used by twitter_pull below.
# NOTE(review): wait_on_rate_limit=True presumably makes tweepy pause when the
# rate limit is hit instead of raising; confirm against the installed version.
api = tw.API(auth, wait_on_rate_limit=True)

In [None]:
# if not os.path.exists('secret_twitter_credentials.pkl'):
#     Twitter={}
#     Twitter['Consumer Key'] = config.consumer_key
#     Twitter['Consumer Secret'] = config.consumer_secret
#     Twitter['Access Token'] = config.access_token
#     Twitter['Access Token Secret'] = config.access_token_secret
#     with open('secret_twitter_credentials.pkl','wb') as f:
#         pickle.dump(Twitter, f)
# else:
#     Twitter=pickle.load(open('secret_twitter_credentials.pkl','rb'))

In [None]:
# import twitter

# auth = twitter.oauth.OAuth(Twitter['Access Token'],
#                            Twitter['Access Token Secret'],
#                            Twitter['Consumer Key'],
#                            Twitter['Consumer Secret'])

# twitter_api = twitter.Twitter(auth=auth)

### Where On Earth ID (WOEID) number: this helps to find a location based on its ID

This was not needed, since we found a different way to pull data from Twitter, but it is still nice to have for future reference.


In [None]:
# WORLD_WOE_ID = 1
# US_WOE_ID = 23424977
# LOCAL_WOE_ID=2357024 # Atlanta WOEID

In [None]:
# world_trends = twitter_api.trends.place(_id=WORLD_WOE_ID)
# us_trends = twitter_api.trends.place(_id=US_WOE_ID)
# local_trends = twitter_api.trends.place(_id=LOCAL_WOE_ID)

In [None]:
def twitter_pull(hash):
    """Fetch at most one English-language tweet matching ``hash``.

    Parameters
    ----------
    hash : str
        Search query passed to the search endpoint as ``q``; the notebook
        passes hashtags such as ``"#SomeTitle"``. The name shadows the
        built-in ``hash`` and is kept only so existing calls keep working.

    Returns
    -------
    dict
        ``{"hashtag", "user_name", "tweet", "location"}`` for the first
        result, or an empty dict when the search yields nothing.
    """
    # Uses the module-level `api` client; items(1) limits the cursor to one result.
    matches = tw.Cursor(api.search, q=hash, lang="en").items(1)
    for tweet in matches:
        return {
            "hashtag": hash,
            "user_name": tweet.user.screen_name,
            "tweet": tweet.text,
            "location": tweet.user.location,
        }
    # No result for this query.
    return {}
    

In [None]:
# One lookup per hashtag, in row order. The comprehension keeps its loop
# variable local; the previous loop variable was named `hash`, which replaced
# the built-in hash() in the notebook namespace once this cell had run.
twits = [twitter_pull(tag) for tag in final_data.hashtags]

In [None]:
# Print every lookup result on its own line; an empty dict means the search
# returned nothing for that hashtag.
for pulled in twits:
    print(pulled)

In [None]:
# Copy each lookup result into the row with the same position. Rows whose
# lookup came back empty get the placeholder string "Nan" in all three columns.
for row_label, pulled in enumerate(twits):
    for column in ('user_name', 'tweet', 'location'):
        final_data.loc[row_label, column] = pulled[column] if pulled else "Nan"
    

In [None]:
# Remove the rows that carry the "Nan" placeholder. Test for equality: a
# substring match would also discard real tweets that merely contain "Nan"
# (for example "Nancy").
final_data = final_data[final_data.tweet != "Nan"]

In [None]:
final_data

In [None]:
final_data