## Project 6

In [1]:
import os
import subprocess
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
import scipy
import requests
from imdbpie import Imdb
import nltk
from nltk.tokenize import RegexpTokenizer
import matplotlib.pyplot as plt
import urllib
from bs4 import BeautifulSoup
import collections
from collections import Counter
import re
import csv
import psycopg2

%matplotlib inline

### Pre-Work: Write a problem statement 

## Part 1: Acquire the Data

#### 1. Connect to the IMDB API

In [2]:
imdb = Imdb()
imdb = Imdb(anonymize=True)

#### 2. Query the top 250 rated movies in the database

In [3]:
top250 = imdb.top_250()

In [4]:
top250 = pd.DataFrame(top250)

#### 4. Write the Results to a csv

In [5]:
top250.head()

Unnamed: 0,can_rate,image,num_votes,rating,tconst,title,type,year
0,True,{u'url': u'http://ia.media-imdb.com/images/M/M...,1677125,9.3,tt0111161,The Shawshank Redemption,feature,1994
1,True,{u'url': u'http://ia.media-imdb.com/images/M/M...,1147488,9.2,tt0068646,The Godfather,feature,1972
2,True,{u'url': u'http://ia.media-imdb.com/images/M/M...,784568,9.0,tt0071562,The Godfather: Part II,feature,1974
3,True,{u'url': u'http://ia.media-imdb.com/images/M/M...,1662425,9.0,tt0468569,The Dark Knight,feature,2008
4,True,{u'url': u'http://ia.media-imdb.com/images/M/M...,858289,8.9,tt0108052,Schindler's List,feature,1993


In [6]:
top250.drop(['can_rate', 'image', 'type'], axis =1 ,inplace=True)

## Part 2: Wrangle the text data

#### 2. Scrape the reviews for the top 25 movies

*Hint*: Use a loop to scrape each page at once

In [7]:
movieID = top250.tconst.values.tolist()

In [8]:
reviews = []
ids = []
for x in movieID:
    review = imdb.get_title_reviews(x, max_results=15)
    for i in review:
        ids.append(x)
        reviews.append(i.text)

#### 3. Work through each title and find the most common descriptors

In [9]:
reviewData = pd.DataFrame({"movieID": ids, "reviews": reviews})

In [35]:
reviewData.head()

Unnamed: 0,movieID,reviews
0,tt0111161,Why do I want to write the 234th comment on Th...
1,tt0111161,"\nCan Hollywood, usually creating things for e..."
2,tt0111161,\nI have never seen such an amazing film since...
3,tt0111161,"In its Oscar year, Shawshank Redemption (writt..."
4,tt0111161,The reason I became a member of this database ...


In [15]:
#tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
tokens = [nltk.word_tokenize(review) for review in reviews]

In [16]:
posTokens = [nltk.tag.pos_tag(token) for token in tokens]

In [17]:
adjList = []
for x in posTokens:
    # each x is either a list of (word, POS tag) tuples
    for word, pos in x:
        if pos in ['JJ', 'JJS', 'JJR']: # feel free to add any other tags you may be looking for
            adjList.append(word)

In [18]:
commonAdj= [a for a, b in Counter(adjList).most_common(50)]

In [19]:
dfCommonAdj = pd.DataFrame(columns=commonAdj)
dfCommonAdj

Unnamed: 0,great,best,other,good,many,first,more,such,most,own,...,main,final,funny,important,full,later,emotional,simple,entire,memorable


In [20]:
reviewDataCopy = pd.DataFrame(reviewData)
reviewDataCopy = reviewDataCopy.join(dfCommonAdj)

In [21]:
for c, col in enumerate(reviewDataCopy.columns[2:]):
    for r, row in enumerate(reviewDataCopy.index):
        reviewLower = reviewDataCopy.loc[row,"reviews"].lower()
        if (col in reviewLower):
            reviewDataCopy.loc[row,col] = 1
        else:
            reviewDataCopy.loc[row,col] = 0

Unnamed: 0,movieID,reviews,great,best,other,good,many,first,more,such,...,main,final,funny,important,full,later,emotional,simple,entire,memorable
0,tt0111161,Why do I want to write the 234th comment on Th...,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,tt0111161,"\nCan Hollywood, usually creating things for e...",1,1,1,0,1,0,1,1,...,1,0,0,1,1,0,1,0,0,0
2,tt0111161,\nI have never seen such an amazing film since...,1,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0
3,tt0111161,"In its Oscar year, Shawshank Redemption (writt...",0,1,1,1,1,1,1,1,...,0,1,0,0,0,0,0,1,0,0
4,tt0111161,The reason I became a member of this database ...,1,0,0,0,0,1,0,0,...,1,1,0,0,0,0,0,0,1,0
5,tt0111161,\nI believe that this film is the best story e...,1,1,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
6,tt0111161,\nOne of my all time favorites. Shawshank Rede...,1,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,1
7,tt0111161,\nOne of the finest films made in recent years...,1,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
8,tt0111161,Misery and Stand By Me were the best adaptatio...,1,1,0,1,0,1,1,0,...,0,0,0,0,1,0,0,0,0,0
9,tt0111161,\nThe Shawshank Redemption is without a doubt ...,1,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
reviewDataCopy.drop(['reviews'], axis =1 ,inplace=True)

In [23]:
reviewDataCopy.head()

Unnamed: 0,movieID,great,best,other,good,many,first,more,such,most,...,main,final,funny,important,full,later,emotional,simple,entire,memorable
0,tt0111161,0,1,1,0,1,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,tt0111161,1,1,1,0,1,0,1,1,1,...,1,0,0,1,1,0,1,0,0,0
2,tt0111161,1,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,1,0,0
3,tt0111161,0,1,1,1,1,1,1,1,1,...,0,1,0,0,0,0,0,1,0,0
4,tt0111161,1,0,0,0,0,1,0,0,0,...,1,1,0,0,0,0,0,0,1,0


In [29]:
reviewDataCopy = reviewDataCopy.groupby(["movieID"], group_keys=False, as_index=False).apply(lambda x: x.iloc[:,1:].max())

In [25]:
top250=top250.rename(columns = {'tconst':'movieID'})
top250.head()

Unnamed: 0,num_votes,rating,movieID,title,year
0,1677125,9.3,tt0111161,The Shawshank Redemption,1994
1,1147488,9.2,tt0068646,The Godfather,1972
2,784568,9.0,tt0071562,The Godfather: Part II,1974
3,1662425,9.0,tt0468569,The Dark Knight,2008
4,858289,8.9,tt0108052,Schindler's List,1993


In [26]:
top250 = top250[['movieID', 'year','title', 'rating', 'num_votes']]
top250.head()

Unnamed: 0,movieID,year,title,rating,num_votes
0,tt0111161,1994,The Shawshank Redemption,9.3,1677125
1,tt0068646,1972,The Godfather,9.2,1147488
2,tt0071562,1974,The Godfather: Part II,9.0,784568
3,tt0468569,2008,The Dark Knight,9.0,1662425
4,tt0108052,1993,Schindler's List,8.9,858289


In [31]:
moviesDf = top250.join(reviewDataCopy)

In [33]:
top250.to_csv('../assets/06-project6-assets/data/top250.csv', encoding='utf8', index=False)

In [34]:
reviewDataCopy.to_csv('../assets/06-project6-assets/data/reviewData.csv', encoding='utf8', index=False)

In [None]:
moviesDf.to_csv('../assets/06-project6-assets/data/moviesDf.csv', encoding='utf8', index=False)

#### 9. Repeat the process for the other top 24 titles

## Part 3: Combine Tables in PostgreSQL

#### 1. Import your two .csv data files into your Postgre Database as two different tables

For ease, we can call these table1 and table2

#### 2. Connect to database and query the joined set

#### 3. Join the two tables 

#### 4. Select the newly joined table and save two copies of the into dataframes

## Part 4: Parsing and Exploratory Data Analysis

#### 1. Rename the column headings

#### 2. Run a description of the data

#### 3. Visualize the Data

## Part 3: Build the Decision Tree

#### 1. What is our target attribute? 

#### 2. Prepare the data and define the training set

#### 2. Train the Model

#### 3. Set up test data and test the model

#### 5. Check the results

#### 6. What is overfitting and how are we at risk? 