### How to handle missing values

In [2]:
import numpy as np
import pandas as pd
from __future__ import division
pd.set_option('display.width',5000)

In [28]:
# Download the Box Office Mojo page for one movie and parse it, so that
# individual fields (e.g. the production budget) can be looked up on it.
import re

import requests
from bs4 import BeautifulSoup

url = 'http://www.boxofficemojo.com/movies/?id=monstersinc2.htm'
# A timeout keeps the cell from hanging forever if the site does not respond.
response = requests.get(url, timeout=30)
page = response.text

# Name the parser explicitly: otherwise bs4 picks whichever parser happens to
# be installed (and warns), so the parse tree can differ between machines.
soup = BeautifulSoup(page, 'html.parser')

def get_movie_value(soup, field_name):
    """Return the text of the element that follows a labelled field.

    Searches ``soup`` for the first text node matching the regular
    expression ``field_name`` and returns the ``.text`` of that node's next
    sibling.  Returns None when the label is not found or when nothing
    follows it.
    """
    label = soup.find(text=re.compile(field_name))
    if not label:
        return None
    # Taking the next sibling works for most (not all) of the values.
    value_node = label.findNextSibling()
    return value_node.text if value_node else None
    
# Look up the production budget on the scraped page.  For this movie the
# site shows the literal text 'N/A' instead of a figure.
budget = get_movie_value(soup, 'Production Budget')
# A parenthesised single-argument print behaves the same on Python 2 and 3.
print(budget)

N/A


In [3]:
# Load the 2013 movies dataset from the local CSV file.
# NOTE(review): this absolute path is machine-specific; adjust it to wherever
# the CSV lives before running the notebook elsewhere.
MOVIES_CSV = '/Users/username/nyc16_ds8/challenges/challenges_data/2013_movies.csv'
data = pd.read_csv(MOVIES_CSV)

In [4]:
# Peek at rows 5-9 by position; two of them carry the quoted "N/A"
# placeholder in the Budget column.
data.iloc[5:10]

Unnamed: 0,Title,Budget,DomesticTotalGross,Director,Rating,Runtime,ReleaseDate
5,Gravity,100000000,274092705,Alfonso Cuaron,PG-13,91,10/4/13 0:00
6,Monsters University,"""N/A""",268492764,Dan Scanlon,G,107,6/21/13 0:00
7,The Hobbit: The Desolation of Smaug,"""N/A""",258366855,Peter Jackson,PG-13,161,12/13/13 0:00
8,Fast & Furious 6,160000000,238679850,Justin Lin,PG-13,130,5/24/13 0:00
9,Oz The Great and Powerful,215000000,234911825,Sam Raimi,PG,127,3/8/13 0:00


In [5]:
# Missing budgets are stored as the literal text "N/A" (quotes included).
# Because of those strings the entire column was read as strings, so swap
# the marker for NaN and then convert the column to floats.
data['Budget'] = data['Budget'].replace('"N/A"', np.nan).astype(float)

data.iloc[5:10]

Unnamed: 0,Title,Budget,DomesticTotalGross,Director,Rating,Runtime,ReleaseDate
5,Gravity,100000000.0,274092705,Alfonso Cuaron,PG-13,91,10/4/13 0:00
6,Monsters University,,268492764,Dan Scanlon,G,107,6/21/13 0:00
7,The Hobbit: The Desolation of Smaug,,258366855,Peter Jackson,PG-13,161,12/13/13 0:00
8,Fast & Furious 6,160000000.0,238679850,Justin Lin,PG-13,130,5/24/13 0:00
9,Oz The Great and Powerful,215000000.0,234911825,Sam Raimi,PG,127,3/8/13 0:00


In [6]:
# Summary statistics for the numeric columns.  Budget's count is lower than
# the other columns' because its NaN entries are left out of the statistics.
data.describe()

Unnamed: 0,Budget,DomesticTotalGross,Runtime
count,89.0,100.0,100.0
mean,74747190.0,100596900.0,112.26
std,59416920.0,87396410.0,18.190696
min,2500000.0,25568250.0,75.0
25%,28000000.0,42704130.0,98.0
50%,55000000.0,69542370.0,112.0
75%,110000000.0,120475900.0,123.0
max,225000000.0,424668000.0,180.0


In [7]:
# Which columns have missing values, and what fraction of each is missing?
# The mean of a boolean null-mask is the fraction of True entries, so the
# result does not depend on `from __future__ import division` being active.
missing_fraction = data.isnull().mean()
for column in data.columns:
    # %-formatting prints the same "name value" line on Python 2 and 3.
    print('%s %s' % (column, missing_fraction[column]))

Title 0.0
Budget 0.11
DomesticTotalGross 0.0
Director 0.04
Rating 0.0
Runtime 0.0
ReleaseDate 0.0


In [8]:
# Should we just fillna()?  We could (e.g. with the column mean, as below), but it is better to look at the data first:

# data.Budget=data.Budget.fillna(data.Budget.mean())


In [9]:
# Explore patterns before filling in data!
# Total the numeric columns per rating.  The columns are selected explicitly
# because older pandas silently dropped non-numeric columns from a groupby
# sum, whereas pandas 2.x no longer does.
numeric_columns = ['Budget', 'DomesticTotalGross', 'Runtime']
data_gb = data.groupby('Rating')[numeric_columns].sum()
data_gb.head()

Unnamed: 0_level_0,Budget,DomesticTotalGross,Runtime
Rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
G,,268492764,107
PG,1409000000.0,1967035741,1499
PG-13,3961000000.0,5238139144,5523
R,1282500000.0,2586019873,4097


In [10]:
# Some algorithms require that we infer missing values;
# a regression on the other numeric columns is one way to do that.

# Column of ones, to act as the intercept term in the regression.
data['Intercept'] = np.ones(len(data))

predictors = ['Intercept', 'DomesticTotalGross', 'Runtime']
budget_known = data.Budget.notnull()

# Rows with a known budget form the training set ...
y_train = data.Budget[budget_known]
x_train = data[predictors][budget_known]

# ... and rows without one are what we're trying to predict.
y_test = data.Budget[~budget_known]
x_test = data[predictors][~budget_known]



In [12]:
# Fit an ordinary least squares model on the rows with a known budget and
# use it to fill in the missing budgets.
#
# NOTE(review): `data2 = data` binds a second name to the same DataFrame, so
# filling `data2` also fills `data`.  Use `data.copy()` here if the original
# frame should keep its NaNs.
data2 = data
import statsmodels.formula.api as sm  # kept so the name `sm` stays bound as before
# The array-based OLS class lives in statsmodels.api; newer statsmodels
# releases no longer expose it under statsmodels.formula.api.
import statsmodels.api as sm_api

model = sm_api.OLS(y_train, x_train)
results = model.fit()

# Predict every missing budget in one call and write the values back with a
# single .loc assignment.  The chained form `data2.Budget.loc[i] = ...`
# assigns into an intermediate object, which pandas warns about and which
# may not update the frame at all.
missing_rows = y_test.index
if len(missing_rows):
    predicted = np.asarray(results.predict(x_test.loc[missing_rows]))
    data2.loc[missing_rows, 'Budget'] = predicted
    


In summary:

1)  Avoid deleting an entire row of data just because one value is missing.  
2)  We can impute the missing values using different algorithms  
(e.g. regression, KNN).  If we choose to impute:  
    - it's important to be sure that the data is truly 'missing at random'.  
    - keep in mind that an imputed value is inferred from the other columns, so it does not really add any new information to the model.  
3)  If your algorithm permits missing data, then go with it!  

In [None]:
# Other packages: 

# https://pypi.python.org/pypi/fancyimpute/0.0.4

