### Intro: find missing data

In [7]:
import pandas as pd
import numpy as np
import math

In [10]:
# Here's building a function using def: 

def square_root(x):
    """Return the square root of ``x`` as computed by ``math.sqrt``."""
    root = math.sqrt(x)
    return root


In [20]:
# The same square-root function again, this time written as a lambda
# expression bound to a name; the last line displays the result for 169.
square_root = lambda x: math.sqrt(x)
square_root(169)

13.0

In [1]:
# Use the isnull() method to detect missing values: the output shows
# True wherever a value is missing. Using that Boolean result as an
# index into the dataset returns just the entries that are missing.
# A dataset can represent missing data in several ways. In this
# example, missing data is represented both as np.NaN (NumPy "Not a
# Number") and as the Python None value.

### Intro: fill in missing data

In [2]:
# To fill in missing data, use fillna(). fillna() needs a replacement
# value; for numeric data the mean, median, or mode is commonly used.
# Let's use the same data set and this time fill in the missing
# values with the mean.

In [3]:
# We could also just drop all the NAs, by using dropna()

### Write the equivalent lambda function

In [48]:
# Write the equivalent lambda function for the following def 
# function: 

def f(x):
    """Return ``x`` squared."""
    return x**2


# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print f(8)` statement is a SyntaxError on Python 3.
print(f(8))

64


In [21]:
# Lambda counterpart of f: raises its argument to the power of two.
l = lambda x: pow(x, 2)
l(2)

4

### Lab 3.4

In [99]:
# Load the raw CSV export directly from its GitHub URL into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/suneel0101/lesson-plan/master/crunchbase_monthly_export.csv"
)


In [100]:
# Show the column labels exactly as they were read, then preview the first
# rows. print() is used in its function form so the cell runs on both
# Python 2 and Python 3.
print(data.columns.values)
data.head()

['permalink' 'name' 'homepage_url' 'category_list' ' market '
 ' funding_total_usd ' 'status' 'country_code' 'state_code' 'region' 'city'
 'funding_rounds' 'founded_at' 'founded_month' 'founded_quarter'
 'founded_year' 'first_funding_at' 'last_funding_at' 'Unnamed: 18']


Unnamed: 0,permalink,name,homepage_url,category_list,market,funding_total_usd,status,country_code,state_code,region,city,funding_rounds,founded_at,founded_month,founded_quarter,founded_year,first_funding_at,last_funding_at,Unnamed: 18
0,/organization/canal-do-credito,Canal do Credito,http://www.canaldocredito.com.br,|Credit|Technology|Services|Finance|,Credit,750000,,BRA,,Rio de Janeiro,Belo Horizonte,1,,,,,1/1/10,1/1/10,
1,/organization/waywire,#waywire,http://www.waywire.com,|Entertainment|Politics|Social Media|News|,Entertainment,1750000,acquired,USA,NY,New York City,New York,1,6/1/12,2012-06,2012-Q2,2012.0,6/30/12,6/30/12,
2,/organization/tv-communications,&TV Communications,http://enjoyandtv.com,|Games|,Games,4000000,operating,USA,CA,Los Angeles,Los Angeles,2,,,,,6/4/10,9/23/10,
3,/organization/rock-your-paper,'Rock' Your Paper,http://www.rockyourpaper.org,|Publishing|Education|,Education,40000,operating,EST,,Tallinn,Tallinn,1,10/26/12,2012-10,2012-Q4,2012.0,8/9/12,8/9/12,
4,/organization/in-touch-network,(In)Touch Network,http://www.InTouchNetwork.com,|Electronics|Guides|Coffee|Restaurants|Music|i...,Apps,1500000,operating,GBR,,London,London,1,4/1/11,2011-04,2011-Q2,2011.0,4/1/11,4/1/11,


In [101]:
# Strip the surrounding spaces from the two padded column labels.
data.rename(
    columns={
        " market ": "market",
        " funding_total_usd ": "funding_total_usd",
    },
    inplace=True,
)


In [142]:
# Clean the funding_total_usd column so missing values can be detected.
# The raw values are strings; the substitutions run in this exact order
# because the ' -   ' pattern must be matched before spaces are stripped.
data["funding_total_usd"] = (
    data["funding_total_usd"]
    .replace({',': ''}, regex=True)       # drop comma separators
    .replace({' -   ': '0'}, regex=True)  # dash placeholder becomes '0'
    .replace({' ': ''}, regex=True)       # remove remaining spaces
    .astype(float)                        # string -> float
)

# Treat 0.0 as missing. A single .loc assignment is used instead of chained
# indexing (data[col][mask] = ...), which may write to a temporary copy and
# raises SettingWithCopyWarning. np.nan is the canonical spelling; the
# np.NaN alias no longer exists in NumPy 2.0.
data.loc[data["funding_total_usd"] == 0.0, "funding_total_usd"] = np.nan


In [138]:
# Impute missing funding totals with the column mean, rounded via round().
data["funding_total_usd"] = data["funding_total_usd"].fillna(
    round(data["funding_total_usd"].mean())
)


In [145]:
# Count the NaN values still present in funding_total_usd (0 once the column
# has been imputed), then display the rounded column mean.
# print() in function form works on both Python 2 and Python 3; the
# vectorised .sum() replaces the builtin sum() over the boolean Series.
print(data["funding_total_usd"].isnull().sum())
round(np.mean(data["funding_total_usd"]))

0


18005069.0

In [181]:
# Function to count the NaN values in a column of the dataframe
def num_missing(x):
    """Return the number of missing (null) entries in ``x``.

    ``x`` must expose ``isnull()``; when used with ``DataFrame.apply`` along
    axis 0 it receives one column at a time.
    """
    # Vectorised count instead of iterating the booleans with builtin sum().
    return x.isnull().sum()

# Run the missing-value counter over every column of the frame.
data.apply(num_missing, axis="index")

permalink                0
name                     1
homepage_url          3036
category_list         3953
market                3953
funding_total_usd        0
status                2742
country_code             0
state_code           17177
region                5335
city                  4841
funding_rounds           0
founded_at            9658
founded_month         9724
founded_quarter       9724
founded_year          9724
first_funding_at         0
last_funding_at          0
dtype: int64

In [147]:
# Drop the 'Unnamed: 18' column. errors='ignore' keeps this cell safe to
# re-run: without it a second execution raises KeyError because the column
# has already been removed in place.
data.drop('Unnamed: 18', axis=1, inplace=True, errors='ignore')

In [150]:
# Check the mode of the country code column.
from scipy.stats import mode  # import kept in case other cells use this name

# Series.mode() works on string columns and returns the most frequent
# value(s) in sorted order, so [0] picks a single one. scipy.stats.mode is
# avoided here because it is aimed at numeric arrays.
country_mode = data['country_code'].mode()[0]
print(country_mode)

# Fill NaN values in the country code column with that mode, computed once.
data['country_code'] = data['country_code'].fillna(country_mode)

'USA'

In [204]:
# Pivot table: sum funding_rounds and funding_total_usd per country_code,
# ordered by summed funding_total_usd from largest to smallest.
table = pd.pivot_table(
    data,
    values=["funding_total_usd", "funding_rounds"],
    index=["country_code"],
    aggfunc=np.sum,
).sort_values(by="funding_total_usd", ascending=False)
table

Unnamed: 0_level_0,funding_rounds,funding_total_usd
country_code,Unnamed: 1_level_1,Unnamed: 2_level_1
USA,54586,6.217333e+11
CHN,1993,3.538789e+10
GBR,3778,2.834512e+10
CAN,1875,1.597638e+10
IND,1039,1.337640e+10
DEU,1284,1.242473e+10
RUS,537,9.785176e+09
ISR,1023,7.005875e+09
FRA,1163,6.998873e+09
JPN,590,5.563605e+09


In [200]:
# # pd.pivot_table(data,index=["country_code","market"])

# pd.pivot_table(data,index=["country_code"],columns=["status"],values=["funding_total_usd","funding_rounds","founded_year"],
#                aggfunc=[np.mean])

In [201]:
# pd.pivot_table(data,index=["market"],columns=["status"],values=["funding_total_usd","funding_rounds","founded_year"],
#                aggfunc=[np.mean])

In [183]:
# Look up the country_code value stored under index label 0.
data.loc[0, "country_code"]

'BRA'

In [202]:
# table1 = pd.pivot_table(data,index=["country_code","market"],columns=["status"],values=["funding_total_usd","funding_rounds","founded_year"],
#                aggfunc=[np.mean], fill_value = 0)
# table1.query('country_code == ["USA"]')

In [205]:
# Pivot table: sum funding_rounds and funding_total_usd for every
# (country_code, market) pair, ordered by summed funding_total_usd from
# largest to smallest.
table = pd.pivot_table(
    data,
    values=["funding_total_usd", "funding_rounds"],
    index=["country_code", "market"],
    aggfunc=np.sum,
).sort_values(by="funding_total_usd", ascending=False)
table

Unnamed: 0_level_0,Unnamed: 1_level_0,funding_rounds,funding_total_usd
country_code,market,Unnamed: 2_level_1,Unnamed: 3_level_1
USA,Biotechnology,6573,6.872213e+10
USA,Mobile,1979,4.437554e+10
USA,Clean Technology,1734,3.364742e+10
USA,Software,4356,3.162788e+10
USA,Advertising,2618,1.880158e+10
USA,Enterprise Software,2046,1.818221e+10
USA,Health Care,1715,1.776095e+10
USA,Curated Web,2406,1.552197e+10
USA,E-Commerce,1828,1.415269e+10
USA,Analytics,1885,1.188475e+10
