In [61]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as sts
import csv
import sqlite3 as sql
%matplotlib inline

In [48]:
#Pull HTML from the archived UN statistics page and parse it
UN_EDUCATION_URL = "http://web.archive.org/web/20110514112442/http://unstats.un.org/unsd/demographic/products/socind/education.htm"
#A timeout keeps the cell from hanging indefinitely if the remote host stalls
r = requests.get(UN_EDUCATION_URL, timeout=30)
#Fail loudly on an HTTP error status instead of parsing an error page as data
r.raise_for_status()
soup = BeautifulSoup(r.content,'html.parser')

In [49]:
#Extract the rows of interest from the table at index 6 of the parsed page
#Only <tr> elements with a length of 25 are kept; anything else is not needed
urlData = [row for row in soup('table')[6]('tr') if len(row) == 25]

In [50]:
#Manipulate HTML data into DataFrame
colList=['Country','Year','Total','Men','Women']
#Positions of the <td> cells that supply each colList field, in the same order
cellPositions=[0,1,4,7,10]

#Collect all rows first and build the DataFrame once: growing a frame one
#.loc row at a time re-copies the data on every insert
rows=[]
for entry in urlData:
    cells = entry('td')
    #Positions 0 and 1 (Country, Year) stay as text; later positions are parsed as integers
    rows.append([int(cells[v].get_text()) if v > 1 else cells[v].get_text() for v in cellPositions])

tableData = pd.DataFrame(rows, columns=colList)

#Set country as index; passing the column name moves the column into the index,
#so no separate drop (and no positional axis argument) is needed
tableData=tableData.set_index('Country')
tableData.head()

Unnamed: 0_level_0,Year,Total,Men,Women
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,2004,8.0,11.0,5.0
Albania,2004,11.0,11.0,11.0
Algeria,2005,13.0,13.0,13.0
Andorra,2008,11.0,11.0,12.0
Anguilla,2008,11.0,11.0,11.0


In [None]:
#Evaluate the data captured from the UN on school life
#Significance level used to decide the outcome of the t-test below
ALPHA = .05

#For each of Total, Men and Women: print mean, median and the normality-test
#p-value, then draw a histogram of the values
for col in colList[2:]:
    #Cast to float so scipy and matplotlib receive a plain numeric array
    values = tableData[col].astype(float)
    s,p = sts.normaltest(values)
    print('Mean School Life Expectancy for %s is %0.2f' % (col,values.mean()))
    print('Median School Life Expectancy for %s is %0.2f' % (col,values.median()))
    print('Normality Test P-Value for %s is %0.2f\n' % (col,p))

    #One figure per column, with title and axis labels set on its own axes
    fig, ax = plt.subplots()
    ax.hist(values)
    ax.set_title(col)
    ax.set_xlabel('School Life Expectancy (years)')
    ax.set_ylabel('Frequency')

#NOTE(review): the Men and Women values are taken per country, so the samples
#look paired; sts.ttest_rel may be the more appropriate test -- confirm
s,p = sts.ttest_ind(tableData['Men'].astype(float),tableData['Women'].astype(float))
print('Null Hypothesis: School Life Expectancy is equivalent for Men and Women regardless of country')
print('T-Test P-Value: %0.2f' % p)
if p > ALPHA:
    #A large p-value means the test found no evidence against the null
    #hypothesis; it does not show the null hypothesis to be true
    print('Failed to Reject Null Hypothesis')
else:
    print('Null Hypothesis REEEEjected!')


In [71]:
#Pull World Bank data into a DataFrame (shortcut)
fileName='API_NY.GDP.MKTP.CD_DS2_en_csv_v2.csv'
#Keep the country name plus the yearly columns '1999' through '2010'
yearCols=[str(year) for year in range(1999,2011)]
moneyData=pd.read_csv(fileName,header=2,usecols=['Country Name']+yearCols)
#Passing the column name moves it into the index, so no separate drop is needed
moneyData=moneyData.set_index('Country Name')

#Create database
con = sql.connect('WorldBankData.db')
cur = con.cursor()

#Decode TEXT values as UTF-8 and ignore undecodable bytes.
#The decode method is used instead of the `unicode` builtin, which only exists
#on Python 2 and raises NameError on Python 3
con.text_factory = lambda x: x.decode('utf-8','ignore')

#Push moneyData DataFrame into the database
#if_exists='replace' drops a previously written table before inserting, which
#keeps the cell re-runnable; the `flavor` argument is omitted because current
#pandas no longer accepts it
moneyData.to_sql('gdp',con,if_exists='replace')

#Push tableData DataFrame into the database
tableData.to_sql('education',con,if_exists='replace')

#Commit explicitly so both tables are persisted to WorldBankData.db
con.commit()