# Lesson 1: Introduction

## numpy

In [1]:
import numpy as np

'''
The following code is to help you play with Numpy, which is a library 
that provides functions that are especially useful when you have to
work with large arrays and matrices of numeric data, like doing 
matrix matrix multiplications. Also, Numpy is battle tested and 
optimized so that it runs fast, much faster than if you were working
with Python lists directly.
'''

'''
The array object class is the foundation of Numpy, and Numpy arrays are like
lists in Python, except that every thing inside an array must be of the
same type, like int or float.
'''
# Change False to True to see Numpy arrays in action
if True:
    array = np.array([1, 4, 5, 8], float)
    print array
    print ""
    array = np.array([[1, 2, 3], [4, 5, 6]], float)  # a 2D array/Matrix
    print array

'''
You can index, slice, and manipulate a Numpy array much like you would with a
a Python list.
'''
# Change False to True to see array indexing and slicing in action
if True:
    array = np.array([1, 4, 5, 8], float)
    print array
    print ""
    print array[1]
    print ""
    print array[:2]
    print ""
    array[1] = 5.0
    print array[1]

# Change False to True to see Matrix indexing and slicing in action
if True:
    two_D_array = np.array([[1, 2, 3], [4, 5, 6]], float)
    print two_D_array
    print ""
    print two_D_array[1][1]
    print ""
    print two_D_array[1, :]
    print ""
    print two_D_array[:, 2]

'''
Here are some arithmetic operations that you can do with Numpy arrays
'''
# Change False to True to see Array arithmetics in action
if True:
    array_1 = np.array([1, 2, 3], float)
    array_2 = np.array([5, 2, 6], float)
    print array_1 + array_2
    print ""
    print array_1 - array_2
    print ""
    print array_1 * array_2

# Change False to True to see Matrix arithmetics in action
if True:
    array_1 = np.array([[1, 2], [3, 4]], float)
    array_2 = np.array([[5, 6], [7, 8]], float)
    print array_1 + array_2
    print ""
    print array_1 - array_2
    print ""
    print array_1 * array_2

'''
In addition to the standard arthimetic operations, Numpy also has a range of
other mathematical operations that you can apply to Numpy arrays, such as
mean and dot product.

Both of these functions will be useful in later programming quizzes.
'''
if True:
    array_1 = np.array([1, 2, 3], float)
    array_2 = np.array([[6], [7], [8]], float)
    print np.mean(array_1)
    print np.mean(array_2)
    print ""
    print np.dot(array_1, array_2)


[ 1.  4.  5.  8.]

[[ 1.  2.  3.]
 [ 4.  5.  6.]]
[ 1.  4.  5.  8.]

4.0

[ 1.  4.]

5.0
[[ 1.  2.  3.]
 [ 4.  5.  6.]]

5.0

[ 4.  5.  6.]

[ 3.  6.]
[ 6.  4.  9.]

[-4.  0. -3.]

[  5.   4.  18.]
[[  6.   8.]
 [ 10.  12.]]

[[-4. -4.]
 [-4. -4.]]

[[  5.  12.]
 [ 21.  32.]]
2.0
7.0

[ 44.]


##pandas

In [2]:
import pandas as pd

'''
The following code is to help you play with the concept of Series in Pandas.

You can think of Series as an one-dimensional object that is similar to
an array, list, or column in a database. By default, it will assign an
index label to each item in the Series ranging from 0 to N, where N is
the number of items in the Series minus one.

Please feel free to play around with the concept of Series and see what it does

*This playground is inspired by Greg Reda's post on Intro to Pandas Data Structures:
http://www.gregreda.com/2013/10/26/intro-to-pandas-data-structures/
'''
# Change False to True to create a Series object
if True:
    series = pd.Series(['Dave', 'Cheng-Han', 'Udacity', 42, -1789710578])
    print series

'''
You can also manually assign indices to the items in the Series when
creating the series
'''

# Change False to True to see custom index in action
if True:
    series = pd.Series(['Dave', 'Cheng-Han', 359, 9001],
                       index=['Instructor', 'Curriculum Manager',
                              'Course Number', 'Power Level'])
    print series

'''
You can use index to select specific items from the Series
'''
# Change False to True to see Series indexing in action
if True:
    series = pd.Series(['Dave', 'Cheng-Han', 359, 9001],
                       index=['Instructor', 'Curriculum Manager',
                              'Course Number', 'Power Level'])
    print series['Instructor']
    print ""
    print series[['Instructor', 'Curriculum Manager', 'Course Number']]

'''
You can also use boolean operators to select specific items from the Series
'''
# Change False to True to see boolean indexing in action
if True:
    cuteness = pd.Series([1, 2, 3, 4, 5], index=['Cockroach', 'Fish', 'Mini Pig',
                                                 'Puppy', 'Kitten'])
    print cuteness > 3
    print ""
    print cuteness[cuteness > 3]


0           Dave
1      Cheng-Han
2        Udacity
3             42
4    -1789710578
dtype: object
Instructor                 Dave
Curriculum Manager    Cheng-Han
Course Number               359
Power Level                9001
dtype: object
Dave

Instructor                 Dave
Curriculum Manager    Cheng-Han
Course Number               359
dtype: object
Cockroach    False
Fish         False
Mini Pig     False
Puppy         True
Kitten        True
dtype: bool

Puppy     4
Kitten    5
dtype: int64


In [3]:
from pandas import DataFrame, Series

#################
# Syntax Reminder:
#
# The following code would create a two-column pandas DataFrame
# named df with columns labeled 'name' and 'age':
#
# people = ['Sarah', 'Mike', 'Chrisna']
# ages  =  [28, 32, 25]
# df = DataFrame({'name' : Series(people),
#                 'age'  : Series(ages)})

def create_dataframe():
    '''
    Create a pandas dataframe called 'olympic_medal_counts_df' containing
    the data from the table of 2014 Sochi winter olympics medal counts.  

    The columns for this dataframe should be called 
    'country_name', 'gold', 'silver', and 'bronze'.  

    There is no need to  specify row indexes for this dataframe 
    (in this case, the rows will automatically be assigned numbered indexes).
    
    You do not need to call the function in your code when running it in the
    browser - the grader will do that automatically when you submit or test it.
    '''

    countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
                 'Netherlands', 'Germany', 'Switzerland', 'Belarus',
                 'Austria', 'France', 'Poland', 'China', 'Korea', 
                 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
                 'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
                 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]

    # your code here
    olympic_medal_counts_df = DataFrame({'country_name':countries,
                                        'gold':gold,
                                        'silver':silver,
                                        'bronze':bronze})
    return olympic_medal_counts_df

olympic_medal_counts_df = create_dataframe()
print olympic_medal_counts_df

    bronze    country_name  gold  silver
0        9    Russian Fed.    13      11
1       10          Norway    11       5
2        5          Canada    10      10
3       12   United States     9       7
4        9     Netherlands     8       7
5        5         Germany     8       6
6        2     Switzerland     6       3
7        1         Belarus     5       0
8        5         Austria     4       8
9        7          France     4       4
10       1          Poland     4       1
11       2           China     3       4
12       2           Korea     3       3
13       6          Sweden     2       7
14       2  Czech Republic     2       4
15       4        Slovenia     2       2
16       3           Japan     1       4
17       1         Finland     1       3
18       2   Great Britain     1       1
19       1         Ukraine     1       0
20       0        Slovakia     1       0
21       6           Italy     0       2
22       2          Latvia     0       2
23       1      

### Average bronze medals

In [4]:
from pandas import DataFrame, Series
import numpy


def avg_medal_count():
    '''
    Compute the average number of bronze medals earned by countries who 
    earned at least one gold medal.  
    
    Save this to a variable named avg_bronze_at_least_one_gold. You do not
    need to call the function in your code when running it in the browser -
    the grader will do that automatically when you submit or test it.
    
    HINT-1:
    You can retrieve all of the values of a Pandas column from a 
    data frame, "df", as follows:
    df['column_name']
    
    HINT-2:
    The numpy.mean function can accept as an argument a single
    Pandas column. 
    
    For example, numpy.mean(df["col_name"]) would return the 
    mean of the values located in "col_name" of a dataframe df.
    '''


    countries = ['Russian Fed.', 'Norway', 'Canada', 'United States',
                 'Netherlands', 'Germany', 'Switzerland', 'Belarus',
                 'Austria', 'France', 'Poland', 'China', 'Korea', 
                 'Sweden', 'Czech Republic', 'Slovenia', 'Japan',
                 'Finland', 'Great Britain', 'Ukraine', 'Slovakia',
                 'Italy', 'Latvia', 'Australia', 'Croatia', 'Kazakhstan']

    gold = [13, 11, 10, 9, 8, 8, 6, 5, 4, 4, 4, 3, 3, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
    silver = [11, 5, 10, 7, 7, 6, 3, 0, 8, 4, 1, 4, 3, 7, 4, 2, 4, 3, 1, 0, 0, 2, 2, 2, 1, 0]
    bronze = [9, 10, 5, 12, 9, 5, 2, 1, 5, 7, 1, 2, 2, 6, 2, 4, 3, 1, 2, 1, 0, 6, 2, 1, 0, 1]
    
    olympic_medal_counts = {'country_name':Series(countries),
                            'gold': Series(gold),
                            'silver': Series(silver),
                            'bronze': Series(bronze)}
    df = DataFrame(olympic_medal_counts)
    
    # YOUR CODE HERE
    avg_bronze_at_least_one_gold = numpy.mean(df['bronze'][df.gold>0])
    return avg_bronze_at_least_one_gold

print avg_medal_count()

4.2380952381


##Problem Set1: Titanic Suvivor

* Predict by gender only
* More complex prediction

In [10]:
import numpy
import pandas
import statsmodels.api as sm
from __future__ import division

def simple_heuristic(file_path):
    correct = 0
    df = pandas.read_csv(file_path)
    for passenger_index, passenger in df.iterrows():
        predict = None
        if passenger['Sex'] == 'male':
            predict = 0
        else:
            predict = 1
            
        if predict == passenger['Survived']:
            correct+=1
            
    accuracy = correct/len(df)*100
    print "Accuracy:{}%".format(accuracy)
    
simple_heuristic(file_path = "data/titanic_data.csv")

Accuracy:78.6756453423%


In [11]:
'''
Here's the algorithm, predict the passenger survived if:
    1) If the passenger is female or
    2) if his/her socioeconomic status is high AND if the passenger is under 18
'''
def complex_heuristic(file_path):
    correct = 0
    df = pandas.read_csv(file_path)
    for passenger_index, passenger in df.iterrows():
        predict = None
        if passenger['Sex'] == 'female' or (passenger['Age']<18 and passenger['Pclass']==1):
            predict = 1
        else:
            predict = 0
            
        if predict == passenger['Survived']:
            correct+=1
            
    accuracy = correct/len(df)*100
    print "Accuracy:{}%".format(accuracy)
    
complex_heuristic(file_path = "data/titanic_data.csv")

Accuracy:79.1245791246%


In [12]:
df = pd.read_csv("data/titanic_data.csv")


In [16]:
from collections import Counter
sex_survive = [tuple([df.iloc[i]['Sex'],df.iloc[i]['Survived']]) for i in df.index]
sex_survive_counter = Counter(sex_survive)
sex_survive_counter

Counter({('male', 0): 468, ('female', 1): 233, ('male', 1): 109, ('female', 0): 81})

In [None]:


def custom_heuristic(file_path):
    correct = 0
    df = pandas.read_csv(file_path)
    for passenger_index, passenger in df.iterrows():
        predict = None
        if passenger['Sex'] == 'female' or (passenger['Age']<18 and passenger['Pclass']==1):
            predict = 1
        else:
            predict = 0
            
        if predict == passenger['Survived']:
            correct+=1
            
    accuracy = correct/len(df)*100
    print "Accuracy:{}%".format(accuracy)
    
custom_heuristic(file_path = "data/titanic_data.csv")