In [None]:
import numpy as np

In [None]:
# Two separate 1-D integer arrays, each holding 0 through 8
x, y = np.arange(9), np.arange(9)

In [None]:
x * y

In [None]:
# Matrix operation: `@` is the matrix-multiplication operator. For two 1-D arrays
# it returns their dot product (a single number), whereas `*` multiplies element by element.
x @ y

In [None]:
# Rearrange the same nine values as a 3x3 grid and as a 9x1 column
x1 = np.reshape(x, (3, 3))
x2 = np.reshape(x, (9, 1))

In [None]:
x

In [None]:
x1

In [None]:
x2

## Challenge

1. Create a 3x3 array containing the numbers 0-8
2. Multiply the array by itself (elementwise)
3. Multiply the array by its transpose
4. Divide the array by itself

In [None]:
# Challenge step 1: the integers 0-8 laid out row by row in a 3x3 array
c1 = np.reshape(np.arange(9), (3, 3))
c1

In [None]:
c1*c1

In [None]:
np.transpose(c1)

In [None]:
# NOTE: `*` is elementwise, so this pairs c1[i, j] with c1[j, i].
# The matrix product of c1 with its transpose would be `c1 @ c1.T`.
c1 * np.transpose(c1)

In [None]:
# Elementwise division. c1[0, 0] is 0, so that entry is 0/0: NumPy emits a
# RuntimeWarning and stores nan there; every other entry is 1.0.
c1 / c1

In [None]:
# Can also do transpose as this
c1 * c1.T

# Working with data in Pandas

In [None]:
import pandas as pd

In [None]:
# Load the Oceania GDP table from the working directory, using the
# "country" column as the row index instead of a numeric index.
data = pd.read_csv("gapminder_gdp_oceania.csv", index_col = "country")
data

In [None]:
# Good advice about python
import this

In [None]:
data.index

In [None]:
data.columns

In [None]:
data.shape

In [None]:
# Data frames can tell you about their contents
data.info()

In [None]:
# Get descriptive statistics
data.describe()

In [None]:
# Get the first N rows
data.head(1)

In [None]:
data.T.describe()

In [None]:
data

In [None]:
data.T

##  Subsetting data

In [None]:
data_europe = pd.read_csv("gapminder_gdp_europe.csv", index_col = "country")
data_europe.head(3)

In [None]:
# Get item by index location
data_europe.iloc[0,0]

In [None]:
# Get item by label
data_europe.loc["Albania","gdpPercap_1952"]

## A brief digression on string methods

In [None]:
# Plain Python strings come with methods; title() capitalises the first letter of each word
greeting = "hello"
big_hello = greeting.title()
big_hello

In [None]:
help("hello".title)

In [None]:
# What methods are available
dir("hello")

In [None]:
data_europe.columns

In [None]:
dir(data_europe.columns.str)

In [None]:
data_europe.columns

In [None]:
# Rewrite column headers to drop the leading "gdpPercap_" text, leaving only the year.
# str.strip("gdpPercap_") treats its argument as a *set of characters* to trim from
# both ends of each label, so it only gives the right answer while the remainder
# starts and ends with characters outside that set (here, digits).
# removeprefix removes exactly this prefix and nothing else (pandas >= 1.4).
data_europe.columns = data_europe.columns.str.removeprefix("gdpPercap_")
data_europe.columns

In [None]:
data_europe.loc["Albania", "1952"]

In [None]:
# Select a block of rows and columns by label. Unlike positional slices, .loc label
# slices include BOTH endpoints, so "Poland" and "1972" are part of the result.
# The year labels rely on the "gdpPercap_" prefix having been removed from the columns.
europe_subset = data_europe.loc["Italy":"Poland","1962":"1972"]
europe_subset

In [None]:
#Get all of the variables for a subset of rows
data_europe.loc["Italy":"Poland", :]

In [None]:
europe_subset.max()

## Challenge
1. Calculate `europe_subset.max()` and assign the result to a variable. What kind of thing is it?
2. What is the maximum value of the new thing?
3. Can you calculate the max value of the new thing in 1 step?

In [None]:
# max() on a data frame reduces each column, giving one maximum per year column.
# info() prints a short summary of the result, starting with its class.
# NOTE(review): info() on this kind of object presumably needs pandas >= 1.4 — confirm;
# type(subset_max) answers "what kind of thing is it?" on any version.
subset_max = europe_subset.max()
subset_max.info()

In [None]:
max(subset_max)

In [None]:
max(europe_subset.max())

In [None]:
europe_subset.max().max()

In [None]:
type(europe_subset)

In [None]:
type(subset_max)

In [None]:
europe_subset

In [None]:
europe_subset.max(axis = 0)

In [None]:
europe_subset.max(axis = 1)

In [None]:
# Get max for overall data frame: axis = None reduces over rows and columns at once.
# NOTE(review): presumably this returns a single scalar only on pandas >= 2.0 (older
# releases treated None like the default axis) — confirm against the installed version.
europe_subset.max(axis = None)

# Filter data by criterion


In [None]:
europe_subset

In [None]:
# Flag GDPs greater than 10,000: the comparison returns a True/False frame
# with the same shape as europe_subset
europe_subset > 10000

In [None]:
# Keep the cells that meet the criterion; every other cell becomes NaN
df = europe_subset.where(europe_subset > 10000)
df

In [None]:
# Keep only values above the median taken over the whole subset; the rest become NaN
europe_subset.where(europe_subset > europe_subset.median(axis = None))

In [None]:
europe_subset > europe_subset.median(axis = None)

# Working with missing data

In [None]:
# Missing data (NaN) is ignored by default when pandas computes statistics such as mean()

In [None]:
print("Column Means")
df.mean()

In [None]:
print("Row Means")
df.mean(axis = 1)

In [None]:
# Check for missing values
df.isna()

In [None]:
# Count our missing values: the first sum() counts the True flags in each column,
# the second sum() adds those per-column counts into one total
df.isna().sum().sum()

In [None]:
df.isna()

In [None]:
df.count()

In [None]:
# isna() already returns a True/False frame, so comparing it with True
# produces the same mask again; the `== True` is redundant
df.isna() == True

In [None]:
# Drop missing values: with no arguments, dropna() removes every ROW that contains
# at least one NaN. It returns a new frame; df itself is not modified.
df.dropna()

In [None]:
# Replace missing values with a fixed value
df_fixed = df.fillna(99)
df_fixed

In [None]:
# Replace missing values with interpolated data.
# With no arguments interpolate() is linear and works down each column (axis 0),
# i.e. it fills a gap from the neighbouring rows (countries) in the same year column,
# not from the same country's neighbouring years.
# NOTE(review): NaNs above the first valid value in a column are presumably left
# unfilled with the default fill direction — confirm on the displayed output.
df_interpolated = df.interpolate()
df_interpolated

## Challenge: The perils of missing data
1. Create an array of random numbers matching the shape of the "data_europe" dataframe

random_filter = np.random.rand(30,12) * data_europe.max().max()

2. Create a new data frame that filters out all the values lower than the random numbers
3. Interpolate new values for the missing data. How accurate are they?


In [None]:
# Random thresholds, one per cell of data_europe, scaled to the largest GDP value.
# The shape comes from the frame itself rather than a hard-coded (30, 12), and the
# generator is seeded so that Restart & Run All reproduces the same filter.
rng = np.random.default_rng(42)
random_filter = rng.random(data_europe.shape) * data_europe.max().max()

In [None]:
random_filter

In [None]:
# Keep only the values that exceed their random threshold; the rest become NaN
data_filtered = data_europe.where(data_europe > random_filter)

In [None]:
data_filtered

In [None]:
df_interpolated_random = data_filtered.interpolate()
df_interpolated_random

In [None]:
# Absolute percent error of each interpolated value relative to the true value
data_europe.sub(df_interpolated_random).div(data_europe).abs().mul(100)

In [None]:
data_europe.describe()

In [None]:
df_interpolated_random.describe()

# Sorting and grouping data

In [None]:
# Z-score every GDP value against the whole table: subtract the mean of the
# column means, then divide by the sample standard deviation (ddof = 1) of all values
z = data_europe.sub(data_europe.mean().mean()).div(data_europe.values.std(ddof = 1))
z

In [None]:
# Average each country's z-scores across its columns, then show them lowest to highest
mean_z = z.mean(axis = "columns")
mean_z.sort_values()

In [None]:
# True for countries whose average z-score is above zero
bool_z = mean_z.gt(0)
bool_z

In [None]:
# Add new columns to data frame: the per-country mean z-score and a True/False flag.
# NOTE(review): these assignments modify data_europe in place. Cells that treat every
# column as a GDP year (the random filter, the z-score) will also pick up
# mean_z / wealthy if they are re-run without reloading the CSV first.
data_europe["mean_z"] = mean_z
data_europe["wealthy"] = bool_z

In [None]:
data_europe

In [None]:
# Get descriptive statistic for groups
data_europe.groupby("wealthy").mean()

In [None]:
data_europe.groupby("wealthy").describe()

In [None]:
# Write output: save data_europe, as it stands at this point in the notebook,
# to a CSV file in the working directory
data_europe.to_csv("europe_normed.csv")