## Spatial Data Science (GEO6119)

---

# Lecture 3: Array and DataFrame II

<br>
Instructor: Yi Qiang (qiangy@usf.edu)<br>

___

# Recap of Last Class

## Numpy Array
- An array is an n-dimensional matrix that stores values of the same type.
- An array can have different shapes and sizes.

<img src="image/array.jpg" width="400" align="left">

# 1. Additional Array Manipulation
## 1.1 Array Transposing

<img src="image/wk3/transpose.png" width="300" align="left">


In [None]:
# Create a 3*5 array of random floats (rand draws uniformly from [0, 1))
# No seed is set, so the values differ on every run
import numpy as np
a = np.random.rand(3,5)
a

In [None]:
# Transpose the array with the .T attribute (rows become columns)
a.T

In [None]:
# Transpose the array with np.transpose(), which gives the same result as a.T
np.transpose(a)

## 1.2 Changing Array Shape
### Flatten an Array

<img src="image/wk3/flatten.jpg" width="300" align="left">


In [None]:
# Display array a before flattening it
a

In [None]:
# Show the shape of a as a (rows, columns) tuple
a.shape

In [None]:
# Flatten a into a one-dimensional array and store it in b
b = a.ravel()

# Display b
b

In [None]:
# Show the shape of b; the flattened array has a single dimension
b.shape

### Reshape an array

<img src="image/wk3/reshape.jpg" width="150" align="left">


In [None]:
# Display a, which is a 3*5 array
a

In [None]:
# Reshape to a 5*3 array (the result is only displayed; a itself is not reassigned)
a.reshape(5,3)

## 1.3 Editing Elements in Array
### Edit element at a specific position

In [None]:
# Create a 4*4 array of random values
a = np.random.rand(4,4)
a

In [None]:
# Change the element in the 4th row and 3rd column to 1
# (indices are zero-based, so the 4th row is index 3 and the 3rd column is index 2)
a[3,2] = 1
a

### Edit a subset of array (slicing)

In [None]:
# Change all elements in the 3rd row (index 2) to 2; ':' selects every column
a[2,:] = 2
a

In [None]:
# Change elements from the 1st to the 3rd row, and from the 3rd to the 4th column, to 3.
# The stop index of a slice is exclusive: 0:3 covers rows 0-2 and 2:4 covers columns 2-3.
# (a has only 4 columns, so there is no 5th column to select.)
a[0:3,2:4] = 3
a

## 1.4 Stacking Arrays

### Vertically stacking arrays

<img src="image/wk3/vstack.png" width="200" align="left">


In [None]:
# Create a 3*5 array of zeros (the shape is passed as a tuple)
a = np.zeros((3,5))
a

In [None]:
# Create a 2*5 array of ones (the shape is passed as a tuple)
b = np.ones((2,5))
b

In [None]:
# Vertically stack a and b (b is placed below a); both have the same number of columns
np.vstack((a,b))

### Horizontally stacking arrays

<img src="image/wk3/hstack.png" width="100" align="left">


In [None]:
# Create a 3*5 array of zeros
a = np.zeros((3,5))
a

In [None]:
# Create a 3*2 array in which every element is 3
# (scale an array of ones by 3 in a single expression)
c = 3 * np.ones((3, 2))
c

In [None]:
# Horizontally stack a and c (c is placed to the right of a); both have the same number of rows
np.hstack((a,c))

# 2. Pandas DataFrame
- Designed for spreadsheets and tables

<img src="image/dataframe.jpg" width="600" align="left">

In [None]:
import pandas as pd

# Build a DataFrame row by row: every inner list is one student's record,
# and `columns` names the value at each position of a record.
df1 = pd.DataFrame(
    data=[
        ['Martha', 87, 83, 'A'],
        ['Tim', 91, 99, 'B'],
        ['Rob', 97, 84, 'B'],
        ['Georgia', 95, 76, 'A'],
    ],
    columns=['Name', 'Math', 'Science', 'Class'],
)
df1

In [None]:
# Creating a dataframe by columns: each key is a column name and
# each list holds that column's values, one per row.
# (The dictionary is not called `dict`, so the Python built-in stays usable.)
scores_by_column = {'Name':['Martha', 'Tim', 'Rob', 'Georgia'],
                    'Math':[87, 91, 97, 95],
                    'Science':[83, 99, 84, 76],
                    'Class': ['A','B','B','A']
                   }

df1 = pd.DataFrame(scores_by_column)
df1

## 2.1 Dropping and adding rows and columns

### Drop a row

In [None]:
# Drop a row by its index label (0 here); the result is displayed but df1 is not reassigned
df1.drop(0)

In [None]:
# Get the index label(s) of the row(s) whose Name is 'Martha'
df1[df1.Name == 'Martha'].index

In [None]:
# Drop a row by query: look up the index labels of the matching rows, then drop those labels
df1.drop(df1[df1.Name == 'Martha'].index)

In [None]:
# Update df1 after dropping the row: assign the result of drop() back to df1 to keep the change
df1 = df1.drop(df1[df1.Name == 'Martha'].index)
df1

### Add a row

In [None]:
# Display df1 before adding a row
df1

In [None]:
# Append a row at the end of a DataFrame.
# DataFrame.append was removed in pandas 2.0, so the new row is wrapped in a
# one-row DataFrame and joined with pd.concat; columns are matched by name,
# so the key order of the dictionary does not matter.
# NOTE: the name `dict` shadows the Python built-in; it is kept here because
# the next cell reads this variable.
dict = {'Name': 'Amy', 'Math': 89, 'Class':'A','Science':92}
pd.concat([df1, pd.DataFrame([dict])], ignore_index = True)

In [None]:
# Update df1 after the append: assign the concatenated result back to df1.
# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row DataFrame replaces it.
# `dict` is the row dictionary defined in the previous cell.
df1 = pd.concat([df1, pd.DataFrame([dict])], ignore_index = True)
df1

### Add a column

#### Add a column of the same value

In [None]:
# Assigning a single value to a new column name creates the column
# and fills every row with that value
df1['School'] = 'Mckitrick'
df1

In [None]:
# Use the .loc indexer to update values in the selected rows and column:
# the first argument is the row condition, the second is the column label
df1.loc[df1['Name'] == 'Tim','School'] = 'Hammond'
df1

### Drop a column

In [None]:
# Drop the Math column (axis = 1 means columns); the result is displayed but df1 is not reassigned
df1.drop('Math',axis = 1)

In [None]:
# Drop the Science and Class columns, and assign the result back to df1 to keep the change
# (Math is kept: the drop in the previous cell was only displayed)
df1 = df1.drop(['Science','Class'],axis = 1)
df1

### Vertically concatenate DataFrames

<img src="image/wk3/v_concat.png" width="250" align="left">

In [None]:
# Display df1 before concatenation
df1

In [None]:
# Create a new dataframe from a dictionary of columns.
# (The dictionary is not called `dict`, so the Python built-in stays usable.)
new_students = {'Name':['Dave', 'Jack', 'Elena'],
                'Math':[79, 99, 87],
                'School': ['Hammond','Lutz','Lutz']
               }

df2 = pd.DataFrame(new_students)
df2

In [None]:
# Stack df2 below df1 and renumber the rows 0..n-1 in the same call
df1 = pd.concat([df1, df2], ignore_index=True)
df1

### Horizontally concatenate DataFrames

<img src="image/wk3/h_concate.jpg" width="300" align="left">

In [None]:
# Create a single-column dataframe holding seven Science scores
df3 = pd.DataFrame({'Science':[88,72,97,84,87,93,88]})
df3

In [None]:
# Horizontally concatenate df1 and df3: axis = 1 places df3's columns next to df1's,
# and rows are matched by index label
pd.concat([df1,df3],axis = 1)

## 2.2 More Manipulation of DataFrame

In [None]:
# Load county level population data from an Excel file hosted on GitHub
# (the file is read directly from the URL, so an internet connection is needed)
url = 'https://raw.githubusercontent.com/qiang-yi/spatial_data_science/main/other/county_pop.xlsx'
df_pop = pd.read_excel(url)

In [None]:
# Preview the dataframe (head() returns the first 5 rows by default)
df_pop.head()

In [None]:
# Display the dataframe; how many rows and columns are shown depends on the pandas display options
df_pop

In [None]:
# Change the display settings: show at most 20 rows and 20 columns
# (using None instead of 20 would remove the limit and print the full dataframe)
pd.set_option("display.max_rows", 20, "display.max_columns", 20)

df_pop

### Remove missing values

In [None]:
# Boolean Series with one value per row: True if the row contains at least one NaN
df_pop.isna().any(axis=1)

In [None]:
# Get all rows containing NaN
df_pop[df_pop.isna().any(axis=1)]
# Inverse selection (rows without any NaN), kept commented out for reference:
#df_pop[~(df_pop.isna().any(axis=1))]

In [None]:
# Update df_pop to store the rows without NaN
# (~ inverts the boolean mask, so only rows with no missing value are kept)
df_pop = df_pop[~(df_pop.isna().any(axis=1))]

# df_pop.dropna() also works (by default it drops every row that contains a NaN)

df_pop

In [None]:
# Select records of Orleans Parish, Louisiana (both conditions must hold, hence &)
# NOTE(review): the variable name df_hill and the earlier comment refer to Hillsborough County,
# but the filter below selects Orleans Parish, LA -- confirm which county is intended
df_hill = df_pop[(df_pop['Area name'] == 'Orleans Parish') & (df_pop['State'] == 'LA')]
df_hill

In [None]:
# Slice the last four labels off the column index, then turn them into a plain list
col_ls = df_hill.columns[-4:].tolist()
col_ls

In [None]:
# Use 'melt' to convert a wide dataframe to a long one:
# each column listed in col_ls becomes rows, with the default column names 'variable' and 'value'
df_hill2 = df_hill.melt(value_vars = col_ls)
df_hill2

In [None]:
# Melt the dataframe again, this time naming the variable column 'Year'
# and the value column 'Population'
df_hill2 = df_hill.melt(value_vars=col_ls,var_name='Year',value_name = 'Population')
df_hill2

In [None]:
# Preview: remove 'Population ' from the Year column and convert it to integer type
# (the result is only displayed here; df_hill2 is not modified)

df_hill2['Year'].str.replace('Population ','').astype(int)

In [None]:
# Remove 'Population ' from the Year column, and change the column to integer type.
# astype(str) makes the cell safe to re-run: after the first run Year is already an
# integer column, and the .str accessor would fail on it.
# regex=False treats 'Population ' as literal text.
df_hill2['Year'] = df_hill2['Year'].astype(str).str.replace('Population ','',regex=False).astype(int)

In [None]:
# Line plot of Population with Year on the x-axis (Year is moved to the index first)
df_hill2.set_index('Year').plot()

In [None]:
# Bar chart of Population with one bar per Year
df_hill2.set_index('Year').plot.bar()

### Grouping by columns

In [None]:
# Group rows by the State column and take the maximum of every other column within each state
df_pop.groupby(['State']).max()

In [None]:
df_pop.groupby(['Rural-urban continuum code 2013']).mean()