# Introduction to Python Packages

### Numpy
- Handling large, multi-dimensional arrays and matrices
- High-level mathematical functions
- https://numpy.org/

### Pandas
- Data manipulation and analysis - data table structure
- https://pandas.pydata.org/

### Matplotlib
- Plotting library
- https://matplotlib.org/

### Seaborn
- Data visualization
- https://seaborn.pydata.org/

# Understanding NumPy Arrays

### Importing a package

In [None]:
import numpy as np

### Creating arrays

In [None]:
a = np.array([2,4,6,8,1])
print(a)

### Creating matrices
- (2 X 5)
- (3 X 2 X 5): three stacked (2 X 5) matrices

In [None]:
b = np.array([[2,4,6,8,1],[5,7,8,3,2]]) # nested square brackets: each inner list becomes one row
print(b)

In [None]:
b

In [None]:
c = np.array([[[2,4,6,8,1],[5,7,8,3,2]],
             [[1,2,3,4,7],[3,5,7,7,8]],
             [[5,2,1,1,7],[8,2,2,3,2]]]) 
print(c)

### Creating arrays using function

In [None]:
p = np.zeros((3,3))
print(p)

In [None]:
p = np.ones((3,3))
print(p)

In [None]:
p = np.full((2,2),7)
print(p)

In [None]:
p = np.eye(4)
print(p)

In [None]:
t = np.random.random((3,3))
print(t)

### Selecting array elements

In [None]:
print(a)

In [None]:
print(a[0])
print(a[3])

In [None]:
print(b)

In [None]:
print(b[0,1])
print(b[1,4])

In [None]:
b[1,4] = 7
print(b)

### Operation

In [None]:
A = np.array([[1,-2],[3,5]])
B = np.array([[0,3],[2,2]])
print(A)
print(B)

In [None]:
print(A + B)

In [None]:
print(A * B)

In [None]:
print(A @ B)

In [None]:
print(np.abs(A))

In [None]:
print(np.sqrt(B))

In [None]:
print(np.sum(A))

In [None]:
print(np.sum(A, axis=0))

In [None]:
print(np.sum(A, axis=1))

In [None]:
print(np.sum(A))
print(A.sum())

In [None]:
print(np.sum(A, axis=0))
print(A.sum(axis=0))

# Understanding Pandas Dataframes

### Import pandas package

In [None]:
import pandas as pd

### Create dataframe

In [None]:
data = {'Name':['Kim','Jane','Kevin','Jin'],'Age':[23, 45, 46, 52]}

In [None]:
print(data)

In [None]:
df = pd.DataFrame(data)

In [None]:
df

# Data Import Using Pandas 
- csv file (comma-separated values)
- Go to LearnUS and download the data files "Pdata.csv" and "Pdata_description.xlsx" (location: Class Files - Personal trip data)
- Copy "Pdata.csv" into the folder that contains this Jupyter Notebook
- Check "Pdata_description.xlsx" which includes the descriptions of Pdata

### Importing a csv file

In [None]:
Pdata = pd.read_csv("Pdata.csv")

In [None]:
Pdata

In [None]:
Pdata.head(10)

In [None]:
Pdata.tail(10)

In [None]:
Pdata.describe()

# Data Manipulation Using Pandas

### Selecting a column

In [None]:
Trips = Pdata['NofTrips']
Trips

In [None]:
Pdata.NofTrips

### Create variables

In [None]:
# Create "Age" by subtracting the "Byear" column from 2016.
# NOTE(review): "Byear" is presumably the birth year and 2016 the survey year;
# confirm against Pdata_description.xlsx.
Pdata["Age"] = 2016-Pdata["Byear"]
Pdata

Create age group: 
- (1) create a function for categorizing age, and 
- (2) use the "apply()" method, which applies a function along an axis of the DataFrame


In [None]:
#  Create age group: (1) create a function for categorizing age
def Age_category(row):
    """Return the age-decade label for one record.

    Parameters
    ----------
    row : mapping-like
        A record exposing an ``'Age'`` entry, e.g. the row that
        ``DataFrame.apply(..., axis=1)`` passes in.

    Returns
    -------
    str
        ``'60s'`` for ages 60 and above, ``'50s'`` for 50-59, ``'40s'`` for
        40-49, ``'30s'`` for 30-39, ``'20s'`` for 20-29, and ``'10s'`` for
        every other value (ages below 20, and values such as NaN for which
        all comparisons are false).
    """
    age = row['Age']
    # Lower bounds are inclusive so that an exact decade age (e.g. 60) is
    # labelled with its own decade rather than the one below it.
    decade_floors = ((60, '60s'), (50, '50s'), (40, '40s'), (30, '30s'), (20, '20s'))
    for floor, label in decade_floors:
        if age >= floor:
            return label
    return '10s'

In [None]:
#  Create age group: (2) hand Age_category to "apply()"; axis=1 calls it once per row
Pdata['Age_G'] = Pdata.apply(Age_category, axis=1)

Pdata


In [None]:
#  Categorize job: "apply()" method
## (1) create a function for categorizing job
def Job_category(row):
    """Label one record as 'Unemployed' or 'Employed' from its 'Job' code.

    Job codes 7 and 8 give 'Unemployed'; any other value gives 'Employed'.
    """
    job_code = row['Job']
    return 'Unemployed' if job_code in (7, 8) else 'Employed'

## (2) hand Job_category to "apply()"; axis=1 calls it once per row
Pdata['Job_G'] = Pdata.apply(Job_category, axis=1)

Pdata.head()


In [None]:
#  Expanding display
pd.set_option('display.max_columns',None)
Pdata.head()

### Select subset

In [None]:
# Older than 50 years
Pdata_gt50 = Pdata[Pdata["Age"]>50]
Pdata_gt50

In [None]:
Pdata_gt50.describe()

In [None]:
# Select subset
# 60s & Employed
Pdata_60s_Emp = Pdata[(Pdata["Age_G"]=="60s")&(Pdata["Job_G"]=="Employed")]
Pdata_60s_Emp

In [None]:
Pdata_60s_Emp.describe()

# Basic Data Exploration Using Pandas

### Pivot table

In [None]:
# Average number of trips by age group and employment status.
# The string 'mean' selects pandas' built-in aggregation; recent pandas versions
# emit a FutureWarning when the NumPy callable np.mean is passed instead.
pd.pivot_table(Pdata, values='NofTrips', index=['Age_G'], columns=['Job_G'], aggfunc='mean')

In [None]:
# Average number of trips by age group and employment status, plus the
# "All" row/column added by margins=True.
# The string 'mean' selects pandas' built-in aggregation; recent pandas versions
# emit a FutureWarning when the NumPy callable np.mean is passed instead.
pd.pivot_table(Pdata, values='NofTrips', index=['Age_G'], columns=['Job_G'], aggfunc='mean', margins=True)

In [None]:
# Average shopping duration by household income and gender.
# The string 'mean' selects pandas' built-in aggregation; recent pandas versions
# emit a FutureWarning when the NumPy callable np.mean is passed instead.
pd.pivot_table(Pdata, values='Shop_AD', index=['HHinc'], columns=['Gender'], aggfunc='mean')

In [None]:
# Average number of trips by car ownership and age group.
# The string 'mean' selects pandas' built-in aggregation; recent pandas versions
# emit a FutureWarning when the NumPy callable np.mean is passed instead.
pd.pivot_table(Pdata, values='NofTrips', index=['Ncars'], columns=['Age_G'], aggfunc='mean')

### Group data (Aggregate)

In [None]:
# Extracting "Gu" index: convert Dcode to text, then keep its first four characters
Pdata["Dcode"] = Pdata["Dcode"].map(str)
Pdata["Gcode"] = Pdata["Dcode"].str[:4]


Pdata

In [None]:
# Grouping by "Gu": per-Gcode average of every numeric column.
# numeric_only=True leaves out the text columns (Age_G, Job_G, Dcode); without it,
# pandas 2.0 and later try to average them too and raise a TypeError.
G_summary = Pdata.groupby('Gcode').mean(numeric_only=True)
G_summary

In [None]:
# Sorting by the number of trips (ascending, the default)
G_summary.sort_values(by=['NofTrips'])

In [None]:
# Sorting by the number of trips (descending)
G_summary.sort_values(by=['NofTrips'], ascending=False)

# More information about Data Analysis Using Pandas

### pandas API document
https://pandas.pydata.org/docs/reference/index.html

# Data Visualization Using Pandas

### Line and bar charts

In [None]:
# Example data: average number of trips by age group (rows) and employment status (columns).
# The string 'mean' selects pandas' built-in aggregation; recent pandas versions
# emit a FutureWarning when the NumPy callable np.mean is passed instead.
df = pd.pivot_table(Pdata, values='NofTrips', index=['Age_G'], columns=['Job_G'], aggfunc='mean')
df

In [None]:
# Line plot
df.plot()

In [None]:
# Add label
df.plot(ylabel='Average number of trips', xlabel='Age')

In [None]:
# Bar plot
df.plot(kind='bar')

In [None]:
# Add label
df.plot(kind='bar',ylabel='Average number of trips', xlabel='Age')

### Scatter plot

In [None]:
# Scatter plot with respect to age and travel time
Pdata.plot.scatter(x='Age', y='TripTime')

In [None]:
# Coloring: build one colour per point ('C0' when Job_G is "Employed", 'C1' otherwise)
point_colors = ['C0' if status == "Employed" else 'C1' for status in Pdata.Job_G]
Pdata.plot.scatter(x='Age', y='TripTime', c=point_colors)

### Histogram

In [None]:
# Histogram with respect to travel time
Pdata.TripTime.hist()

In [None]:
# Control the number of bins
Pdata.TripTime.hist(bins=30)

### Box plot
- Properties
![image-2.png](attachment:image-2.png)

In [None]:
# Box plot with respect to travel time
Pdata.boxplot(column='TripTime')

In [None]:
# Box plot with respect to travel time by employment status
Pdata.boxplot(column='TripTime', by='Job_G')

# Data Visualization Using Matplotlib

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
# Create a plot
fig, ax = plt.subplots()

### Line plot

In [None]:
# Line plot: an example
fig, ax = plt.subplots()
ax.plot([1,2,3,4], [1,4,2,3]);

### Box plot

In [None]:
# Box plot with respect to trip time (first attempt, on the unfiltered column)
fig, ax = plt.subplots()
ax.boxplot(Pdata.TripTime); # This does not work because TripTime contains NaN values.

In [None]:
Pdata.head(20)

In [None]:
# Box plot with respect to trip time, keeping only the rows where Trip_made == 1
fig, ax = plt.subplots()
ax.boxplot(Pdata.TripTime[Pdata.Trip_made==1]);

### Multiple plots

In [None]:
# Create multiple plots
fig, axes = plt.subplots(nrows=1, ncols=4)

In [None]:
# Multiple plots
fig, axes = plt.subplots(nrows=1, ncols=4)
Pdata.boxplot(column='TripTime', by='Job_G', ax=axes[0])
Pdata.boxplot(column='NofTrips', by='Job_G', ax=axes[1])
Pdata.boxplot(column='Shop_TT', by='Job_G', ax=axes[2])
Pdata.boxplot(column='Shop_AD', by='Job_G', ax=axes[3])

In [None]:
# Add adjustment
fig, axes = plt.subplots(nrows=1, ncols=4)
Pdata.boxplot(column='TripTime', by='Job_G', ax=axes[0])
Pdata.boxplot(column='NofTrips', by='Job_G', ax=axes[1])
Pdata.boxplot(column='Shop_TT', by='Job_G', ax=axes[2])
Pdata.boxplot(column='Shop_AD', by='Job_G', ax=axes[3])
plt.suptitle('')
plt.tight_layout()

### Bar plot

In [None]:
# Per-Gcode average of every numeric column. numeric_only=True leaves out the text
# columns (Age_G, Job_G, Dcode); without it, pandas 2.0 and later raise a TypeError.
G_summary = Pdata.groupby('Gcode').mean(numeric_only=True)
fig, ax = plt.subplots()
# Caution: after grouping, Gcode is the index of G_summary rather than a column,
# so the x values must come from G_summary.index, not G_summary.Gcode.
ax.bar(G_summary.index, G_summary.Work_TT, color=['C1','C2','C3','C4','C5'])
ax.set_xlabel("Gu code")
ax.set_ylabel("Average commuting time")
ax.set_title("Comparison of commuting time by Gu")


### For more information about Matplotlib
https://matplotlib.org/stable/tutorials/introductory/usage.html#sphx-glr-tutorials-introductory-usage-py

# Data Visualization Using Seaborn

In [None]:
import seaborn as sns

### Histogram and Kernel density

In [None]:
# Histogram with respect to trip time
sns.displot(data = Pdata, x="TripTime")

In [None]:
# Kernel density with respect to trip time
sns.displot(data = Pdata, x="TripTime", kind="kde")

In [None]:
# Overlap
sns.displot(data = Pdata, x="TripTime", kde=True)

In [None]:
# Comparison by employment status
sns.displot(data = Pdata, x="TripTime", hue="Job_G", kind="kde")

In [None]:
# Comparison by age group
sns.displot(data = Pdata, x="TripTime", hue="Age_G", kind="kde")

### Density plots with 2 dimensions

In [None]:
# Density plot with respect to travel time and commute time
sns.displot(data = Pdata, x="TripTime", y="Work_TT")

In [None]:
# Density plot with respect to travel time and commute time coloring by employment status
sns.displot(data = Pdata, x="TripTime", y="Work_TT", hue="Job_G")

In [None]:
# Kernel density plot with respect to travel time and commute time
sns.displot(data = Pdata, x="TripTime", y="Work_TT", kind="kde")

In [None]:
# Kernel density plot with respect to travel time and commute time by employment status
sns.displot(data = Pdata, x="TripTime", y="Work_TT", kind="kde", hue="Job_G")

### Joint plot

In [None]:
# Joint plot with respect to shopping travel time and shopping duration
sns.jointplot(data = Pdata[(Pdata.Shop_AD>0)], x="Shop_TT", y="Shop_AD")

In [None]:
# Joint plot with respect to shopping travel time and shopping duration by employment status
sns.jointplot(data = Pdata[(Pdata.Shop_AD>0)], x="Shop_TT", y="Shop_AD", hue="Job_G")

### For more information
"seaborn" manual: https://seaborn.pydata.org/