# Pandas

Python is well suited to basic data manipulation tasks. Pandas, however, offers a higher-level interface specifically designed for data management and analysis. It provides a more efficient, intuitive, and expressive way to work with tabular data, and it is one of the most popular and widely used Python libraries for data science.

https://pandas.pydata.org/docs/user_guide/10min.html

In [None]:
# Installed as part of Anaconda (if done according to class)
# If not, install it using "pip install pandas"

# Typical to import pandas with alias "pd"
import pandas as pd

## Series and DataFrame

* A *Series* in Pandas is a one-dimensional array holding data of any type. 
* A *DataFrame* in Pandas is a two-dimensional array, i.e. a table with rows and columns.

Data sets in Pandas are usually multi-dimensional tables (DataFrames). A Series is like a column, a DataFrame is the whole table.

### Series

In [None]:
# Alternative import style: from pandas import Series, DataFrame

# Build a Series from a plain Python list of strings.
number_words = ["one", "two", "three", "four", "five"]
my_series = pd.Series(number_words)
print(my_series)

# Heads up! If no index is specified, the values are labeled with default
# integer index numbers starting at 0.

In [None]:
# Show the underlying data of the Series (the values, without the index labels)
my_series.values

In [None]:
# Show the index (the labels) of the Series
my_series.index

In [None]:
# Give every value an explicit index label instead of the default 0..n-1

labels = ["a", "b", "c", "d", "e"]
my_series = pd.Series(["one", "two", "three", "four", "five"], index=labels)
print(my_series)

# Values can now be looked up by their label
print(f"Value of index b: {my_series['b']}")

In [None]:
# Assignments (data manipulation): replace the value stored under label "b"

my_series["b"] = my_series["b"] + "_more"
print(my_series)

In [None]:
# Get a subset of the data (filtering) by listing the labels to keep

wanted_labels = ["a", "c", "d", "e"]
my_series = my_series.loc[wanted_labels]
print(my_series)

In [None]:
# Filter on a condition (filtering): the comparison yields a boolean mask,
# and indexing with that mask keeps only the entries where it is True

is_not_three = my_series != "three"
my_series = my_series[is_not_three]
print(my_series)

In [None]:
# Manipulate all data

# Arithmetic operators are applied element-wise to every value in the Series.
# NOTE(review): if the values are still strings at this point, "* 2" repeats
# each string (e.g. "one" -> "oneone") rather than doubling a number.
my_series =  my_series * 2
print(my_series)

# The same element-wise operation wrapped in a function (here a lambda) that
# takes a Series and returns the transformed Series.
my_lambda = lambda s : s * 2
my_series = my_lambda(my_series)
print(my_series)


In [None]:
# Check for existence (key): "in" on a Series tests the index labels,
# not the stored values

for label in ("a", "x"):
    print(label in my_series)

In [None]:
# Simplify it a bit, use a dict: the keys become the index labels and the
# dict values become the Series values

cities = {"Oslo": 634293, "Bergen": 271949, "Kristiansand": 85983}
city_series = pd.Series(data=cities)
print(city_series)

In [None]:
# Restrict what you want: only the dict keys listed in the index are kept

cities = {"Oslo": 634293, "Bergen": 271949, "Kristiansand": 85983}
selected_cities = ["Oslo", "Bergen"]
city_series = pd.Series(cities, index=selected_cities)
print(city_series)

In [None]:
# Properly name it all: give both the values and the index a name
cities = {"Oslo": 634293, "Bergen": 271949, "Kristiansand": 85983}
city_series = pd.Series(cities)
# Alternative: pass name="Population" directly to pd.Series(...)
city_series.name = "Population"
city_series.index.name = "City"
print(city_series)

### DataFrame
* Supports named/labeled rows and columns
* Can perform arithmetic operations on rows and columns
* Supports reading files such as CSV, Excel and JSON

In [None]:
# Simple example: three parallel lists, one entry per person

names = ["Kari", "Ola", "Per"]
ages = [50, 40, 45]
professions = ["Doctor", "Bus Driver", "Engineer"]

# zip() pairs the lists up into one (name, age, profession) row per person
rows = list(zip(names, ages, professions))
person_dataframe = pd.DataFrame(rows)
print(person_dataframe)

In [None]:
# Name columns (simple example): same rows as before, now with column labels

column_names = ["Name", "Age", "Profession"]
rows = list(zip(names, ages, professions))
person_dataframe = pd.DataFrame(rows, columns=column_names)
print(person_dataframe)

In [None]:
# Start the index at another value: shift every index label up by one

person_dataframe.index = person_dataframe.index + 1
print(person_dataframe)

In [None]:
# Custom index: replace the row labels with our own, one label per row
row_labels = ["a", "b", "c"]
person_dataframe.index = row_labels
print(person_dataframe)

In [None]:
# Pick only a few rows

# From the top (head() returns the first 5 rows when n is not given)
first_rows = person_dataframe.head(n=2)
print(first_rows)

# From the end (tail() returns the last 5 rows when n is not given)
last_rows = person_dataframe.tail(n=1)
print(last_rows)

In [None]:
# Check data types: one dtype per column, then the dtype of a single column

column_types = person_dataframe.dtypes
print(column_types)
print(column_types["Age"])

In [None]:
# Simple operations: keep only the rows where Age is greater than 40

is_over_40 = person_dataframe["Age"] > 40
over_40_dataframe = person_dataframe[is_over_40]
print(over_40_dataframe)

************

In [None]:
# A few other examples (perhaps better?): a dict of columns, where each key
# is a column name and each list holds that column's values

persons_dict = {
    "Name": ["Kari", "Ola", "Per"],
    "Age": [50, 40, 45],
    "Profession": ["Doctor", "Bus Driver", "Engineer"],
}

person_dataframe = pd.DataFrame(data=persons_dict)
# With custom row labels: pd.DataFrame(persons_dict, index=["A", "B", "C"])
print(person_dataframe)

In [None]:
# Only certain columns: "Age" is left out of the list, so it is omitted

wanted_columns = ["Name", "Profession"]
person_dataframe = pd.DataFrame(persons_dict, columns=wanted_columns)
print(person_dataframe)

In [None]:
# Adding a column: assign a list with one value per row to a new column name

seniority = [15, 8, 12.5]
person_dataframe["Seniority"] = seniority
print(person_dataframe)

# Inspect the dtype pandas chose for the new column
print(person_dataframe.dtypes["Seniority"])

In [None]:
# Two-dimensional list as data input: each inner list is one row

epl_league_result = [
    ["Manchester City", 45, 89],
    ["Arsenal", 28, 85],
    ["Manchester United", 34, 79],
]
epl_columns = ["Team", "Goal diff.", "Points"]
epl_dataframe = pd.DataFrame(epl_league_result, columns=epl_columns)

# Shift the default 0-based index so the row labels start at 1
epl_dataframe.index = epl_dataframe.index + 1
print(epl_dataframe)


## Read from file