In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1.Basics - Indexing, Labelling and Ordering

We'll be using some data from AirBnB for this example: https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data

In [None]:
import pandas as pd

df = pd.read_csv("../input/new-york-city-airbnb-open-data/AB_NYC_2019.csv")
df.head(3)

## Indexing

So this means a lot of things depending on the context. For pandas the index is the number on the left, which is the unique value that can identify each row. By default, the index is generated by counting up from zero. But in this data, we can see that the database index (which is called the primary key) `id` would also be another good choice.

In [None]:
df2 = df.set_index("id")
df2.head(3)

In [None]:
# See how it's pulling the value by index label (id), not by row position:
# df2 was re-indexed on "id", so 2539 is matched against the id values.
df2.name[2539]

In [None]:
# We'll cover grouping in way more detail in the next chapter.
# numeric_only=True restricts the average to numeric columns; since pandas 2.0
# mean() no longer drops string columns silently and raises a TypeError instead.
df3 = df.groupby("room_type").mean(numeric_only=True)
df3

In [None]:
df3.reset_index()

In [None]:
df3.reset_index(drop=True)

## Sorting

I almost always use `sort_index` after setting the index. If I want the df sorted by column values, I commonly use `sort_values`.

In [None]:
df3.sort_index(ascending=False)

In [None]:
# NOTE: sort_values returns a new, sorted DataFrame. The result is not assigned
# here, so `df` keeps its original row order and df.head(3) shows unsorted rows.
df.sort_values(["neighbourhood_group", "host_name"])
df.head(3)

In [None]:
df.neighbourhood_group.unique()

In [None]:
df.neighbourhood_group.value_counts()

In [None]:
df.sort_values(["neighbourhood_group", "host_name"], ascending=[False, True], inplace=True)

## Rank

Like sorting, but with collision detection.

In [None]:
dfp = df.sort_values("price", ascending=False)
dfp[["id", "host_name", "price"]].head(5)

In [None]:
dfp["price_rank"] = dfp.price.rank(method="max", ascending=False)

In [None]:
dfp[["id", "host_name", "price", "price_rank"]].head(5)

### Recap:

* set_index
* reset_index
* sort_values
* sort_index
* unique
* value_counts
* rank

## Next up: Slicing and Filtering

# 2.Basics - Slicing and Filtering

Perhaps the two most common activities performed on dataframes. As before, AirBnB data: https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data

In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv("../input/new-york-city-airbnb-open-data/AB_NYC_2019.csv")
df.head(2)

## Slicing Columns

In [None]:
df["host_name"]

In [None]:
df.host_name

In [None]:
df[["host_name", "neighbourhood_group"]]

## Filtering on rows (mask filtering)

In [None]:
df[df.host_name == "Taz"]

In [None]:
df.host_name == "Taz"

In [None]:
(df.host_name == "Taz").sum()

In [None]:
mask = df.host_name == "Taz"
df[mask].head(2)

In [None]:
quick_and_cheap = (df.price < 100) & (df.minimum_nights < 3)
quick_and_cheap.sum()

In [None]:
df[quick_and_cheap].head(2)

In [None]:
reviews_consistent = df[(df.reviews_per_month > 3) | (df.number_of_reviews > 50)]
reviews_consistent.head(3)

In [None]:
mask = np.logical_or((df.reviews_per_month > 3), (df.number_of_reviews > 50))
df[mask].head(2)

In [None]:
df[~mask].head(2)

## Filtering columns and rows together

The method is `.loc`

In [None]:
df.loc[mask, ["name", "host_name"]]

In [None]:
df.loc[mask, :].head()

## Filtering based on index?

In [None]:
df.iloc[0, 1]

In [None]:
df2 = df.set_index("id")
df2.iloc[0, :]

In [None]:
df2.iloc[1:4, 6:]

## Provided mask helpers

In [None]:
df.loc[df.price.between(100, 200), "price"].head()

In [None]:
df.loc[df.price.isin([100, 200]), "price"].head()

In [None]:
df == "John"

In [None]:
(df == "John").any()

In [None]:
(df == "John").any(axis=1)

## Views vs Copy

A common pitfall of users is to not understand the difference between views and copies.

In [None]:
df2 = df.copy()

In [None]:
# Chained indexing: df2["name"] is evaluated first, then [0] assigns into that
# intermediate Series. Whether df2 itself changes depends on pandas' view/copy
# rules, and pandas may emit a SettingWithCopyWarning for this pattern.
df2["name"][0] = "TESTING"
df2.head(1)

In [None]:
# Recommended form: one .loc call selects the rows and the column together,
# so the assignment is applied to df2 itself.
df2.loc[df2.index == 0, "name"] = "TESTING2"
df2.head(1)

In [None]:
# Chained indexing again: df2[<mask>] builds a separate DataFrame, so "Oh no" is
# written to that temporary object. head(1) lets you check that df2 itself is
# expected to be unaffected (use df2.loc[mask, "name"] = ... to really assign).
df2[df2.host_name == "John"]["name"] = "Oh no"
df2.head(1)

### Recap:

* .loc
* .iloc
* .between
* .isin
* .any
* .all
* .copy
* Boolean operators: & | ^ ~
* View vs copy


# 3.Basics - Replacing and Thresholding

Also super common for cleaning datasets.

In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv("../input/new-york-city-airbnb-open-data/AB_NYC_2019.csv")
df.head(2)

## Dealing with NaNs

In [None]:
df.info()

In [None]:
df.dropna(subset=["last_review"]).info()

In [None]:
df.fillna(0);

## Generic replace

In [None]:
df.replace("John", "Jono").head(1)

In [None]:
df.host_name.replace("John", "Jono", limit=1)[0]

In [None]:
df.replace({"John": "Jono", "Brooklyn": "Brooky"}).head(1)

## Thresholding

In [None]:
import matplotlib.pyplot as plt
plt.hist(df.price);

In [None]:
plt.hist(df.price.clip(upper=1000));

In [None]:
df2 = df.copy()
df2.loc[df2.price > 1000, "price"] = 1000
plt.hist(df2.price);

### Recap

* dropna
* fillna
* replace
* clip
* manual thresholding

# 4.Basics - Removing and Adding Data

Let's mix it up a bit and change our data source. Now we'll look at some astronauts!
https://www.kaggle.com/nasa/astronaut-yearbook

In [None]:
import pandas as pd

df = pd.read_csv("../input/astronaut-yearbook/astronauts.csv")
df.head(1)

## Modifying Type of Columns

Common for time series, categoricals, or converting strings to numeric

### Timeseries

Note we'll cover time series in a lot more depth in a later chapter; this is mostly a (useful) intro.

In [None]:
birthdate = pd.to_datetime(df["Birth Date"], format="%m/%d/%Y")
birthdate

In [None]:
birthdate.dt.year

In [None]:
# Age of each astronaut, in whole years, at the reference date `zarya`.
zarya = pd.to_datetime("1998-11-20")
# Casting a timedelta Series to 'timedelta64[Y]' is rejected by pandas >= 2.0
# (only s/ms/us/ns resolutions are supported), so derive whole years from the
# day count instead. 365.2425 is the mean Gregorian year length, which is what
# NumPy's 'Y' unit represents; floor division keeps the float result of the
# original cast, with NaN wherever the birth date is missing.
df["age_at_zarya"] = (zarya - birthdate).dt.days // 365.2425
df.head(3)

In [None]:
df["birth"] = birthdate
df.head(3)

### Categoricals

Why use them? The information can be utilised by other libraries that pandas interfaces with, you can provide an explicit sorting order rather than lexical order, and you get huge speed improvements if you group on categories.


In [None]:
df["Military Rank"].unique()

In [None]:
df["Military Rank"].dtype

In [None]:
df["Military Rank"] = df["Military Rank"].astype("category")
df["Military Rank"].dtype

In [None]:
pd.Categorical(df["Military Rank"])

### Numeric / String Conversion

In [None]:
df.head(1)

In [None]:
df.age_at_zarya.astype("str").astype("float").astype("int")[0]

## Removing Columns or Rows

In [None]:
df2 = df[["Name", "Year", "Group"]].copy()
df2.head()

In [None]:
df2.drop("Group", axis=1).head()

In [None]:
df2.drop(1).head()

In [None]:
df2.drop(columns="Group").head()

In [None]:
df2.drop(columns=["Year", "Group"]).head()

## Adding rows

I would have loved to be an astronaut. Alas Australia had no space program when I was a kid.

In [None]:
# DataFrame.append was removed in pandas 2.0. Wrap the new record in a one-row
# DataFrame and concatenate; ignore_index=True renumbers the rows as before.
pd.concat(
    [df2, pd.DataFrame([{"Name": "Samuel Hinton", "Year": 2010, "Group": 20.0}])],
    ignore_index=True,
)

In [None]:
df_sis = pd.DataFrame({"Name": ["Al Hinton"], "Year": [2010], "Group": [20.0]})
df_sis

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# and ignore_index=True renumbers the resulting rows from zero.
pd.concat([df2, df_sis], ignore_index=True)

What if I want it at a specific location? `sort_values`

## Adding Columns

In [None]:
df2["Col1"] = "Whoa"
df2

In [None]:
df2.assign(some_col="someval")

In [None]:
# `n` must be passed by keyword: since pandas 2.0 every argument of str.split
# after `pat` is keyword-only, so the positional form raises a TypeError.
# n=1 splits on the first space only; column 0 of the expanded result is the
# first name, inserted as the left-most column of df2.
df2.insert(0, "FirstName", df.Name.str.split(" ", n=1, expand=True)[0])
df2.head()

## I want to work with rows/columns and I have columns/rows

In [None]:
df3 = df.set_index("Name")
df3.head(2)

In [None]:
df3.T

### Recap

* df["newcol"] = val
* dtypes
* astype
* drop
* append (removed in pandas 2.0 — use `pd.concat` instead)
* assign
* .T

# 5.Basics - Apply, Map and Vectorised Functions

In [None]:
import pandas as pd
import numpy as np

# Draw from a seeded generator so the demo frame (and every output that follows)
# is identical on each Restart & Run All.
rng = np.random.default_rng(42)
data = np.round(rng.normal(size=(4, 3)), 2)
df = pd.DataFrame(data, columns=["A", "B", "C"])
df.head()

## Apply

Used to execute an arbitrary function against an entire dataframe, or a subsection. Applies in a vectorised fashion.

In [None]:
df.apply(lambda x: 1 + np.abs(x))

In [None]:
df.A.apply(np.abs)

In [None]:
# Kept commented out for reference: this scalar-style version is not usable with
# df.apply, which hands the function a whole column at a time (presumably why it
# is disabled — `if x > 0` on a Series has no single truth value).
#def double_if_positive(x):
#    if x > 0:
#        return 2 * x
#    return x
#
#df.apply(double_if_positive)

In [None]:
def double_if_positive(x):
    """Double the positive entries of ``x`` in place and return ``x``.

    The caller's object is modified: entries selected by the ``x > 0`` mask are
    overwritten with twice their value; all other entries are left as they are.
    """
    positive = x > 0
    x[positive] = x[positive] * 2
    return x

df.apply(double_if_positive)

In [None]:
df

In [None]:
def double_if_positive(x):
    """Return a copy of ``x`` with its positive entries doubled.

    The input is copied first, so the object passed in is left untouched.
    """
    doubled = x.copy()
    positive = doubled > 0
    doubled[positive] = doubled[positive] * 2
    return doubled

df.apply(double_if_positive, raw=True)

## Map

Similar to apply, but operates on Series, and uses dictionary based inputs rather than an array of values.


In [None]:
series = pd.Series(["Steve", "Alex", "Jess", "Mark"])

In [None]:
series.map({"Steve": "Stephen"})

In [None]:
series.map(lambda d: f"I am {d}")

## Vectorised functions

Pandas and numpy obviously have tons of these, here are some examples

In [None]:
display(df, df.abs())

In [None]:
series = pd.Series(["Obi-Wan Kenobi", "Luke Skywalker", "Han Solo", "Leia Organa"])

In [None]:
"Luke Skywalker".split()

In [None]:
series.str.split(expand=True)

In [None]:
series.str.contains("Skywalker")

In [None]:
series.str.upper().str.split()

## User defined functions

Let's investigate a super simple example of trying to find the hypotenuse given x and y distances.


In [None]:
# Seeded generator: the 100,000 (x, y) pairs are the same on every run, so the
# hypotenuse values printed by the following cells are reproducible.
rng = np.random.default_rng(0)
data2 = rng.normal(10, 2, size=(100000, 2))
df2 = pd.DataFrame(data2, columns=["x", "y"])

In [None]:
hypot = (df2.x**2 + df2.y**2)**0.5
print(hypot[0])

In [None]:
def hypot1(x, y):
    """Return the hypotenuse length for the two side lengths ``x`` and ``y``."""
    sum_of_squares = x**2 + y**2
    return np.sqrt(sum_of_squares)

h1 = []
# iterrows() yields (index, row) pairs; each two-value row is unpacked into x and y
# and hypot1 is called once per row inside a plain Python loop.
for index, (x, y) in df2.iterrows():
    h1.append(hypot1(x, y))
print(h1[0])

In [None]:
def hypot2(row):
    """Return the hypotenuse length for a row exposing ``x`` and ``y`` attributes."""
    sum_of_squares = row.x**2 + row.y**2
    return np.sqrt(sum_of_squares)

# axis=1 makes apply call hypot2 once per row of df2.
h2 = df2.apply(hypot2, axis=1)
print(h2[0])

In [None]:
def hypot3(xs, ys):
    """Return the element-wise hypotenuse lengths for the side lengths ``xs`` and ``ys``."""
    sums_of_squares = xs**2 + ys**2
    return np.sqrt(sums_of_squares)
# One call on the whole x and y columns — no per-row Python loop.
h3 = hypot3(df2.x, df2.y)
print(h3[0])

Vectorising everything you can is the key to speeding up your code. Once you've done that, you should use other tools to investigate. PyCharm Professional has a great optimisation tool built in. Jupyter has %lprun (line profiler) command you can find here: https://github.com/rkern/line_profiler

### Recap

* apply
* map
* .str & similar

# 6.Extra Practise - Basics

In this optional practise session, I thought it would be fun to look at some cost of living data from, you guessed it, Kaggle: https://www.kaggle.com/stephenofarrell/cost-of-living

Here are the objectives:

1. Rename the "index" column to "location"
2. Utilise apply to generate two new columns from the location - city and country
3. Realise the easy solution doesn't work for the United States and create a function for apply to remove specific states.
4. Figure out which country has the most cities listed, and create a dataset from only that country
5. Sort the dataset by the cost of living 'Apartment (1 bedroom) in City Centre'
6. Cry over housing prices if you live in the Bay Area.

After that, feel free to keep playing with the data yourself.


In [None]:
# Code to start you off and manipulate the data. .T is transpose - swap columns and rows
import pandas as pd

df = pd.read_csv("../input/cost-of-living/cost-of-living.csv", index_col=0).T.reset_index()
df.head()

## Rename column

In [None]:
df2 = df.rename(columns={"index": "location"})

## Get city and country

In [None]:
# Split on ", " at most once, expand it to a dataframe, and then assign each column like a tuple
df2[["City", "Country"]] = df2.location.str.split(", ", n=1, expand=True)
df2.head()

In [None]:
# However, there is actually an issue here, because United States locations carry a state as well,
# i.e. "Austin, TX, United States" leaves "TX, United States" in the Country column.
# We drop the state by keeping only the text after the last ", ".
def strip_state(country):
    """Return ``country`` without any leading "<state>, " prefix.

    Values that are not strings (e.g. a missing value when the location had no
    ", " separator) are returned unchanged instead of raising a TypeError.
    """
    if not isinstance(country, str):
        return country
    return country.rsplit(", ", 1)[-1]

df2["Country"] = df2["Country"].apply(strip_state)
df2.head()

## Figure out which country has the most cities

In [None]:
df2.Country.value_counts()

In [None]:
most_cities = df2.Country.value_counts().index[0]
most_cities

## Create a subset of only that country

In [None]:
df3 = df2[df2.Country == most_cities]
df3

## Sort by housing accommodation

In [None]:
key = "Apartment (1 bedroom) in City Centre"
df4 = df3.sort_values(key, ascending=False)
df4[["location", key]]