In [7]:
import re

import numpy as np
import pandas as pd

# Outlier-filtering practice.
# Build a 1000 x 4 frame of standard-normal draws and summarise each column.
# NOTE: no random seed is set in this cell, so the numbers change on every run.
data = pd.DataFrame(
    np.random.standard_normal(size=(1000, 4)),
)
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.022225,-0.0016,-0.044129,0.023246
std,1.009217,0.988354,0.969902,0.981056
min,-3.110813,-3.90889,-2.971658,-2.747363
25%,-0.696846,-0.685035,-0.706512,-0.633922
50%,0.023585,-0.009882,-0.094161,0.013029
75%,0.678929,0.684355,0.628456,0.709694
max,3.592921,3.244134,3.697286,3.250988


In [9]:
# Column label 2 is the third column of the frame.
col = data[2]
# Show only the entries of that column whose magnitude exceeds 3.
col.loc[col.abs().gt(3)]

464    3.109165
938    3.697286
Name: 2, dtype: float64

In [10]:
# Keep every row in which at least one column holds a value with |value| > 3.
data.loc[data.abs().gt(3).any(axis="columns")]

Unnamed: 0,0,1,2,3
45,1.330506,3.244134,-1.476543,1.420195
83,-1.156686,0.952125,1.826703,3.250988
169,-3.018689,0.256629,-1.084638,-0.488053
209,-0.540454,-3.90889,0.736507,-0.288181
271,-3.110813,2.837066,-1.061828,0.551143
353,3.006018,0.240262,-0.576832,0.170193
450,0.085676,-3.294976,-0.298057,-0.265084
464,-0.59343,-0.945572,3.109165,0.527589
503,3.45879,-0.733642,1.102265,0.307158
561,3.592921,-1.107912,-0.171446,1.897271


In [11]:
# Clamp every entry to the interval [-2, 2], modifying `data` in place.
# For the selected entries np.sign(data) is -1 or +1, so 2 * sign gives the
# cap carrying the sign of the original value.
data[data.abs().gt(2)] = 2 * np.sign(data)
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.022494,-0.00026,-0.044204,0.021041
std,0.964551,0.942853,0.934797,0.952382
min,-2.0,-2.0,-2.0,-2.0
25%,-0.696846,-0.685035,-0.706512,-0.633922
50%,0.023585,-0.009882,-0.094161,0.013029
75%,0.678929,0.684355,0.628456,0.709694
max,2.0,2.0,2.0,2.0


In [12]:
# Permutation and random sampling: start from a small 3 x 4 frame holding 0..11.
df = pd.DataFrame(np.arange(12).reshape(3, 4))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [14]:
# Draw a random ordering of the integers 0, 1, 2 to use as row positions.
sampler = np.random.permutation(3)
sampler

array([0, 2, 1])

In [15]:
# Reorder the rows of df by the positions stored in sampler.
df.take(indices=sampler, axis=0)

Unnamed: 0,0,1,2,3
0,0,1,2,3
2,8,9,10,11
1,4,5,6,7


In [16]:
# Same row reordering as df.take(sampler), spelled with positional indexing.
df.iloc[sampler, :]

Unnamed: 0,0,1,2,3
0,0,1,2,3
2,8,9,10,11
1,4,5,6,7


In [17]:
# Permute the four columns instead of the rows.
column_sampler = np.random.permutation(4)
df.take(indices=column_sampler, axis=1)

Unnamed: 0,1,0,3,2
0,1,0,3,2
1,5,4,7,6
2,9,8,11,10


In [18]:
# Pick two random rows; replace=False (the default) means no row is drawn twice.
df.sample(n=2, replace=False)

Unnamed: 0,0,1,2,3
1,4,5,6,7
0,0,1,2,3


In [20]:
# Indicator (dummy) variables: a small frame with a categorical "key" column
# and a running integer "data1" column. Note this rebinds the name `df`.
df = pd.DataFrame(
    {
        "key": ["b", "b", "a", "c", "a", "b"],
        "data1": range(6),
    }
)
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [21]:
# One indicator column per distinct value of "key", stored as floats (0.0 / 1.0).
pd.get_dummies(data=df["key"], dtype=float)

Unnamed: 0,a,b,c
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0
5,0.0,1.0,0.0


In [22]:
# Load the MovieLens movie table. The file has no header row and uses the
# two-character delimiter "::", hence engine="python" (the C parser only
# handles single-character separators).
movie_cols = ["movie_id", "title", "genres"]
movies = pd.read_table(
    "datasets/movielens/movies.dat",
    sep="::",
    header=None,
    names=movie_cols,
    engine="python",
)
movies.head(20)

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [23]:
# Each "genres" string is a "|"-separated list; str.get_dummies splits it and
# produces one 0/1 indicator column per distinct genre.
dummies = movies["genres"].str.get_dummies(sep="|")
# Preview: first 10 rows, first 6 genre columns.
dummies.head(10).iloc[:, :6]

Unnamed: 0,Action,Adventure,Animation,Children's,Comedy,Crime
0,0,0,1,1,1,0
1,0,1,0,1,0,0
2,0,0,0,0,1,0
3,0,0,0,0,1,0
4,0,0,0,0,1,0
5,1,0,0,0,0,1
6,0,0,0,0,1,0
7,0,1,0,1,0,0
8,1,0,0,0,0,0
9,1,1,0,0,0,0


In [24]:
# Label the indicator columns with a "Genre: " prefix, then attach them to the
# original movie table (join aligns on the shared row index).
genre_flags = dummies.add_prefix("Genre: ")
movies_w_indices = movies.join(genre_flags)
# Inspect the first movie as a single record.
movies_w_indices.iloc[0]

movie_id                                        1
title                            Toy Story (1995)
genres                Animation|Children's|Comedy
Genre: Action                                   0
Genre: Adventure                                0
Genre: Animation                                1
Genre: Children's                               1
Genre: Comedy                                   1
Genre: Crime                                    0
Genre: Documentary                              0
Genre: Drama                                    0
Genre: Fantasy                                  0
Genre: Film-Noir                                0
Genre: Horror                                   0
Genre: Musical                                  0
Genre: Mystery                                  0
Genre: Romance                                  0
Genre: Sci-Fi                                   0
Genre: Thriller                                 0
Genre: War                                      0


Section 7.3, "Extension Data Types", discusses pandas' newer support for data types that NumPy doesn't support natively. It's interesting but somewhat niche, so I won't practice it here.

7.4 String Manipulation

In [25]:
# String munging example: splitting on commas keeps the surrounding whitespace,
# which the next step strips away.
val = "a,b,    scottarooni"

val.split(sep=",")

['a', 'b', '    scottarooni']

In [26]:
# Split on commas and trim the whitespace around every piece.
pieces = list(map(str.strip, val.split(",")))
pieces

['a', 'b', 'scottarooni']

In [27]:
# Glue the cleaned pieces back together using "::" as the separator.
str.join("::", pieces)

'a::b::scottarooni'

In [32]:
# Practice with regular expressions (regex).
# `re` is imported in the notebook's import cell rather than here, so all
# dependencies are declared in one place.

text = "papa      johns\t  kinda sucks    \t."

# One-off form: split on runs of one or more whitespace characters.
# The result is kept in a variable; a bare expression that is not the last
# line of a cell would be evaluated and silently thrown away.
words = re.split(r"\s+", text)

# Alternative: compile the pattern once and reuse the compiled object.
regex = re.compile(r"\s+")
assert regex.split(text) == words  # both spellings must give the same pieces
words

['papa', 'johns', 'kinda', 'sucks', '.']

In [34]:
# Example: pull e-mail addresses out of free text.
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com"""

# local-part @ domain . 2-4 letter suffix; the character classes list only
# upper-case letters, so the pattern is compiled case-insensitively.
pattern = r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}"
regex = re.compile(pattern, re.IGNORECASE)

# The pattern has no capture groups, so each match is the full address.
[match.group() for match in regex.finditer(text)]

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [None]:
# String Functions in pandas

In [39]:
# Name -> e-mail mapping with one missing value (Wes), turned into a Series.
# Note this rebinds the name `data` used earlier in the notebook.
data = pd.Series(
    {
        "Dave": "dave@google.com",
        "Steve": "steve@gmail.com",
        "Rob": "rob@gmail.com",
        "Wes": np.nan,
    }
)
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [40]:
# The .str accessor applies string methods element-wise. Missing entries are
# not matched against the pattern; they come back as NaN in the result.
data.str.contains(pat="gmail")

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [42]:
# Convert to the pandas "string" extension dtype; the missing value is then
# shown as <NA> instead of NaN.
data_as_string = data.astype(dtype="string")
data_as_string

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                 <NA>
dtype: string