# Arrays

In [65]:
# NumPy user guide: https://numpy.org/doc/stable/user/index.html
import numpy as np

# Build a one-dimensional array from a plain Python list.
values = [12, 3, 6, 14]
x = np.array(values)

print("x: ", x)
print(type(x))
# ndim is the number of axes; shape is the length along each axis (a tuple).
print("x.ndim: ", x.ndim)
print("x.shape: ", x.shape)
# Indexing is zero-based, so this is the first element.
print(x[0])

x:  [12  3  6 14]
<class 'numpy.ndarray'>
x.ndim:  1
x.shape:  (4,)
12


In [66]:
# A 2-D array is built from a list of equally long rows.
rows = [
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12],
]
x = np.array(rows)

print("x: ", x)
print(type(x))
print("x.ndim: ", x.ndim)
# shape reads as (number of rows, number of columns).
print("x.shape: ", x.shape)
# Element at row 0, column 1.
print(x[0, 1])

x:  [[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]
<class 'numpy.ndarray'>
x.ndim:  2
x.shape:  (3, 4)
2


In [67]:
# A 3-D array: two blocks, each made of three rows of four values.
first_block = [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]
second_block = [[21, 22, 23, 24], [25, 26, 27, 28], [29, 30, 31, 32]]
x = np.array([first_block, second_block])

# shape reads as (blocks, rows, columns).
print("x.shape: ", x.shape)
# Element at block 0, row 1, column 2.
print(x[0, 1, 2])

x.shape:  (2, 3, 4)
7


## Array Slice

In [68]:
import numpy as np

# The integers 0 through 10.
arr = np.arange(11)

# Slices are written arr[start:stop:step]; the stop index is exclusive.
print(arr[3:8])      # indices 3 to 7
print(arr[3:8:2])    # the same range, taking every second element
print(arr[3:-1])     # from index 3 up to, but not including, the last element
print(arr[3:])       # from index 3 to the end
print(arr[:5])       # the first five elements

[3 4 5 6 7]
[3 5 7]
[3 4 5 6 7 8 9]
[ 3  4  5  6  7  8  9 10]
[0 1 2 3 4]


In [69]:
# Slice off the first three elements of arr.
new = arr[:3]
print(len(new))
print(new)

3
[0 1 2]


In [70]:
# `new` was created by slicing `arr`. As the output of this cell shows, assigning
# through `new` also changes arr[0]: the slice shares its data with `arr`.
new[0] = 100
print(new[0])
print(arr[0])
print(arr)

100
100
[100   1   2   3   4   5   6   7   8   9  10]


## Array Reshape

In [71]:
import numpy as np

# The integers 0 through 11 (12 elements).
arr = np.arange(12)

# Reshape to 3 rows x 4 columns, then copy so that b owns its data.
# Swap in the line below to see the write to b[0, 0] show up in arr as well:
# b = arr.reshape(3, 4)
b = arr.reshape((3, 4)).copy()
print(b)

# Because b is a copy, this write does not reach arr.
b[0, 0] = 100
print(b)
print(arr)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
[[100   1   2   3]
 [  4   5   6   7]
 [  8   9  10  11]]
[ 0  1  2  3  4  5  6  7  8  9 10 11]


In [72]:
# The same values arranged as 2 blocks x 2 rows x 3 columns.
b = arr.reshape((2, 2, 3))
print(b)

[[[ 0  1  2]
  [ 3  4  5]]

 [[ 6  7  8]
  [ 9 10 11]]]


In [73]:
# -1 asks NumPy to infer that axis' length from the array's total size.
b = arr.reshape((2, 2, -1))
print(b)

[[[ 0  1  2]
  [ 3  4  5]]

 [[ 6  7  8]
  [ 9 10 11]]]


In [74]:
# Not an error: arr (defined in the reshape cell above) holds 12 elements and
# 2 * 6 == 12, so this reshape succeeds. A shape whose sizes do not multiply to
# the element count, e.g. (2, 5), would raise ValueError instead.
b = arr.reshape(2, 6)
print(b)

[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]]


# Dataframes

In [75]:
# DataFrame API reference:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html

import pandas as pd

# Load the iris CSV into a DataFrame; the path is relative to the working directory.
df = pd.read_csv('../data/iris.csv')
print(type(df))

<class 'pandas.core.frame.DataFrame'>


In [76]:
# print() renders the first rows of the frame as plain text.
print(df.head())

   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa


In [77]:
# As the last expression in the cell, the frame is shown by the notebook itself
# rather than through print().
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [78]:
# Column labels of the frame.
print(df.columns)

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')


In [79]:
# (number of rows, number of columns)
print(df.shape)

(150, 5)


In [80]:
# Row labels of the frame.
print(df.index)

RangeIndex(start=0, stop=150, step=1)


In [81]:
# iloc selects by integer position: this is the row at position 1 (the second row).
print(df.iloc[1])

sepal_length       4.9
sepal_width        3.0
petal_length       1.4
petal_width        0.2
species         setosa
Name: 1, dtype: object


In [82]:
# Row position 1, column position 1 -> a single value.
print(df.iloc[1,1])

3.0


In [83]:
# copy() gives df2 its own data, so the assignment below leaves df untouched.
df2 = df.copy()
# Alternative to try: plain assignment makes df2 another name for the same
# object, so the write below would be visible through df as well.
# df2 = df

df2.iloc[1,1] = 100
print(df2.iloc[1,1])
print(df.iloc[1,1])

100.0
3.0


In [84]:
# Data type of each column.
print(df.dtypes)

sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
species          object
dtype: object


## Dataframes - Selecting Columns

In [85]:
# Indexing with a list of column names yields a DataFrame;
# indexing with a single column name yields a Series.
feature_cols = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
X = df[feature_cols]
y = df["species"]

print(type(X))
print(type(y))
print(X.shape)
print(y.shape)
# print(X.head())
# print(y.head())

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
(150, 4)
(150,)


In [86]:
# Same selection as before, converted to NumPy arrays with to_numpy().
feature_cols = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
X = df[feature_cols].to_numpy()
y = df["species"].to_numpy()

print(type(X))
print(type(y))

# print(y)

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [87]:
# Features are every column except the target; the target is the "species" column.
y = df["species"]
X = df.drop(columns="species")
X.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


## Confusion Matrix

In [88]:
from sklearn.metrics import confusion_matrix

# True labels and the labels predicted for the same four samples.
actual = [0, 1, 0, 1]
predicted = [1, 1, 1, 0]

cm = confusion_matrix(y_true=actual, y_pred=predicted)
print(type(cm))

# Legend for the 2x2 layout: rows follow the true class, columns the predicted
# class ("A" and "P" presumably stand for actual and predicted).
print(" ", "0 ", "1 ", "P ")
print(0, "TN", "FP")
print(1, "FN", "TP")
print("A")
print()

print("confusion matrix")
print(cm)
print()

<class 'numpy.ndarray'>
  0  1  P 
0 TN FP
1 FN TP
A

confusion matrix
[[0 2]
 [1 1]]



In [89]:
# ravel() returns the matrix flattened in row-major order, so the 2x2 confusion
# matrix unpacks as TN, FP, FN, TP.
# print(type(cm.ravel()))
tn, fp, fn, tp = cm.ravel()
print("TN", tn, "FP", fp, "FN", fn, "TP", tp)

TN 0 FP 2 FN 1 TP 1


In [90]:
from sklearn.metrics import confusion_matrix

actual = ['B', 'M', 'B', 'M']
predicted = ['M', 'M', 'M', 'B']

# With no explicit labels the classes are taken in sorted (alphabetical) order,
# so B is index 0 and M is index 1.
cm = confusion_matrix(y_true=actual, y_pred=predicted)
print(type(cm))

# Legend for the layout: rows follow the true class, columns the predicted class.
print(" ", "B ", "M ", "P ")
print("B", "TN", "FP")
print("M", "FN", "TP")
print("A")
print()

print("CM", cm)
print()

# Row-major flattening of the 2x2 matrix gives TN, FP, FN, TP.
tn, fp, fn, tp = cm.ravel()
print("TN", tn, "FP", fp, "FN", fn, "TP", tp)

<class 'numpy.ndarray'>
  B  M  P 
B TN FP
M FN TP
A

CM [[0 2]
 [1 1]]

TN 0 FP 2 FN 1 TP 1


### Specifying Labels

In [91]:
from sklearn.metrics import confusion_matrix

actual = ['W', 'M', 'W', 'M']
predicted = ['M', 'M', 'M', 'W']

# Sorted order would put M before W; passing labels explicitly makes
# W index 0 (negative) and M index 1 (positive).
class_order = ["W", "M"]
cm = confusion_matrix(y_true=actual, y_pred=predicted, labels=class_order)
print(type(cm))

# Legend for the layout: rows follow the true class, columns the predicted class.
print(" ", "W ", "M ", "P ")
print("W ", "TN", "FP")
print("M ", "FN", "TP")
print("A")
print()

print("CM", cm)
print()

# Row-major flattening of the 2x2 matrix gives TN, FP, FN, TP.
tn, fp, fn, tp = cm.ravel()
print("TN", tn, "FP", fp, "FN", fn, "TP", tp)

<class 'numpy.ndarray'>
  W  M  P 
W  TN FP
M  FN TP
A

CM [[0 2]
 [1 1]]

TN 0 FP 2 FN 1 TP 1


### Multiple Classes

In [92]:
from sklearn.metrics import confusion_matrix

actual = ["cat", "ant", "cat", "cat", "ant", "bird"]
predicted = ["ant", "ant", "cat", "cat", "ant", "cat"]

# Default: the classes appear in sorted order (ant, bird, cat).
cm = confusion_matrix(y_true=actual, y_pred=predicted)
print(cm)
print()

# Spelling out that same order explicitly gives an identical matrix.
cm = confusion_matrix(y_true=actual, y_pred=predicted, labels=["ant", "bird", "cat"])
print(cm)

# Try a different order to see the rows and columns move:
# cm = confusion_matrix(actual, predicted, labels=["ant", "cat", "bird"])
# print(cm)

[[2 0 0]
 [0 0 1]
 [1 0 2]]

[[2 0 0]
 [0 0 1]
 [1 0 2]]


### Attack Data

In [93]:
from sklearn.metrics import confusion_matrix

actual = ['attack', 'normal', 'normal',  'attack', 'normal', 'normal', 'attack', 'normal', 'attack']
predicted = ['attack', 'normal', 'attack', 'attack', 'normal', 'normal', 'normal',  'normal', 'normal']

# List "normal" first so it is index 0 (negative) and "attack" is index 1 (positive).
class_order = ["normal", "attack"]
cm = confusion_matrix(y_true=actual, y_pred=predicted, labels=class_order)
print(cm)

# Row-major flattening of the 2x2 matrix gives TN, FP, FN, TP.
tn, fp, fn, tp = cm.ravel()
print("TN", tn, "FP", fp, "FN", fn, "TP", tp)


[[4 1]
 [2 2]]
TN 4 FP 1 FN 2 TP 2


## Builtin Datasets

In [94]:
from sklearn.datasets import load_iris
# NOTE(review): LinearRegression and OneHotEncoder are not used in the cells
# visible here; confirm whether a later cell needs them.
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Load scikit-learn's bundled iris dataset and inspect the returned object.
iris = load_iris()
print(type(iris))
print(iris.feature_names)
print(iris.target_names)

<class 'sklearn.utils._bunch.Bunch'>
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
['setosa' 'versicolor' 'virginica']


In [95]:
# Prints the whole dataset object, including the full data array, so the output is long.
# NOTE(review): print(iris.keys()) would give a compact overview of the available fields.
print(iris)

{'data': array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
     

In [96]:
# The usual way to get features and target from the loaded dataset:
# `data` is the feature matrix, `target` the integer-encoded class of each row.
X, y = iris.data, iris.target

print("X.shape", X.shape)
print("y.shape", y.shape)

print(y)

X.shape (150, 4)
y.shape (150,)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [97]:
# DESCR holds the dataset's text description.
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

:Number of Instances: 150 (50 in each of three classes)
:Number of Attributes: 4 numeric, predictive attributes and the class
:Attribute Information:
    - sepal length in cm
    - sepal width in cm
    - petal length in cm
    - petal width in cm
    - class:
            - Iris-Setosa
            - Iris-Versicolour
            - Iris-Virginica

:Summary Statistics:

                Min  Max   Mean    SD   Class Correlation
sepal length:   4.3  7.9   5.84   0.83    0.7826
sepal width:    2.0  4.4   3.05   0.43   -0.4194
petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

:Missing Attribute Values: None
:Class Distribution: 33.3% for each of 3 classes.
:Creator: R.A. Fisher
:Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
:Date: July, 1988

The famous Iris database, first used by Sir R.A. Fisher. The dataset is taken
from Fis

# One-Hot Encoding

In [98]:
# NOTE(review): of these imports, only pandas is used in the one-hot cells
# visible here; confirm whether later cells need the rest.
from sklearn.datasets import load_iris
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pandas as pd

# Reload the iris CSV so this section starts from the unmodified data.
df = pd.read_csv('../data/iris.csv')

In [99]:
# The usual features/target split would be:
# X = df.drop("species", axis='columns')
# y = df.species

# Here the whole frame is passed to get_dummies instead, so the string-valued
# "species" column is replaced by one indicator column per species.
X = pd.get_dummies(df)
X.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species_setosa,species_versicolor,species_virginica
0,5.1,3.5,1.4,0.2,True,False,False
1,4.9,3.0,1.4,0.2,True,False,False
2,4.7,3.2,1.3,0.2,True,False,False
3,4.6,3.1,1.5,0.2,True,False,False
4,5.0,3.6,1.4,0.2,True,False,False


### Mushroom Dataset

In [100]:
# Load the mushroom data; note that this rebinds `df`, which previously held the iris frame.
df = pd.read_csv('../data/mushrooms.csv')
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,poisonous,convex,smooth,brown,yes,pungent,free,close,narrow,black,...,smooth,white,white,partial,white,one,pendant,black,scattered,urban
1,edible,convex,smooth,yellow,yes,almond,free,close,broad,black,...,smooth,white,white,partial,white,one,pendant,brown,numerous,grasses
2,edible,bell,smooth,white,yes,anise,free,close,broad,brown,...,smooth,white,white,partial,white,one,pendant,brown,numerous,meadows
3,poisonous,convex,scaly,white,yes,pungent,free,close,narrow,brown,...,smooth,white,white,partial,white,one,pendant,black,scattered,urban
4,edible,convex,smooth,gray,no,none,free,crowded,broad,black,...,smooth,white,white,partial,white,one,evanescent,brown,abundant,grasses


In [101]:
# The target is the "type" column; bracket access avoids any clash between a
# column name and a DataFrame attribute.
y = df["type"]

# One-hot encode every remaining (feature) column.
X = pd.get_dummies(df.drop(columns="type"))
X.head()

Unnamed: 0,cap_shape_bell,cap_shape_conical,cap_shape_convex,cap_shape_flat,cap_shape_knobbed,cap_shape_sunken,cap_surface_fibrous,cap_surface_grooves,cap_surface_scaly,cap_surface_smooth,...,population_scattered,population_several,population_solitary,habitat_grasses,habitat_leaves,habitat_meadows,habitat_paths,habitat_urban,habitat_waste,habitat_woods
0,False,False,True,False,False,False,False,False,False,True,...,True,False,False,False,False,False,False,True,False,False
1,False,False,True,False,False,False,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,True,False,False,False,False
3,False,False,True,False,False,False,False,False,True,False,...,True,False,False,False,False,False,False,True,False,False
4,False,False,True,False,False,False,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False
