# Introduction to Numpy

## Import Numpy

In [2]:
import numpy as np

## Creating Arrays

In [2]:
zero_array = np.zeros(10)
zero_array

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [3]:
np.ones(10)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [4]:
np.full(10, 2.5)

array([2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5])

In [6]:
?np.arange

Docstring:
arange([start,] stop[, step,], dtype=None, *, like=None)

Return evenly spaced values within a given interval.

Values are generated within the half-open interval ``[start, stop)``
(in other words, the interval including `start` but excluding `stop`).
For integer arguments the function is equivalent to the Python built-in
`range` function, but returns an ndarray rather than a list.

When using a non-integer step, such as 0.1, it is often better to use
`numpy.linspace`.

Parameters
----------
start : integer or real, optional
    Start of interval.  The interval includes this value.  The default
    start value is 0.
stop : integer or real
    End of interval.  The interval does not include this value, except
    in some cases where `step` is not an integer and floating point
    round-off affects the length of `out`.
step : integer or real, optional
    Spacing between values.  For any output `out`, this is the distance
    between two adjacent values, ``out[i+1] - out[i]``.  The default
    step size is 1.

In [12]:
np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [15]:
np.linspace(0, 1, 11) # linearly spaced array from start to stop, both endpoints included. The last argument is the number of values (the length of the array)

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])

In [10]:
# Build an array from a plain Python list.
# list(range(...)) produces the even numbers 0, 2, 4, 6, 8 directly,
# without an identity list comprehension.
l = list(range(0, 10, 2))
a = np.array(l)
a

array([0, 2, 4, 6, 8])

## Multidimensional Arrays

In [16]:
np.ones((10, 10))

array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])

In [19]:
# 50 evenly spaced values from 0 to 100, arranged as a 5 x 5 x 2 array.
m = np.linspace(0, 100, 50).reshape(5, 5, 2)

## Indexing and Slicing

In [11]:
a[0]

0

In [13]:
a[0:3:2]

array([0, 4])

In [24]:
# Slicing a multidimensional array:
# keep every row and column, but fix the last axis at index 1.
# m has shape (5, 5, 2), so the result is already a 5 x 5 array and
# needs no further reshape.
n = m[:, :, 1]
n

array([[  2.04081633,   6.12244898,  10.20408163,  14.28571429,
         18.36734694],
       [ 22.44897959,  26.53061224,  30.6122449 ,  34.69387755,
         38.7755102 ],
       [ 42.85714286,  46.93877551,  51.02040816,  55.10204082,
         59.18367347],
       [ 63.26530612,  67.34693878,  71.42857143,  75.51020408,
         79.59183673],
       [ 83.67346939,  87.75510204,  91.83673469,  95.91836735,
        100.        ]])

In [25]:
# Both Rows and Columns
n[3, 2]

71.42857142857143

In [26]:
# Only 1 column
n[:, 1]

array([ 6.12244898, 26.53061224, 46.93877551, 67.34693878, 87.75510204])

In [27]:
# only 1 row
n[1, :]

array([22.44897959, 26.53061224, 30.6122449 , 34.69387755, 38.7755102 ])

In [67]:
n[[0,2]] # only first and third row

array([[ 2.04081633,  6.12244898, 10.20408163, 14.28571429, 18.36734694],
       [42.85714286, 46.93877551, 51.02040816, 55.10204082, 59.18367347]])

In [70]:
n[[0,2], [4]] # only first and third row of the last column

array([18.36734694, 59.18367347])

## Randomly Generated Arrays

In [28]:
np.random.seed(12) # Numbers are pseudorandom. So same seed will produce same number

In [29]:
np.random.rand(5,2) # Uniformly distributed array of size 5,2 between 0 and 1

array([[0.15416284, 0.7400497 ],
       [0.26331502, 0.53373939],
       [0.01457496, 0.91874701],
       [0.90071485, 0.03342143],
       [0.95694934, 0.13720932]])

In [35]:
np.random.randn(5,2) # 5x2 array of samples drawn from the standard normal distribution (mean 0, standard deviation 1)

array([[-2.26090795,  1.31316866],
       [ 1.29948907,  0.48606528],
       [-0.16382624,  0.30209188],
       [ 1.07495638, -0.88582519],
       [ 0.11260228,  0.69414995]])

In [33]:
np.random.randint(low=0, high=100, size=(5,2)) # 5x2 array of random integers from the "discrete uniform" distribution on [low, high): low is included, high is excluded

array([[89, 59],
       [88, 79],
       [25, 97],
       [56,  4],
       [96, 68]])

## Element Wise Operations

In [40]:
x = (10 + 5) * (a ** 2)
x

array([  0,  60, 240, 540, 960])

In [43]:
y = 100 * np.random.rand(5,2)
y

array([[64.57394108, 23.32186657],
       [ 6.1134856 , 59.18434925],
       [55.19875155, 84.8015606 ],
       [76.15399143, 37.39910543],
       [54.33145987, 95.44911136]])

In [44]:
y = y[:, 0]
y

array([64.57394108,  6.1134856 , 55.19875155, 76.15399143, 54.33145987])

In [45]:
x + y

array([  64.57394108,   66.1134856 ,  295.19875155,  616.15399143,
       1014.33145987])

In [46]:
x * y

array([    0.        ,   366.80913627, 13247.70037088, 41123.15537011,
       52158.20147835])

## Comparison Operations

In [47]:
x >= 2

array([False,  True,  True,  True,  True])

In [48]:
x > y

array([False,  True,  True,  True,  True])

### Boolean Indexing

In [51]:
x[x < y]

array([0])

## Summarizing Operations

In [53]:
y.min()

6.113485604559465

In [54]:
y.max()

76.15399142613265

In [55]:
y.mean()

51.274325906712406

In [56]:
y.std()

23.91704915623274

# Matrix Operations

## Generating Matrices

In [58]:
a = np.arange(1, 7).reshape(2,3)
a

array([[1, 2, 3],
       [4, 5, 6]])

In [60]:
b = np.linspace(9, 15, 12).reshape(3, 4)
b

array([[ 9.        ,  9.54545455, 10.09090909, 10.63636364],
       [11.18181818, 11.72727273, 12.27272727, 12.81818182],
       [13.36363636, 13.90909091, 14.45454545, 15.        ]])

### Identity Matrix

In [75]:
np.eye(4,4)

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

### Diagonal And Triangular Matrix

In [81]:
np.diag(np.ones((4,4))) # Extracts the diagonal from a matrix

array([1., 1., 1., 1.])

In [85]:
np.triu(np.ones((3,3))) # Extracts the upper triangular from a matrix

array([[1., 1., 1.],
       [0., 1., 1.],
       [0., 0., 1.]])

In [84]:
np.tril(np.ones((3,3))) # Extracts the lower triangular from a matrix

array([[1., 0., 0.],
       [1., 1., 0.],
       [1., 1., 1.]])

## Matrix Multiplication

In [61]:
a.dot(b)

array([[ 71.45454545,  74.72727273,  78.        ,  81.27272727],
       [172.09090909, 180.27272727, 188.45454545, 196.63636364]])

In [62]:
a @ b

array([[ 71.45454545,  74.72727273,  78.        ,  81.27272727],
       [172.09090909, 180.27272727, 188.45454545, 196.63636364]])

## Matrix Inversion

### Create an invertible Matrix

In [99]:
# Create a random 3x3 integer matrix. With low=1 every entry is at least 1.
sq = np.random.randint(low=1, high=15, size=(3,3))
# Keep only the upper triangle. The determinant of a triangular matrix is the
# product of its diagonal entries, and those are all nonzero here, so the
# matrix is guaranteed to be invertible.
sq = np.triu(sq)
sq

array([[11,  8,  8],
       [ 0,  2,  7],
       [ 0,  0,  1]])

### Determinant

In [101]:
np.linalg.det(sq)

22.000000000000004

### Inverse

In [100]:
np.linalg.inv(sq)

array([[ 0.09090909, -0.36363636,  1.81818182],
       [ 0.        ,  0.5       , -3.5       ],
       [ 0.        ,  0.        ,  1.        ]])

# Introduction to Pandas

In [103]:
# Library for manipulating tabular data in Python
import pandas as pd

## Dataframes

In [106]:
# List of lists
# Each inner list represents a row, and each item in it is the value of one column.
# The column names are passed separately as another list.
data = [
    ['Nissan', 'Stanza', 1991, 138, 4, 'MANUAL', 'sedan', 2000],
    ['Hyundai', 'Sonata', 2017, None, 4, 'AUTOMATIC', 'Sedan', 27150],
    ['Lotus', 'Elise', 2010, 218, 4, 'MANUAL', 'convertible', 54990],
    ['GMC', 'Acadia',  2017, 194, 4, 'AUTOMATIC', '4dr SUV', 34450],
    ['Nissan', 'Frontier', 2017, 261, 6, 'MANUAL', 'Pickup', 32340],
]

columns = [
    'Make', 'Model', 'Year', 'Engine HP', 'Engine Cylinders',
    'Transmission Type', 'Vehicle_Style', 'MSRP'
]


In [105]:
df = pd.DataFrame(data=data, columns=columns)

In [107]:
# List of dicts

data = [
    {
        "Make": "Nissan",
        "Model": "Stanza",
        "Year": 1991,
        "Engine HP": 138.0,
        "Engine Cylinders": 4,
        "Transmission Type": "MANUAL",
        "Vehicle_Style": "sedan",
        "MSRP": 2000
    },
    {
        "Make": "Hyundai",
        "Model": "Sonata",
        "Year": 2017,
        "Engine HP": None,
        "Engine Cylinders": 4,
        "Transmission Type": "AUTOMATIC",
        "Vehicle_Style": "Sedan",
        "MSRP": 27150
    },
    {
        "Make": "Lotus",
        "Model": "Elise",
        "Year": 2010,
        "Engine HP": 218.0,
        "Engine Cylinders": 4,
        "Transmission Type": "MANUAL",
        "Vehicle_Style": "convertible",
        "MSRP": 54990
    },
    {
        "Make": "GMC",
        "Model": "Acadia",
        "Year": 2017,
        "Engine HP": 194.0,
        "Engine Cylinders": 4,
        "Transmission Type": "AUTOMATIC",
        "Vehicle_Style": "4dr SUV",
        "MSRP": 34450
    },
    {
        "Make": "Nissan",
        "Model": "Frontier",
        "Year": 2017,
        "Engine HP": 261.0,
        "Engine Cylinders": 6,
        "Transmission Type": "MANUAL",
        "Vehicle_Style": "Pickup",
        "MSRP": 32340
    }
]

In [108]:
df = pd.DataFrame(data)

In [112]:
# Peeking 
df.head(n=2)

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle_Style,MSRP
0,Nissan,Stanza,1991,138.0,4,MANUAL,sedan,2000
1,Hyundai,Sonata,2017,,4,AUTOMATIC,Sedan,27150


## Series

In [113]:
# Each column is a series
# Dot notation to access
df.Make

0     Nissan
1    Hyundai
2      Lotus
3        GMC
4     Nissan
Name: Make, dtype: object

In [114]:
# Bracket notation (needed for column names that contain spaces, such as "Engine HP")
df["Make"]

0     Nissan
1    Hyundai
2      Lotus
3        GMC
4     Nissan
Name: Make, dtype: object

In [117]:
# Subset of data frame
df[["Make", "Engine HP"]]

Unnamed: 0,Make,Engine HP
0,Nissan,138.0
1,Hyundai,
2,Lotus,218.0
3,GMC,194.0
4,Nissan,261.0


In [119]:
len(df)

5

In [120]:
# Adding a column: assign one value per row.
# A range covers the row positions 0 .. len(df) - 1 directly, so no
# identity list comprehension is needed.
df['id'] = range(len(df))

In [121]:
df

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle_Style,MSRP,id
0,Nissan,Stanza,1991,138.0,4,MANUAL,sedan,2000,0
1,Hyundai,Sonata,2017,,4,AUTOMATIC,Sedan,27150,1
2,Lotus,Elise,2010,218.0,4,MANUAL,convertible,54990,2
3,GMC,Acadia,2017,194.0,4,AUTOMATIC,4dr SUV,34450,3
4,Nissan,Frontier,2017,261.0,6,MANUAL,Pickup,32340,4


In [122]:
# Deleting a column: `del` is a statement, so it takes no call parentheses.
del df["id"]

## Index

In [123]:
# The index holds the labels (ids) of the rows
df.index

RangeIndex(start=0, stop=5, step=1)

In [124]:
# All columns have the same index

df.Make.index, df.Year.index

(RangeIndex(start=0, stop=5, step=1), RangeIndex(start=0, stop=5, step=1))

In [138]:
# Use loc to select rows by index label.
# Slicing syntax works too, but unlike regular Python slices the stop label
# is included: 0:4:2 returns the rows labelled 0, 2 and 4.
df.loc[0:4:2]

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle_Style,MSRP
0,Nissan,Stanza,1991,138.0,4,MANUAL,sedan,2000
2,Lotus,Elise,2010,218.0,4,MANUAL,convertible,54990
4,Nissan,Frontier,2017,261.0,6,MANUAL,Pickup,32340


In [130]:
# Indexes can be anything
df.index = ['a', 'b', 'c', 'd', 'e']

In [131]:
df.loc['a']

Make                 Nissan
Model                Stanza
Year                   1991
Engine HP             138.0
Engine Cylinders          4
Transmission Type    MANUAL
Vehicle_Style         sedan
MSRP                   2000
Name: a, dtype: object

In [132]:
# iloc selects rows by integer position, regardless of the index labels
df.iloc[0]

Make                 Nissan
Model                Stanza
Year                   1991
Engine HP             138.0
Engine Cylinders          4
Transmission Type    MANUAL
Vehicle_Style         sedan
MSRP                   2000
Name: a, dtype: object

In [135]:
df = df.reset_index(drop=True)

## Element Wise Operations

In [140]:
# Numerical Operations
df['Engine HP'] * 2

0    276.0
1      NaN
2    436.0
3    388.0
4    522.0
Name: Engine HP, dtype: float64

In [141]:
# Logical Operation
df['Year'] > 2015

0    False
1     True
2    False
3     True
4     True
Name: Year, dtype: bool

In [143]:
# Boolean Indexing with multiple conditions
df[
    (df['Year'] > 2015) & (df['Make'] == 'Nissan')
]

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle_Style,MSRP
4,Nissan,Frontier,2017,261.0,6,MANUAL,Pickup,32340


In [144]:
# String Operations
df['Vehicle_Style'].str.lower()

0          sedan
1          sedan
2    convertible
3        4dr suv
4         pickup
Name: Vehicle_Style, dtype: object

In [146]:
df['Vehicle_Style'].str.replace(" ", "_")

0          sedan
1          Sedan
2    convertible
3        4dr_SUV
4         Pickup
Name: Vehicle_Style, dtype: object

In [148]:
df['Vehicle_Style'] = df['Vehicle_Style'].str.lower().str.replace(" ", "_")
df['Vehicle_Style']

0          sedan
1          sedan
2    convertible
3        4dr_suv
4         pickup
Name: Vehicle_Style, dtype: object

## Summarizing Operations

In [149]:
df.MSRP.describe()

count        5.000000
mean     30186.000000
std      18985.044904
min       2000.000000
25%      27150.000000
50%      32340.000000
75%      34450.000000
max      54990.000000
Name: MSRP, dtype: float64

In [151]:
df.describe().round(2)

Unnamed: 0,Year,Engine HP,Engine Cylinders,MSRP
count,5.0,4.0,5.0,5.0
mean,2010.4,202.75,4.4,30186.0
std,11.26,51.3,0.89,18985.04
min,1991.0,138.0,4.0,2000.0
25%,2010.0,180.0,4.0,27150.0
50%,2017.0,206.0,4.0,32340.0
75%,2017.0,228.75,4.0,34450.0
max,2017.0,261.0,6.0,54990.0


In [152]:
df.Make.nunique()

4

In [153]:
df.nunique()

Make                 4
Model                5
Year                 3
Engine HP            4
Engine Cylinders     2
Transmission Type    2
Vehicle_Style        4
MSRP                 5
dtype: int64

In [154]:
df.isnull()

Unnamed: 0,Make,Model,Year,Engine HP,Engine Cylinders,Transmission Type,Vehicle_Style,MSRP
0,False,False,False,False,False,False,False,False
1,False,False,False,True,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False


In [155]:
df.isnull().sum()

Make                 0
Model                0
Year                 0
Engine HP            1
Engine Cylinders     0
Transmission Type    0
Vehicle_Style        0
MSRP                 0
dtype: int64

## Grouping

In [157]:
df.groupby("Transmission Type").MSRP.mean()

Transmission Type
AUTOMATIC    30800.000000
MANUAL       29776.666667
Name: MSRP, dtype: float64

## Converting to Other Formats

In [160]:
df.to_dict(orient='records')

[{'Make': 'Nissan',
  'Model': 'Stanza',
  'Year': 1991,
  'Engine HP': 138.0,
  'Engine Cylinders': 4,
  'Transmission Type': 'MANUAL',
  'Vehicle_Style': 'sedan',
  'MSRP': 2000},
 {'Make': 'Hyundai',
  'Model': 'Sonata',
  'Year': 2017,
  'Engine HP': nan,
  'Engine Cylinders': 4,
  'Transmission Type': 'AUTOMATIC',
  'Vehicle_Style': 'sedan',
  'MSRP': 27150},
 {'Make': 'Lotus',
  'Model': 'Elise',
  'Year': 2010,
  'Engine HP': 218.0,
  'Engine Cylinders': 4,
  'Transmission Type': 'MANUAL',
  'Vehicle_Style': 'convertible',
  'MSRP': 54990},
 {'Make': 'GMC',
  'Model': 'Acadia',
  'Year': 2017,
  'Engine HP': 194.0,
  'Engine Cylinders': 4,
  'Transmission Type': 'AUTOMATIC',
  'Vehicle_Style': '4dr_suv',
  'MSRP': 34450},
 {'Make': 'Nissan',
  'Model': 'Frontier',
  'Year': 2017,
  'Engine HP': 261.0,
  'Engine Cylinders': 6,
  'Transmission Type': 'MANUAL',
  'Vehicle_Style': 'pickup',
  'MSRP': 32340}]