# Machine learning steps:

### 1. Gather data
### 2. Preprocess data
### 3. Choose model
### 4. train model
### 5. test the model
### 6. Hypertuning
### 7. Test against real world data

# Pandas

In [8]:
import pandas as pd
import numpy as np

In [None]:
# numpy 
"""
2 - scalar
[1, 2] - vector
[[1, 2], [3, 4]] - matrix
"""
# pandas
"""
1d - series - homogenous array, size-immutable
2d - Dataframe - heterogenously typed columns, size-mutable
"""

#### Series - array like
#### Dataframe - tabular

## Series

In [11]:
# pandas.Series(data, index, dtype)

In [17]:
series = pd.Series([1, 2, 3, 4, 5])
print(series)

0    1
1    2
2    3
3    4
4    5
dtype: int64


In [19]:
series = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
print(series)

a    1
b    2
c    3
d    4
e    5
dtype: int64


## Creating series using numpy

In [26]:
series1 = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) ##randn will take -ve numbers as well, -1 to 1 numbers
print(series1)

a   -1.043814
b    0.937861
c   -1.119614
d   -0.398839
e    0.092144
dtype: float64


## Creating series using dictionary 

In [29]:
dictionary = {'a':1, 'b':2, 'c':3}
d1 = pd.Series(dictionary)
d1

a    1
b    2
c    3
dtype: int64

#### [] - series
#### [[]] - dataframe

In [34]:
# d1.iloc[[r, c]] - index based location fetching
# d1.loc[[r, c]] - row and column based location fetching

In [36]:
d1[[1, 2]]

  d1[[1, 2]]


b    2
c    3
dtype: int64

In [38]:
d1.iloc[[1, 2]]

b    2
c    3
dtype: int64

In [42]:
series + series1 #index type must match for addition operation else gives wrong result

a   -0.043814
b    2.937861
c    1.880386
d    3.601161
e    5.092144
dtype: float64

In [44]:
series2 = pd.Series(np.random.randn(5))
series2

0    0.111623
1    1.482713
2   -1.337679
3    0.005507
4   -0.170813
dtype: float64

In [46]:
series + series2

a   NaN
b   NaN
c   NaN
d   NaN
e   NaN
0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
dtype: float64

# Dataframe

In [49]:
df = pd.DataFrame()
print(df)

Empty DataFrame
Columns: []
Index: []


In [51]:
df = pd.DataFrame(columns=['c1', 'c2', 'c3'])
print(df)

Empty DataFrame
Columns: [c1, c2, c3]
Index: []


In [69]:
df = pd.DataFrame(columns=['c1', 'c2', 'c3', 'c4'], index=range(1, 6)) #index means row count
df

Unnamed: 0,c1,c2,c3,c4
1,,,,
2,,,,
3,,,,
4,,,,
5,,,,


In [71]:
dictionary = {'a':1, 'b':2, 'c':3}
df1 = pd.DataFrame([dictionary]) #keys will be taken as columns and values as their value at row number 1
df1

Unnamed: 0,a,b,c
0,1,2,3


In [73]:
print(df1.index)
print(df.index)

RangeIndex(start=0, stop=1, step=1)
RangeIndex(start=1, stop=6, step=1)


In [75]:
print(df1.columns)
print(df.columns)

Index(['a', 'b', 'c'], dtype='object')
Index(['c1', 'c2', 'c3', 'c4'], dtype='object')


In [77]:
mylist = [
    ['Apple', 'Red'],
    ['Banana', 'Yellow'],
    ['Orange', 'orange']
]
mydata = pd.DataFrame(mylist)
mydata

Unnamed: 0,0,1
0,Apple,Red
1,Banana,Yellow
2,Orange,orange


In [79]:
mydata = pd.DataFrame(mylist, columns=['Fruit name', 'color'])
mydata

Unnamed: 0,Fruit name,color
0,Apple,Red
1,Banana,Yellow
2,Orange,orange


### Dataframe using numpy array

In [82]:
mylist1 = np.array([
    [0, 1],
    [2, 3],
    [3, 4]
])
mydf = pd.DataFrame(mylist1, columns=['Even', 'Odd'])
mydf

Unnamed: 0,Even,Odd
0,0,1
1,2,3
2,3,4


### Load a csv data using pandas

In [87]:
df = pd.read_csv('./csv files/cereals.csv')
df

Unnamed: 0,name,calories,protein,vitamins,rating
0,100% Bran,70,4,25,68.402973
1,100% Natural Bran,120,3,0,33.983679
2,All-Bran,70,4,25,59.425505
3,All-Bran with Extra Fiber,50,4,25,93.704912
4,Almond Delight,110,2,25,34.384843
5,Apple Cinnamon Cheerios,110,2,25,29.509541
6,Apple Jacks,110,2,25,33.174094
7,Basic 4,130,3,25,37.038562
8,Bran Chex,90,2,25,49.120253
9,Bran Flakes,90,3,25,53.313813
