# Module 4 - Machine Learning

Last class we saw some complex tables based operations. 

In [1]:
import sklearn
from sklearn.linear_model import LogisticRegression
#from sklearn.linear_model import KNNClassifier
import pandas as pd
import altair as alt
import numpy as np

In this dataset we will start with a classic ML dataset.

In [2]:
df = pd.read_csv("data/DataSet.csv")
df

Unnamed: 0,class,Split,feature1,feature2
0,0,Test,1,8918653
1,1,Test,2,8918653
2,0,Train,1,8550405
3,1,Train,1,8550405
4,2,Los Angeles,United States,3971883
...,...,...,...,...
92,90,Surrey,Canada,526004
93,91,Ciudad López Mateos,Mexico,523296
94,92,Tultitlán,Mexico,520557
95,93,Fresno,United States,520052


## Training versus Test.

In [3]:
df_train = df.loc[df["Split"] == "Train"]
chart = (alt.Chart(df_train)
    .mark_point()
    .encode(
        x = "feature1",
        y = "feature2",
        color=["class"]
    ))

In [4]:
df_test = df.loc[df["Split"] == "Test"]
chart = (alt.Chart(df_test)
    .mark_point()
    .encode(
        x = "feature1",
        y = "feature2",
        color = ["class"]
    ))

## Machine Learning

Declarative Machine Learning
*
* 

In [5]:
model = LogisticRegression()
model.fit(df_train[["feature1", "feature2"]],
          df_train["class"])

LogisticRegression()

Predict

In [6]:
df_test["predict"] = model.predict(df_test[["feature1", "feature2"]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["predict"] = model.predict(df_test[["feature1", "feature2"]])


In [7]:
chart = (alt.Chart(df_test)
    .mark_point()
    .encode(
        x = "feature1",
        y = "feature2",
        color = ["class", "predict"]
    ))
chart

SchemaValidationError: Invalid specification

        altair.vegalite.v4.schema.core.FacetedEncoding->0, validating 'type'

        [{'type': 'quantitative', 'field': 'class'}, {'type': 'quantitative', 'field': 'predict'}] is not of type 'object'
        

alt.Chart(...)

In [8]:
chart = (alt.Chart(df_test)
    .mark_point()
    .encode(
        x = "feature1",
        y = "feature2",
        color = ["class", "predict"]
    ))
chart

SchemaValidationError: Invalid specification

        altair.vegalite.v4.schema.core.FacetedEncoding->0, validating 'type'

        [{'type': 'quantitative', 'field': 'class'}, {'type': 'quantitative', 'field': 'predict'}] is not of type 'object'
        

alt.Chart(...)

Alternative data 

In [9]:
df = pd.read_csv("data/DataSet.csv")
df

Unnamed: 0,class,Split,feature1,feature2
0,0,Test,1,8918653
1,1,Test,2,8918653
2,0,Train,1,8550405
3,1,Train,1,8550405
4,2,Los Angeles,United States,3971883
...,...,...,...,...
92,90,Surrey,Canada,526004
93,91,Ciudad López Mateos,Mexico,523296
94,92,Tultitlán,Mexico,520557
95,93,Fresno,United States,520052


In [10]:
df_train = df.loc[df["Split"] == "Train"]

In [11]:
model = LogisticRegression()
model.fit(df_train[["feature1", "feature2"]],
          df_train["class"])

LogisticRegression()

In [12]:
df_test = df.loc[df["Split"] == "Test"]
df_test["predict"] = model.predict(df_test[["feature1", "feature2"]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["predict"] = model.predict(df_test[["feature1", "feature2"]])


Alternative Approach

model = KNNClassifier()
model.fit(df_train[["feature1", "feature2"]],
          df_train["class"])

In [13]:
df_test = df.loc[df["Split"] == "Test"]
df_test["predict"] = model.predict(df_test[["feature1", "feature2"]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test["predict"] = model.predict(df_test[["feature1", "feature2"]])


## Evaluation.

In [14]:
df_test["class"]
df_test["predict"]

0    0
1    0
Name: predict, dtype: int64

Real World example

Temperature classification. 

In [15]:
df = pd.read_csv("data/Temperatures.csv", index_col=0, parse_dates=[1])

In [16]:
check = ((df["Country"] == "United States") &
         (df["dt"].dt.year == 1950) &
         (df["dt"].dt.month == 7) )
df2 = df.loc[check]

In [17]:
out = df2.describe()
out

Unnamed: 0,AverageTemperature,AverageTemperatureUncertainty,Latitude,Longitude
count,35.0,35.0,35.0,35.0
mean,23.382429,0.260914,36.856,-98.331714
std,3.934708,0.084475,4.903182,16.483778
min,15.159,0.133,29.74,-123.46
25%,20.3955,0.1995,32.95,-113.69
50%,23.267,0.256,36.17,-96.7
75%,25.8975,0.2895,39.38,-85.265
max,31.334,0.505,47.42,-72.0


In [18]:
chart = alt.Chart(df2).mark_point().encode(
    y = "AverageTemperature",
    x = "Latitude",
    tooltip=["City", "Country"],
)
chart

In [19]:
model = sklearn.linear_model.LinearRegression()
model.fit(df2[["Latitude"]], df2["AverageTemperature"])

LinearRegression()

In [20]:
df_pred = pd.DataFrame({"Latitude": np.linspace(25, 50, 10)})
df_pred["AverageTemperature"] = model.predict(df_pred[["Latitude"]])
df_pred

Unnamed: 0,Latitude,AverageTemperature
0,25.0,31.091141
1,27.777778,29.285044
2,30.555556,27.478947
3,33.333333,25.672849
4,36.111111,23.866752
5,38.888889,22.060654
6,41.666667,20.254557
7,44.444444,18.448459
8,47.222222,16.642362
9,50.0,14.836265


In [21]:
chart2 = alt.Chart(df_pred).mark_line(color="red").encode(
    y = "AverageTemperature",
    x = "Latitude",
)
out = chart + chart2
out

In [22]:
model_bad = sklearn.linear_model.LinearRegression()
model_bad.fit(df2[["Longitude"]], df2["AverageTemperature"])

LinearRegression()

In [23]:
df_pred = pd.DataFrame({"Longitude": np.linspace(-150, -75, 10)})
df_pred["AverageTemperature"] = model_bad.predict(df_pred[["Longitude"]])
df_pred

Unnamed: 0,Longitude,AverageTemperature
0,-150.0,21.303444
1,-141.666667,21.638754
2,-133.333333,21.974063
3,-125.0,22.309373
4,-116.666667,22.644682
5,-108.333333,22.979992
6,-100.0,23.315302
7,-91.666667,23.650611
8,-83.333333,23.985921
9,-75.0,24.32123


In [24]:
chart = alt.Chart(df2).mark_point().encode(
    y = "AverageTemperature",
    x = "Longitude",
    tooltip=["City", "Country"],
)
chart2 = alt.Chart(df_pred).mark_line(color="red").encode(
    y = "AverageTemperature",
    x = "Longitude",
)
out = chart + chart2
out

In [25]:
from vega_datasets import data

In [26]:
us_cities_df = df.loc[df["Country"] == "United States"]

In [27]:
states = alt.topo_feature(data.us_10m.url, feature='states')
background = alt.Chart(states).mark_geoshape(
    fill='lightgray',
    stroke='white'
).properties(
    width=500,
    height=300
).project('albersUsa')
points = alt.Chart(df2).mark_point(size=100).encode(
    longitude='Longitude',
    latitude='Latitude',
    color="AverageTemperature",
    tooltip=['City','AverageTemperature']
)
chart = background + points
chart

In [28]:
matr = np.linspace((25, -150), (50, -75), 20)
Lat, Log = np.meshgrid(matr[:, 0], matr[:, 1])
df_pred = pd.DataFrame({"Latitude": Lat.flatten(), "Longitude": Log.flatten()})
df_pred["AverageTemperature"] = model.predict(df_pred[["Latitude"]])

In [29]:
points = alt.Chart(df_pred).mark_circle(size=10).encode(
    longitude='Longitude',
    latitude='Latitude',
    color=alt.Color("AverageTemperature", scale=alt.Scale(scheme="reds"))
)
chart = chart + points
chart

## Input Formats

In [30]:
model = sklearn.linear_model.LinearRegression()
model.fit(df2[["Latitude"]], df2["AverageTemperature"])

LinearRegression()