# Summary of DMM

This notebook contains a list of Python methods that were introduced in the DMM programming exercises.

### Import the libraries

In [None]:
import numpy as np  # import auxiliary library, typical idiom
import pandas as pd  # import the Pandas library, typical idiom

# next command ensures that plots appear inside the notebook
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns  # also improves the look of plots
sns.set()  # apply seaborn's styling first; the rcParams below are set afterwards
plt.rcParams.update({
    'figure.figsize': (10, 5),  # default hor./vert. size of plots, in inches
    'lines.markeredgewidth': 1,  # fixes an issue with seaborn box plots; needed after import seaborn
})

In [None]:
from sklearn.linear_model import LinearRegression  # for linear regression
from sklearn.cluster import KMeans  # for clustering
from sklearn.tree import DecisionTreeClassifier  # for decision tree mining
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import train_test_split

import sys
sys.path.append('../')
from treeviz import tree_print  # to print decision tree

## Data sets

### Example data set

In [None]:
# Small two-column example data set, written column by column
df_sk = pd.DataFrame({
    'a': [0, 1, 2, 3, 4, 5, 4, 5, 6, 7, 8, 9],
    'b': [1, 2, 1, 3, 2, 3, 7, 8, 7, 9, 8, 9],
})
df_sk.head()

### Auto MPG data set

In [None]:
df_mpg = pd.read_csv('../datasets/auto-mpg.csv')

# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
# Each part is copied explicitly, as a workaround for not knowing whether
# train_test_split returns copies or views.
df_mpg_train, df_mpg_test = (
    part.copy()
    for part in train_test_split(df_mpg, test_size=0.2, random_state=42)
)

df_mpg_train.head()

### Wheat seeds data set


In [None]:
# load the wheat seeds data set and preview its first rows
df_seeds = pd.read_csv('../datasets/seeds.csv')
df_seeds.head()

# <span class="section" id="linear_regression_simple">1.</span> Linear Regression


### Step 1. Select the data


In [None]:
# Selecting with a one-element list keeps each result a one-column DataFrame;
# .copy() makes it independent of df_sk.
X_reg = df_sk.loc[:, ['a']].copy()  # independent/explanatory variable
y_reg = df_sk.loc[:, ['b']].copy()  # dependent/response variable

### Step 2. Create the regression object


In [None]:
reg = LinearRegression()  # unfitted model object with default settings

### Step 3. Fit the model


In [None]:
# estimate the model parameters from the selected data; the result is stored on `reg`
reg.fit(X_reg, y_reg)

### Step 4.a. Extract model parameters


In [None]:
# fitted slope coefficient(s) and intercept of the regression line
reg.coef_, reg.intercept_

### Step 4.b. Visualize the model


In [None]:
# Scatter plot of the data with seaborn's own regression line in orange (ci=None: no confidence band).
# x and y are passed by keyword: seaborn deprecated positional data arguments in 0.11
# and rejects them with a TypeError since 0.12.
sns.regplot(x=X_reg['a'], y=y_reg['b'], line_kws={'color': 'orange'}, ci=None);

### Step 4.c. Assess the model's quality


In [None]:
# coefficient of determination (R^2), computed on the same data the model was fitted on
R2 = reg.score(X_reg, y_reg)
R2

### Step 5. Make predictions


In [None]:
# Store the model's prediction for every row next to the original data.
# NOTE: this adds a column to the shared df_sk; the later sections select ['a', 'b'] explicitly.
df_sk['predicted'] = reg.predict(X_reg)
df_sk

In [None]:
# Predict the response for one new value of 'a'.
# The column is named like the training column, so scikit-learn does not warn
# that the input lacks the feature names the model was fitted with.
reg.predict(pd.DataFrame({'a': [4.5]}))

In [None]:
# Same prediction as a plain number: the result is 2-D (one row, one target column),
# so [0, 0] extracts the single value.
# The input column is named 'a', matching the training data, to avoid scikit-learn's
# feature-name warning.
reg.predict(pd.DataFrame({'a': [4.5]}))[0, 0]

# <span class="section">2.</span> Decision Tree Mining


### Step 1. Select the data


In [None]:
X_dtc = df_sk[['a', 'b']].copy()  # both columns serve as features
y_dtc = X_dtc.index // 3  # create artificial target variable: rows 0-2 -> 0, 3-5 -> 1, 6-8 -> 2, 9-11 -> 3

In [None]:
# One scatter layer per class, all drawn on the same axes
class_colors = {0: "red", 1: "blue", 2: "orange", 3: "black"}

ax = None  # the first plot call creates the axes; later calls reuse them
for class_label, class_color in class_colors.items():
    ax = X_dtc[y_dtc == class_label].plot(kind='scatter', x='a', y='b', s=50, c=class_color, ax=ax)

ax.legend(list(class_colors))

### Step 2. Create the decision-tree classifier


In [None]:
# Tree limited to depth 3. random_state is fixed because scikit-learn randomly
# permutes the features at each split, so equally good splits could otherwise be
# chosen differently from one run to the next.
dtc = DecisionTreeClassifier(max_depth=3, random_state=42)

### Step 3. Fit the model


In [None]:
# grow the tree from the features and the artificial target; the result is stored on `dtc`
dtc.fit(X_dtc, y_dtc)

### Step 4.a. Extract model parameters


In [None]:
# size of the fitted tree: number of nodes and the depth it actually reached
dtc.tree_.node_count, dtc.tree_.max_depth

In [None]:
# one-row table with the importance of each feature, labelled with the feature's column name
feature_importances = pd.DataFrame([dtc.feature_importances_], columns=X_dtc.columns)
feature_importances

In [None]:
# the distinct class labels the classifier was fitted on
dtc.classes_

### Step 4.b. Visualize the model


In [None]:
# print the fitted tree with the tree_print helper imported from treeviz
tree_print(dtc, X_dtc)

### Step 4.c. Assess the model's quality


In [None]:
# accuracy on the training data itself (not an estimate of performance on unseen data)
dtc.score(X_dtc, y_dtc)

### Step 5. Make predictions


In [None]:
# New (a, b) points to classify. They are wrapped in a DataFrame with the same
# column names as the training data, so scikit-learn does not warn about missing
# feature names when predicting.
X_dtc_test = pd.DataFrame([(1, 4), (2, 4), (4, 6), (5, 4), (7, 6), (8, 8)],
                          columns=['a', 'b'])

In [None]:
# predicted class label for each test point
dtc.predict(X_dtc_test)

# <span class="section">3.</span> Accuracy and Confusion Matrix


In [None]:
# 20 labelled examples: ten of class 1, five of class 2, five of class 3
y_true = pd.Series([1] * 10 + [2] * 5 + [3] * 5, name='actual')
y_pred = pd.Series(
    [1] * 8 + [2] * 2      # actual 1: eight predicted as 1, two as 2
    + [1] * 2 + [2] * 3    # actual 2: two predicted as 1, three as 2
    + [1] * 4 + [3] * 1,   # actual 3: four predicted as 1, one as 3
    name='predicted',
)
y_true.count()

In [None]:
accuracy_score(y_true, y_pred)  # fraction of predictions that equal the actual label

In [None]:
# Rows are actual labels, columns are predicted labels, both in sorted label order:
# cm[i, j] == number of samples with the i-th label that were predicted as the j-th label
# (here the labels are 1, 2, 3, so cm[0, 1] counts the 1s predicted as 2).
cm = confusion_matrix(y_true, y_pred)
cm

In [None]:
# Label the matrix with the class values themselves: rows = actual, columns = predicted.
# The labels are derived from the data (sorted union of actual and predicted labels,
# which is the order confusion_matrix uses by default) rather than hard-coded,
# so the table stays correct if y_true or y_pred change.
values = sorted(set(y_true) | set(y_pred))
pd.DataFrame(cm, index=values, columns=["Predicted " + str(v) for v in values])

# <span class="section">4.</span> Clustering


### Step 1. Select the data


In [None]:
X_km = df_sk[['a', 'b']].copy()  # features to cluster; a copy, so columns added to X_km later do not touch df_sk

### Step 2. Create the cluster object


In [None]:
# Look for 4 clusters. k-means starts from randomly chosen centroids, so random_state
# is fixed to make the clusters (and their label numbers, which pick the plot colours)
# reproducible. n_init=10 pins the number of restarts, whose default changed in
# scikit-learn 1.4.
km = KMeans(n_clusters=4, n_init=10, random_state=42)

### Step 3. Fit the model


In [None]:
# compute the cluster centres from the selected data; the result is stored on `km`
km.fit(X_km)

### Step 4.a. Extract model parameters


In [None]:
# one row per cluster centre, with the same column names as the clustered data
centers = pd.DataFrame(km.cluster_centers_, columns=X_km.columns)
centers

In [None]:
# Attach to every row the index of the cluster it was assigned to.
# NOTE: X_km now has a third column; re-run Step 1 before fitting again,
# otherwise the cluster labels would be used as a feature.
X_km['cluster'] = km.labels_
X_km

### Step 4.b. Visualize the model


In [None]:
# One colour per cluster index, shared by the points and their centre
cluster_colors = ['green', 'orange', 'purple', 'blue']

ax = None  # created by the first plot call, reused by the following ones
for cluster_id, cluster_color in enumerate(cluster_colors):
    members = X_km[X_km['cluster'] == cluster_id]
    ax = members.plot(kind='scatter', x='a', y='b', s=50, c=cluster_color, ax=ax)

# cluster centres drawn as crosses in the colour of their cluster
centers.plot(kind='scatter', x='a', y='b', c=cluster_colors, s=50, marker='x', ax=ax)

### Step 4.c. Assess the model's quality


In [None]:
# inertia: sum of squared distances from each sample to its closest cluster centre (lower = tighter clusters)
inertia = km.inertia_
inertia

### Step 5. Make predictions


In [None]:
# New (a, b) points to assign to the nearest cluster centre. They are wrapped in a
# DataFrame with the column names the model was fitted on, so scikit-learn does not
# warn about missing feature names.
test_km = pd.DataFrame([(1, 4), (2, 4), (4, 6), (5, 4), (7, 6), (8, 8)],
                       columns=['a', 'b'])
km.predict(test_km)