## MLP3

#### Load data and training-test split

In [5]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)


# to make this notebook's output stable across runs
np.random.seed(42)

In [18]:
import matplotlib.pyplot as plt
import pandas as pd
import os

# read data from CSV file to dataframe
df = pd.read_csv(r"C:\Users\marik\Documents\GitHub\CPSC4310-MLP\input\kc_sales_cleaned.csv")

# make sure you understand the type of the object
print(type(df))

# check the top five and the botoom five data tuples
print(df.head())
print(df.tail())

<class 'pandas.core.frame.DataFrame'>
   Unnamed: 0          id             date     price  bedrooms  bathrooms  \
0           0  7129300520  20141013T000000  221900.0         3       1.00   
1           1  6414100192  20141209T000000  538000.0         3       2.25   
2           2  5631500400  20150225T000000  180000.0         2       1.00   
3           3  2487200875  20141209T000000  604000.0         4       3.00   
4           4  1954400510  20150218T000000  510000.0         3       2.00   

   sqft_living  sqft_lot  floors  waterfront  ...  yr_built  yr_renovated  \
0         1180      5650     1.0           0  ...      1955             0   
1         2570      7242     2.0           0  ...      1951          1991   
2          770     10000     1.0           0  ...      1933             0   
3         1960      5000     1.0           0  ...      1965             0   
4         1680      8080     1.0           0  ...      1987             0   

   zipcode      lat     long  sqft_l

In [None]:
from sklearn.model_selection import KFold # import k-fold validation

kf = KFold(n_splits=3, random_state=None, shuffle=True) # Define the split - into 2 folds 

kf.get_n_splits(X) # returns the number of splitting iterations in the cross-validator

print(kf) 

In [None]:
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)

for train_index, test_index in kf.split(X):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    tree_clf.fit(X_train, y_train)
    
    y_pred = tree_clf.predict(X_test)
    
    # Print classification report
    target_names = iris.target_names
    print(classification_report(y_test, y_pred, target_names=target_names))

#### Model 1: Decision tree – information gain (entropy)

In [None]:
from sklearn.model_selection import train_test_split

# change this
X = df.data
y = df.target

# A simple training (1 training)
# # split the data 70% for training, 30% for test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 3, test_size = 0.20)

In [None]:
from sklearn.tree import DecisionTreeClassifier

tree_clf = DecisionTreeClassifier(max_depth=2, criterion='entropy')
tree_clf.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import KFold # import k-fold validation

kf = KFold(n_splits=10, random_state=None, shuffle=True) # Define the split - into 2 folds 

# each fold
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    tree_clf.fit(X_train, y_train)
    
    y_pred = tree_clf.predict(X_test)
    
    # Print classification report for each fold
    target_names = iris.target_names
    print(classification_report(y_test, y_pred, target_names=target_names))

#### Model 2: kNN classifier

In [19]:
# We use bedrooms, bathrooms, sqft_living, sqft_lot, and 'most_recent' attributes
X = df[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'most_recent']]
y = df['price_range']

# default is 75% / 25% train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [20]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors = 5)

knn.fit(X_train, y_train)

KNeighborsClassifier()

In [21]:
knn.score(X_test, y_test)

0.7672094744633605

In [22]:
y_pred = knn.predict(X_test)
y_pred

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [24]:
# plot a confusion matrix
confusion_mat = confusion_matrix(y_test, y_pred)

print(confusion_mat)

correct = confusion_mat[0, 0]+confusion_mat[1, 1]
print('accuracy: ', correct/confusion_mat.sum())
#plot_confusion_matrix(confusion_mat, 4)


# Print classification report
target_names = ['1', '0']

result_metrics = classification_report(y_test, y_pred, target_names=target_names)

print(result_metrics)

metrics_dict = classification_report(y_test, y_pred, 
                            target_names=target_names, output_dict=True)
    
print('precision (weighted):', metrics_dict['weighted avg']['precision'])
print('recall avg (weighted):', metrics_dict['weighted avg']['recall'])
print('accuracy: ', metrics_dict['accuracy'])

[[2914  565]
 [ 693 1232]]
accuracy:  0.7672094744633605
              precision    recall  f1-score   support

           1       0.81      0.84      0.82      3479
           0       0.69      0.64      0.66      1925

    accuracy                           0.77      5404
   macro avg       0.75      0.74      0.74      5404
weighted avg       0.76      0.77      0.77      5404

precision (weighted): 0.7643129773013678
recall avg (weighted): 0.7672094744633605
accuracy:  0.7672094744633605


In [9]:
from sklearn.model_selection import KFold # import k-fold validation

# create k-fold validation
kf = KFold(n_splits=3, random_state=None, shuffle=True) # Define the split - into 3 folds

In [10]:
# change DataFrame to numpy array
XX = X.to_numpy()

knn = KNeighborsClassifier(n_neighbors = 3)

precision_sum = recall_sum = accuracy_sum = 0

for train_index, test_index in kf.split(XX):
    X_train, X_test = XX[train_index], XX[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    
    result_metrics_dict = classification_report(y_test, y_pred, output_dict=True)
    
    precision_sum += result_metrics_dict["weighted avg"]["precision"]
    recall_sum += result_metrics_dict["weighted avg"]["recall"]
    accuracy_sum += result_metrics_dict["accuracy"]

print("Avg precision (weighted):", precision_sum/kf.get_n_splits(X))
print("Avg recall (weighted):", recall_sum/kf.get_n_splits(X))
print("Accuracy:", accuracy_sum/kf.get_n_splits(X))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Avg precision (weighted): 0.003537318936923646
Avg recall (weighted): 0.0036089711899588515
Accuracy: 0.0036089711899588515


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Model 3: Naive Bayes classifier or logistic regression??? idk

#### Model evaluation