# DSCI 503 - Homework 08
### Matt Snyder

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## Problem 1: Diamonds Dataset

In [2]:
# Read the tab-delimited file diamonds.txt and, in the same chain, append two
# derived columns: ln_carat and ln_price, the natural logarithms of the carat
# and price columns.
diamonds = pd.read_table('diamonds.txt').assign(
    ln_carat=lambda frame: np.log(frame['carat']),
    ln_price=lambda frame: np.log(frame['price']),
)

# Preview the first five rows, including the two new log columns.
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,ln_carat,ln_price
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,-1.469676,5.786897
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,-1.560648,5.786897
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,-1.469676,5.78996
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63,-1.237874,5.811141
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,-1.171183,5.814131


In [8]:
# Numerical feature: selecting with a one-element list keeps the result 2D,
# so X1_num has a single column rather than being a flat vector.
X1_num = diamonds[['ln_carat']].to_numpy()

# Categorical features: cut, color and clarity, one column each.
X1_cat = diamonds[['cut', 'color', 'clarity']].to_numpy()

# Label: selecting ln_price by bare name gives a 1D array.
y1 = diamonds['ln_price'].to_numpy()

# Report the shape of each of the three arrays.
print(f'Numerical Feature Array Shape:   {X1_num.shape}')
print(f'Categorical Feature Array Shape: {X1_cat.shape}')
print(f'Label Array Shape:               {y1.shape}')

Numerical Feature Array Shape:   (53940, 1)
Categorical Feature Array Shape: (53940, 3)
Label Array Shape:               (53940,)


In [9]:
# Build a one-hot encoder that returns a dense NumPy array instead of a sparse
# matrix. `sparse_output` is the current name of the option that older
# scikit-learn releases called `sparse`.
encoder = OneHotEncoder(sparse_output=False)

# Fit the encoder to the categorical features and encode them in a single step;
# the fitted encoder stays available in `encoder`.
X1_enc = encoder.fit_transform(X1_cat)

# Report the shape of the encoded array.
print(f'Encoded Feature Array Shape: {X1_enc.shape}')

Encoded Feature Array Shape: (53940, 20)


In [11]:
# Join the numerical column and the one-hot encoded columns side by side.
# X1_num is listed first, so the numerical feature becomes column 0 of X1.
X1 = np.concatenate((X1_num, X1_enc), axis=1)

# Report the shape of the combined feature array.
print(f'Feature Array Shape: {X1.shape}')

Feature Array Shape: (53940, 21)


In [12]:
# First split: 80% of the rows go to the training set, 20% to a holdout set.
# random_state=1 makes the shuffle reproducible.
X1_train, X1_hold, y1_train, y1_hold = train_test_split(
    X1, y1,
    test_size=0.2,
    random_state=1,
)

# Second split: divide the holdout set 50/50 into validation and test sets,
# again with random_state=1.
X1_valid, X1_test, y1_valid, y1_test = train_test_split(
    X1_hold, y1_hold,
    test_size=0.5,
    random_state=1,
)

# Report the shapes of the three feature arrays.
print(f'Training Features Shape:   {X1_train.shape}')
print(f'Validation Features Shape: {X1_valid.shape}')
print(f'Test Features Shape:       {X1_test.shape}')

Training Features Shape:   (43152, 21)
Validation Features Shape: (5394, 21)
Test Features Shape:       (5394, 21)


### Linear Regression Model with One Feature

In [19]:
# One-feature model: linear regression of the label on the numerical column only.
dia_mod_1 = LinearRegression()

# Column 0 of the feature arrays is the numerical feature (it was stacked first).
# Slicing with [:, :1] keeps each selection 2D, with a single column.
X_first_column = X1_train[:, :1]
X_valid_first_column = X1_valid[:, :1]

# Fit on the training rows, using only that first column.
dia_mod_1.fit(X_first_column, y1_train)

# r-squared on the training and validation sets; score() must be given the same
# single-column layout the model was fitted on.
train_r2 = dia_mod_1.score(X_first_column, y1_train)
valid_r2 = dia_mod_1.score(X_valid_first_column, y1_valid)

# Print both scores rounded to 4 decimal places.
print(f'Training r-Squared:   {round(train_r2, 4)}')
print(f'Validation r-Squared: {round(valid_r2, 4)}')

Training r-Squared:   0.933
Validation r-Squared: 0.9348


### Linear Regression Model with Several Features

In [20]:
# Multi-feature model: create the linear regression and fit it on every column
# of X1_train in one statement (fit() returns the fitted estimator).
dia_mod_2 = LinearRegression().fit(X1_train, y1_train)

# r-squared on the training and validation sets.
train_r2 = dia_mod_2.score(X1_train, y1_train)
valid_r2 = dia_mod_2.score(X1_valid, y1_valid)

# Print both scores rounded to 4 decimal places.
print(f'Training r-Squared:   {round(train_r2, 4)}')
print(f'Validation r-Squared: {round(valid_r2, 4)}')

Training r-Squared:   0.9825
Validation r-Squared: 0.9834


In [22]:
# Score dia_mod_2 on the held-out test set. The result is stored under its own
# name, test_r2, so it does not overwrite the validation score in valid_r2.
test_r2 = dia_mod_2.score(X1_test, y1_test)

# Print the test score rounded to 4 decimal places.
print (f'Testing r-Squared: {round(test_r2, 4)}')

Testing r-Squared: 0.9825
