<a href="https://colab.research.google.com/github/stephaniediamond3/DATA71200/blob/main/sept30/data71200class4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix

!pip install -U scikit-learn==1.4



In [2]:
# Import the California Housing data used in the HOML book, Chapter 2

import os
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the gzipped tar archive to download.
    housing_path : str
        Directory that receives both ``housing.tgz`` and its extracted
        contents; it is created if it does not exist.

    Returns
    -------
    None
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # Interpreters with extraction filters: the "data" filter refuses
            # members that would escape housing_path and avoids the
            # DeprecationWarning raised when no filter is given.
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            # Older interpreters have no `filter` argument.
            housing_tgz.extractall(path=housing_path)

In [3]:
# Download and unpack the dataset using the default URL and target directory.
fetch_housing_data()

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` and return it as a DataFrame."""
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

  housing_tgz.extractall(path=housing_path)


In [4]:
# Load housing.csv from the default dataset directory into a DataFrame.
housing = load_housing_data()

In [5]:
# Preview the first rows to check the columns and their value types.
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [6]:
# Regression target: the median_house_value column.
y = housing.loc[:, 'median_house_value']
print(y)

0        452600.0
1        358500.0
2        352100.0
3        341300.0
4        342200.0
           ...   
20635     78100.0
20636     77100.0
20637     92300.0
20638     84700.0
20639     89400.0
Name: median_house_value, Length: 20640, dtype: float64


In [7]:
# Feature matrix: every column except the target, ocean_proximity and total_bedrooms.
X = housing.drop(columns=['median_house_value', 'ocean_proximity', 'total_bedrooms'])

print(X)

       longitude  latitude  housing_median_age  total_rooms  population  \
0        -122.23     37.88                41.0        880.0       322.0   
1        -122.22     37.86                21.0       7099.0      2401.0   
2        -122.24     37.85                52.0       1467.0       496.0   
3        -122.25     37.85                52.0       1274.0       558.0   
4        -122.25     37.85                52.0       1627.0       565.0   
...          ...       ...                 ...          ...         ...   
20635    -121.09     39.48                25.0       1665.0       845.0   
20636    -121.21     39.49                18.0        697.0       356.0   
20637    -121.22     39.43                17.0       2254.0      1007.0   
20638    -121.32     39.43                18.0       1860.0       741.0   
20639    -121.24     39.37                16.0       2785.0      1387.0   

       households  median_income  
0           126.0         8.3252  
1          1138.0         8.3

# Testing/training split

In [10]:
from sklearn.model_selection import train_test_split

# Split data and labels into a training and a test set.
# `stratify` takes an array of class labels to balance the split on (it is not a
# None/True switch); None means an ordinary shuffled split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    stratify=None,
)

# A balanced split (the same percentage of samples of each class in both sets)
# can be obtained with StratifiedShuffleSplit.
# Note, however, that the housing dataset is not a good candidate for this approach.
# (The example indexes with .iloc because the splitter yields positional indices.)
# from sklearn.model_selection import StratifiedShuffleSplit
# split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# for train_index, test_index in split.split(X, y):
#   X_train = X.iloc[train_index]
#   X_test = X.iloc[test_index]
#   y_train = y.iloc[train_index]
#   y_test = y.iloc[test_index]

In [11]:
# The stratified split does work well, however, for the iris dataset.

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.datasets import load_iris

# load iris dataset
iris = load_iris()

# n_splits=1 yields exactly one (train, test) pair of index arrays, so take it directly.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(iris.data, iris.target))

X_iris_train, X_iris_test = iris.data[train_index], iris.data[test_index]
y_iris_train, y_iris_test = iris.target[train_index], iris.target[test_index]

In [12]:
from sklearn.linear_model import LinearRegression

# Create the model, then fit it to the housing training set.
linreg = LinearRegression()
linreg.fit(X_train, y_train)

In [13]:
# Score the fitted model on the held-out test set and report it to two decimals.
linreg_test_score = linreg.score(X_test, y_test)
print("Test set score: {:.2f}".format(linreg_test_score))

Test set score: 0.62


# Dealing with missing values

In [14]:
from sklearn.impute import SimpleImputer

# Replace missing values (NaN) with the column mean learned from the training set only.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(X_train)

# Apply the training-set means to both the training and the testing data.
X_train_new = imp_mean.transform(X_train)
X_test_new = imp_mean.transform(X_test)

In [15]:
# instantiate a model and fit it to the imputed training set
# NOTE: this rebinds `linreg`; later cells that use `linreg` get this model.
linreg = LinearRegression().fit(X_train_new, y_train)

In [17]:
# Score the refitted model on the imputed test set and report it to two decimals.
linreg_imputed_test_score = linreg.score(X_test_new, y_test)
print("Test set score: {:.2f}".format(linreg_imputed_test_score))

Test set score: 0.62


# If you want to see which feature has the missing element  

In [18]:
# you can check each one individually with the following code
# (True means the column contains at least one missing value)
housing['longitude'].isna().values.any()

False

In [19]:
# True if any latitude value is missing
housing['latitude'].isna().values.any()

False

In [20]:
# True if any housing_median_age value is missing
housing['housing_median_age'].isna().values.any()

False

In [21]:
# True if any total_rooms value is missing
housing['total_rooms'].isna().values.any()

False

In [22]:
# True if any total_bedrooms value is missing
housing['total_bedrooms'].isna().values.any()

True

In [23]:
# Although this isn't necessary for running the imputer, it may be useful to know
# exactly where the data is missing.
# You can also count how many elements are empty:
housing['total_bedrooms'].isna().values.sum()

207

In [24]:
# Or you can check in general whether the dataframe has any empty elements
# (and thus whether you need to run the imputer).
housing.isna().values.any()

True

# Include Total Bedrooms with missing values filled in

In [25]:
# Feature matrix that keeps total_bedrooms: drop only the target and ocean_proximity.
X_bed = housing.drop(columns=['median_house_value', 'ocean_proximity'])

print(X_bed)

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0        -122.23     37.88                41.0        880.0           129.0   
1        -122.22     37.86                21.0       7099.0          1106.0   
2        -122.24     37.85                52.0       1467.0           190.0   
3        -122.25     37.85                52.0       1274.0           235.0   
4        -122.25     37.85                52.0       1627.0           280.0   
...          ...       ...                 ...          ...             ...   
20635    -121.09     39.48                25.0       1665.0           374.0   
20636    -121.21     39.49                18.0        697.0           150.0   
20637    -121.22     39.43                17.0       2254.0           485.0   
20638    -121.32     39.43                18.0       1860.0           409.0   
20639    -121.24     39.37                16.0       2785.0           616.0   

       population  households  median_income  
0   

In [26]:
# split data and labels into a training and a test set
X_bed_train, X_bed_test, y_bed_train, y_bed_test = train_test_split(X_bed, y, random_state=0, stratify=None)

# create a new instance of the imputer and learn the column means from the training set only
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(X_bed_train)

# apply to both the testing and training data
# NOTE: imp_mean, X_train_new and X_test_new are rebound here, replacing the
# values produced by the earlier imputation cell.
X_train_new = imp_mean.transform(X_bed_train)
X_test_new = imp_mean.transform(X_bed_test)

In [27]:
# run regression

# Create a second model and fit it to the imputed training set.
linreg2 = LinearRegression()
linreg2.fit(X_train_new, y_bed_train)

# Score it on the imputed test set and report the result to two decimals.
linreg2_test_score = linreg2.score(X_test_new, y_bed_test)
print("Test set score: {:.2f}".format(linreg2_test_score))

Test set score: 0.63


# Metrics

In [28]:
# import Iris dataset (loaded again here under its own name, iris_dataset,
# which the metrics cells below use)
from sklearn.datasets import load_iris
iris_dataset = load_iris()

In [29]:
# Split the iris features and labels into training and testing sets.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    iris_dataset.data,
    iris_dataset.target,
    random_state=0,
)

In [30]:
# import the KNN classifier and fit it to the training data
# (n_neighbors=1: each prediction uses only the single nearest training sample)
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X2_train, y2_train)

In [31]:
# Score the classifier on the iris test set and report it to two decimals.
knn_test_score = knn.score(X2_test, y2_test)
print("Test set score: {:.2f}".format(knn_test_score))

Test set score: 0.97


In [32]:
# generate the predicted labels for X2_test; the metric cells below compare
# them against the true labels y2_test
y2_pred = knn.predict(X2_test)

In [33]:
# Confusion matrix on the iris data: true labels against predicted labels.
from sklearn.metrics import confusion_matrix

confusion_matrix(y_true=y2_test, y_pred=y2_pred)

array([[13,  0,  0],
       [ 0, 15,  1],
       [ 0,  0,  9]])

In [34]:
# Class-wise precision on the iris data (average=None returns one score per class).
from sklearn.metrics import precision_score
precision_score(y_true=y2_test, y_pred=y2_pred, average=None)

array([1. , 1. , 0.9])

In [None]:
# Class-wise recall on the iris data (average=None returns one score per class).
from sklearn.metrics import recall_score
recall_score(y_true=y2_test, y_pred=y2_pred, average=None)

array([1.    , 0.9375, 1.    ])

In [None]:
# calculate overall and class-wise F1-score on iris data
from sklearn.metrics import f1_score

# Averaging modes, printed in this order:
#   'micro'    - calculate metrics globally by counting the total true positives,
#                false negatives and false positives.
#   'macro'    - calculate metrics for each label, and find their unweighted mean.
#                This does not take label imbalance into account.
#   'weighted' - calculate metrics for each label, and find their average weighted by
#                support (the number of true instances for each label). This alters
#                'macro' to account for label imbalance; it can result in an F-score
#                that is not between precision and recall.
#   None       - class-wise, no averaging.
for averaging in ('micro', 'macro', 'weighted', None):
    print(f1_score(y2_test, y2_pred, average=averaging))

0.9736842105263158
0.9717034521788341
0.9739522830846216
[1.         0.96774194 0.94736842]


In [None]:
# calculate AUC score on iris data; the target has three classes, so roc_auc_score is
# given the per-class probability estimates and multi_class='ovr' (one-vs-rest)
from sklearn.metrics import roc_auc_score
roc_auc_score(y2_test, knn.predict_proba(X2_test), multi_class='ovr')

0.9838362068965517

In [None]:
# Root mean squared error of the linear model on the housing test set.
from sklearn.metrics import root_mean_squared_error

# NOTE(review): `linreg` is whichever model was bound to that name last - confirm
# it is the one intended for this evaluation.
y_pred = linreg.predict(X_test)

root_mean_squared_error(y_true=y_test, y_pred=y_pred)



70639.47198102309

In [None]:
# Mean absolute error of the linear model on the housing test set.
from sklearn.metrics import mean_absolute_error

# Predictions are recomputed here so the cell does not depend on the RMSE cell.
y_pred = linreg.predict(X_test)

mean_absolute_error(y_true=y_test, y_pred=y_pred)



51604.687324832186