<a href="https://colab.research.google.com/github/sensei-jirving/Online-DS-PT-01.24.22-cohort-notes/blob/main/Week_05/Lecture_02/CodeAlong/Solution_CodeAlong_Diamonds.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

In [None]:
# Load in the data
df = pd.read_csv('/content/diamonds - diamonds.csv')

#https://docs.google.com/spreadsheets/d/10jyAyqpbuZHrbT1g-j5g6bQTbNT8jBnfBx-QkExPM8Y/edit#gid=88799590

In [None]:
# Take a look at the first five rows of the dataframe
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326.0,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326.0,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327.0,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334.0,4.2,4.23,2.63
4,5,,Good,J,SI2,63.3,58.0,335.0,4.34,4.35,2.75


In [None]:
# Look at the info from the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  53940 non-null  int64  
 1   carat       53920 non-null  float64
 2   cut         53919 non-null  object 
 3   color       53932 non-null  object 
 4   clarity     53936 non-null  object 
 5   depth       53934 non-null  float64
 6   table       53935 non-null  float64
 7   price       53934 non-null  float64
 8   x           53935 non-null  float64
 9   y           53934 non-null  float64
 10  z           53934 non-null  float64
dtypes: float64(7), int64(1), object(3)
memory usage: 4.5+ MB


In [None]:
# Check to see if there are any duplicates
df.duplicated().sum()

0

In [None]:
# split X and y, you are predicting price
X = df.drop(columns=['price', 'Unnamed: 0'])
y = df['price']

# split training and test
# set random_state to 42 for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# instantiate the column selectors
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

In [None]:
#instantiate the standard scaler, OneHotEncoder and Imputers
scaler = StandardScaler()
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

mean_imputer = SimpleImputer(strategy='mean')
freq_imputer = SimpleImputer(strategy='most_frequent')

In [None]:
# Setup the pipelines for the numeric and categorical data

num_processor = make_pipeline(mean_imputer, scaler)

cat_processor = make_pipeline(freq_imputer, encoder)

In [None]:
# Setup the tuples to pair the processors with the make column selecotrs

num_tuple = (num_processor, num_selector)

cat_tuple = (cat_processor, cat_selector)

In [None]:
# Instantiate the make column transformer

col_transformer = make_column_transformer(num_tuple, cat_tuple, remainder='passthrough')

In [None]:
# Fit the column transformer on the X_train

X_train_processed = col_transformer.fit(X_train)

In [None]:
# Get out the column transformer steps

col_transformer.named_transformers_

{'pipeline-1': Pipeline(steps=[('simpleimputer', SimpleImputer()),
                 ('standardscaler', StandardScaler())]),
 'pipeline-2': Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')),
                 ('onehotencoder',
                  OneHotEncoder(handle_unknown='ignore', sparse=False))])}

In [None]:
# Reference the pipeline that has the one hot encoder

cat_pipe = col_transformer.named_transformers_['pipeline-2']

In [None]:
# Display the feature names out from the one hot encoder

cat_pipe.named_steps['onehotencoder'].get_feature_names_out(cat_selector(X_train))

array(['cut_Fair', 'cut_Good', 'cut_Ideal', 'cut_Premium',
       'cut_Very Good', 'color_D', 'color_E', 'color_F', 'color_G',
       'color_H', 'color_I', 'color_J', 'clarity_I1', 'clarity_IF',
       'clarity_SI1', 'clarity_SI2', 'clarity_VS1', 'clarity_VS2',
       'clarity_VVS1', 'clarity_VVS2'], dtype=object)

In [None]:
# Extract the feature names from one hot encoder
cat_feature_names = col_transformer.named_transformers_['pipeline-2']\
                            .named_steps['onehotencoder']\
                            .get_feature_names_out(cat_selector(X_train))
cat_feature_names

array(['cut_Fair', 'cut_Good', 'cut_Ideal', 'cut_Premium',
       'cut_Very Good', 'color_D', 'color_E', 'color_F', 'color_G',
       'color_H', 'color_I', 'color_J', 'clarity_I1', 'clarity_IF',
       'clarity_SI1', 'clarity_SI2', 'clarity_VS1', 'clarity_VS2',
       'clarity_VVS1', 'clarity_VVS2'], dtype=object)

In [None]:
## get final col names which are num features + cat features
final_cols = num_selector(X_train) + list(cat_feature_names)
final_cols

['carat',
 'depth',
 'table',
 'x',
 'y',
 'z',
 'cut_Fair',
 'cut_Good',
 'cut_Ideal',
 'cut_Premium',
 'cut_Very Good',
 'color_D',
 'color_E',
 'color_F',
 'color_G',
 'color_H',
 'color_I',
 'color_J',
 'clarity_I1',
 'clarity_IF',
 'clarity_SI1',
 'clarity_SI2',
 'clarity_VS1',
 'clarity_VS2',
 'clarity_VVS1',
 'clarity_VVS2']

In [None]:
# Transform the X_train and the X_test

X_train_transformed = col_transformer.transform(X_train)

X_test_transformed = col_transformer.transform(X_test)

In [None]:
# Change the X_train and X_test transformed columns to a dataframe

X_train_output = pd.DataFrame(X_train_transformed, columns = final_cols)

X_test_output = pd.DataFrame(X_test_transformed, columns = final_cols)

In [None]:
# View the first five rows of the X_train transformed dataframe

X_train_output.head()

Unnamed: 0,carat,depth,table,x,y,z,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,-1.156764,2.207835,0.242456,-1.590143,-1.544522,-1.365942,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.086861,0.038503,-0.654927,0.273488,0.291459,0.282087,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.529507,-0.451347,0.242456,0.737166,0.676141,0.63423,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.466271,-0.731261,-0.654927,0.710416,0.667398,0.577887,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,-0.397942,0.038503,-0.206236,-0.270443,-0.233107,-0.239085,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [None]:
# View the first five rows of the X_test transformed dataframe

X_test_output.head()

Unnamed: 0,carat,depth,table,x,y,z,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,-1.177842,0.248438,-0.654927,-1.57231,-1.518293,-1.506799,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.461177,-1.22111,-0.206236,-0.261526,-0.276821,-0.394027,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.840588,0.248438,-1.103619,-0.867875,-0.871329,-0.830684,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.777353,-0.661282,-0.206236,-0.725205,-0.740187,-0.788427,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.583426,0.388395,-1.103619,1.521853,1.428018,1.507544,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [None]:
# View the info from the X_train transformed dataframe

X_train_output.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 26 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   carat          40455 non-null  float64
 1   depth          40455 non-null  float64
 2   table          40455 non-null  float64
 3   x              40455 non-null  float64
 4   y              40455 non-null  float64
 5   z              40455 non-null  float64
 6   cut_Fair       40455 non-null  float64
 7   cut_Good       40455 non-null  float64
 8   cut_Ideal      40455 non-null  float64
 9   cut_Premium    40455 non-null  float64
 10  cut_Very Good  40455 non-null  float64
 11  color_D        40455 non-null  float64
 12  color_E        40455 non-null  float64
 13  color_F        40455 non-null  float64
 14  color_G        40455 non-null  float64
 15  color_H        40455 non-null  float64
 16  color_I        40455 non-null  float64
 17  color_J        40455 non-null  float64
 18  clarit

In [None]:
# View the info from the X_test transformed dataframe

X_test_output.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13485 entries, 0 to 13484
Data columns (total 26 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   carat          13485 non-null  float64
 1   depth          13485 non-null  float64
 2   table          13485 non-null  float64
 3   x              13485 non-null  float64
 4   y              13485 non-null  float64
 5   z              13485 non-null  float64
 6   cut_Fair       13485 non-null  float64
 7   cut_Good       13485 non-null  float64
 8   cut_Ideal      13485 non-null  float64
 9   cut_Premium    13485 non-null  float64
 10  cut_Very Good  13485 non-null  float64
 11  color_D        13485 non-null  float64
 12  color_E        13485 non-null  float64
 13  color_F        13485 non-null  float64
 14  color_G        13485 non-null  float64
 15  color_H        13485 non-null  float64
 16  color_I        13485 non-null  float64
 17  color_J        13485 non-null  float64
 18  clarit