## 1: Upload Data

In [None]:
import pandas as pd
import os
import numpy as np

In [None]:
diamond = pd.read_csv('/kaggle/input/diamonds/diamonds.csv')
diamond.head()
# can drop first column (just index)

In [None]:
diamond.drop(labels = 'Unnamed: 0', axis = 1, inplace = True)

In [None]:
diamond.head()

In [None]:
diamond.isnull().sum()
# no null values

## 2: Data Description (from kaggle)

price price in US dollars (326--18,823)

carat weight of the diamond (0.2--5.01)

cut quality of the cut (Fair, Good, Very Good, Premium, Ideal)

color diamond colour, from J (worst) to D (best)

clarity a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))

x length in mm (0--10.74)

y width in mm (0--58.9)

z depth in mm (0--31.8)

depth total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)

table width of top of diamond relative to widest point (43--95)

In [None]:
# looks like I am going to make a regression model to find the price of a diamond

# numeric data:
#   price
#   carat
#   x-length
#   y-width
#   z-depth
#   depth
#   table width


# categorical data:
#   cut
#   color
#   clarity


## 3: Quick Look at Data

In [None]:
diamond.info()

# there are no null values in the data!!

In [None]:
diamond.describe()

#only for numeric

In [None]:
# lets look at the value counts for the categorical data attributes

diamond.cut.value_counts()/len(diamond.cut)

In [None]:
diamond.color.value_counts()/ len(diamond.cut)

In [None]:
diamond.clarity.value_counts()/len(diamond.cut)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
diamond.hist(bins = 50, figsize = (20,15))
plt.show()

In [None]:
# x and carat aren't that close to a normal distribution
# everything else isn't that bad, just needs to be scaled
# table might be a little tail heavy but just barely

# interesting that price isn't normally distributed, has a very long tail



## 4: Create Training and Test Sets

In [None]:
# don't know if I should use train_test_split or do stratified sampling
# have to see if dataset is large enough
len(diamond)

# I think this is large enough, plus I don't really know if there are any super important attributes

In [None]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(diamond, test_size = 0.2, random_state = 42)

In [None]:
train.isnull().sum()

In [None]:
test.isnull().sum()

## 5: Further Data Exploration / Visualization

In [None]:
# Pearson correlation is only defined for numeric columns. cut, color and
# clarity are still string-valued at this point, and pandas >= 2.0 raises on
# them instead of silently skipping them, so select the numeric columns first.
diamond_corr = diamond.select_dtypes(include='number').corr()
diamond_corr['price'].sort_values(ascending=False)

# can see that table and depth are pretty weakly correlated
# want to check for categorical data


In [None]:
diamond.plot(kind = 'scatter', x = 'carat', y='price')

In [None]:
# need to transform categorical data first in order to explore it

In [None]:
# let's see if any data combinations will give us anything valuable

diamond['table_depth'] = diamond.table / diamond.depth
diamond['volume'] = diamond.x * diamond.y * diamond.z
# The data description lists a minimum of 0 for x, y and z, so volume can be 0.
# Dividing by it would produce inf rather than a usable density, so treat a
# zero volume as missing instead.
diamond['density'] = diamond.carat / diamond.volume.replace(0, np.nan)


In [None]:
# Restrict to numeric columns: the frame still holds the string-valued
# cut/color/clarity columns, which pandas >= 2.0 refuses to correlate.
diamond_corr_new = diamond.select_dtypes(include='number').corr()
diamond_corr_new['price'].sort_values(ascending = False)

# volume seems to have a high correlation!


## 6: Data Cleaning / Processing

In [None]:
# start off by copying a clean training set

diamond = train.drop('price', axis = 1)
diamond_labels = train['price'].copy()

In [None]:
diamond.head()
diamond_labels.head()


## 7: Categorical Attributes

In [None]:
# the categorical data I have is very clearly ordinal
# cut, color, and clarity, all go from worst to best 

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
diamond_cat = diamond[['cut', 'color', 'clarity']]
diamond_cat.head(10)

In [None]:
# Category orderings, each listed from worst to best, given in the same
# column order as diamond_cat (cut, color, clarity).
cut_levels = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color_levels = ['J', 'I', 'H', 'G', 'F', 'E', 'D']
clarity_levels = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']

ordinal_encoder = OrdinalEncoder(
    categories = [cut_levels, color_levels, clarity_levels]
)
diamond_cat_encoded = ordinal_encoder.fit_transform(diamond_cat)


In [None]:
diamond_cat_encoded[:10]

# I encoded the ordinal categorical data!!

In [None]:
# I want to look at the encoded data visually and correlations

diamond_cat_df = pd.DataFrame(data=diamond_cat_encoded, columns = ['cut', 'color', 'clarity'])
diamond_cat_df.head()

In [None]:
# One-column price frame with a fresh 0..n-1 index (the shuffled training
# index is discarded rather than kept as an 'index' column).
diamond_labels_df = pd.DataFrame(
    data=diamond_labels, columns = ['price']
).reset_index(drop=True)
diamond_labels_df.head()

In [None]:
# Attach the price column to the encoded categorical columns, matching rows
# by index and keeping every row of the categorical frame.
diamond_cat_explore = pd.merge(
    diamond_cat_df,
    diamond_labels_df,
    how = 'left',
    left_index = True,
    right_index = True,
)
diamond_cat_explore.head()

In [None]:
diamond_cat_explore.corr()['price']
# interesting


## 8: Transformations

In [None]:
# make a transformer that adds the volume attribute

from sklearn.base import BaseEstimator, TransformerMixin

class VolumeAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends an ``x * y * z`` volume column to its input.

    ``hyper`` is stored on the instance but is not read by ``fit`` or
    ``transform``.
    """

    def __init__(self, hyper = None):
        self.hyper = hyper

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the volume appended as the last column.

        ``X`` must support lookup by the labels ``'x'``, ``'y'`` and ``'z'``.
        The result is assembled with ``np.c_``, so it is a NumPy array rather
        than a DataFrame.
        """
        dim_x, dim_y, dim_z = X['x'], X['y'], X['z']
        return np.c_[X, dim_x * dim_y * dim_z]
    

In [None]:
add_vol = VolumeAdder()
diamond_vol = add_vol.transform(diamond)
diamond_vol

## 9: Scaling / Transformation Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [None]:
diamond = train.drop('price', axis = 1)


In [None]:
# Split the feature frame: everything except the three categorical columns
# is numeric, and the categorical frame is whatever the numeric one left out.
categorical_cols = ['cut', 'color', 'clarity']
diamond_num = diamond.drop(columns = categorical_cols)
diamond_cat = diamond.drop(columns = list(diamond_num))

In [None]:
# start off by making a pipeline for the numerical data

num_pipeline = Pipeline([
    ('vol_adder', VolumeAdder()),
    ('scaler', StandardScaler())
])

In [None]:
num_labels = list(diamond_num)
num_labels.append('volume')
num_labels

In [None]:
diamond_num_prep = num_pipeline.fit_transform(diamond_num)
diamond_num_prep_df = pd.DataFrame(diamond_num_prep, columns = num_labels)
diamond_num_prep_df

In [None]:
# full pipeline

num_attribs = list(diamond_num)
cat_attribs = list(diamond_cat)

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', ordinal_encoder, cat_attribs)
])

In [None]:
full_labels = num_labels + ['cut', 'color', 'clarity']

In [None]:
diamond_prep = full_pipeline.fit_transform(diamond)
diamond_prep

In [None]:

diamond_prep_df = pd.DataFrame(diamond_prep, columns = full_labels )

In [None]:
diamond_prep_df

## 10: Model Selection / Training

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [None]:
from scipy import stats
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [None]:
# time to try out a bunch of different models and see what we get

In [None]:
# linear regression

In [None]:
lin_reg = LinearRegression()
lin_reg.fit(diamond_prep, diamond_labels)
lin_pred = lin_reg.predict(diamond_prep)
lin_rmse = np.sqrt(mean_squared_error(lin_pred, diamond_labels))
lin_rmse

In [None]:
# cross validation score

In [None]:
lin_scores = cross_val_score(lin_reg, diamond_prep, diamond_labels, scoring = 'neg_mean_squared_error', cv=5)
lin_rmse = np.sqrt(-lin_scores)
lin_rmse.mean()

In [None]:
# let's make a function that lets you put in the algorithm and spits out cv_scores, mean, std

In [None]:
def get_scores(algorithm, data_prep, data_labels, n):
    """Cross-validate ``algorithm`` and print per-fold RMSE, its mean and std.

    Parameters
    ----------
    algorithm : estimator handed to ``cross_val_score``.
    data_prep : feature data handed to ``cross_val_score``.
    data_labels : target values handed to ``cross_val_score``.
    n : value forwarded as the ``cv`` argument.

    The function only prints; it returns ``None``.
    """
    neg_mse = cross_val_score(algorithm, data_prep, data_labels,
                              scoring = 'neg_mean_squared_error', cv = n)
    # The scorer reports negated MSE, so flip the sign before taking the root.
    fold_rmse = np.sqrt(-neg_mse)
    summary = (
        ('Scores:', fold_rmse),
        ('Mean:', fold_rmse.mean()),
        ('Std:', fold_rmse.std()),
    )
    for label, value in summary:
        print(label, value)

In [None]:
get_scores(lin_reg, diamond_prep, diamond_labels, 5)

In [None]:
# logistic regression takes too long (and it is a classifier, so it is not suited to predicting a continuous price anyway)

In [None]:
#  Decision Tree

In [None]:
tree_reg = DecisionTreeRegressor().fit(diamond_prep, diamond_labels)
get_scores(tree_reg, diamond_prep, diamond_labels, 5)

In [None]:
# Random Forest

In [None]:
forest_reg = RandomForestRegressor(random_state=42).fit(diamond_prep, diamond_labels)
get_scores(forest_reg, diamond_prep, diamond_labels, 5)

In [None]:
# check training set vs validation sets for random forest (over or underfitting)

In [None]:
forest_rmse = np.sqrt(mean_squared_error(forest_reg.predict(diamond_prep),diamond_labels))
forest_rmse
# so forest is overfitting

In [None]:
# check rmse of decision tree
tree_rmse = np.sqrt(mean_squared_error(tree_reg.predict(diamond_prep),diamond_labels))
tree_rmse

# ??? really overfitting

In [None]:
# Support Vector Machine

svr_reg = SVR().fit(diamond_prep, diamond_labels)


In [None]:
svr_rmse = np.sqrt(mean_squared_error(svr_reg.predict(diamond_prep),diamond_labels))
svr_rmse
# that's a really poor score

## 11: Model Tuning

In [None]:
# let's tune the random forest again: SVR wasn't even close, the tree was badly overfitting, and linear is just bad

In [None]:
# randomized search

# look at current hyperparameters

forest_reg.get_params()

In [None]:
random_grid = {
    'n_estimators': stats.randint(low=1, high = 200),
    'max_features': stats.randint(low=1, high = 8),
    'bootstrap': [True, False]
}

In [None]:
forest_rand_search = RandomizedSearchCV(forest_reg, random_grid, n_iter = 5, cv=5, 
                                        scoring = 'neg_mean_squared_error', random_state = 42)
forest_rand_search.fit(diamond_prep, diamond_labels)

In [None]:
forest_rand_search.best_params_

In [None]:
# Print the RMSE next to the sampled hyperparameters of every search candidate.
cv_res = forest_rand_search.cv_results_
for candidate, neg_mse in zip(cv_res['params'], cv_res['mean_test_score']):
    print(np.sqrt(-neg_mse), candidate)

In [None]:
# now time to get feature importances

feature_importances = forest_rand_search.best_estimator_.feature_importances_
feature_importances

In [None]:
extra_attribs = ['volume','cut', 'color', 'clarity']
attribs = num_attribs + extra_attribs
sorted(zip(feature_importances,attribs), reverse=True)

In [None]:
# I'm going to try dropping the variables that are less than a tenth

diamond_prep_new = diamond_prep_df.drop(['depth','cut','table'], axis = 1, inplace = False)
diamond_prep_new

forest_reg_new = RandomForestRegressor(bootstrap = True, max_features = 5, n_estimators = 100, random_state = 42)
forest_reg_new.fit(diamond_prep_new, diamond_labels)

In [None]:
forest_new_rmse = np.sqrt(mean_squared_error(forest_reg_new.predict(diamond_prep_new),diamond_labels))
forest_new_rmse

In [None]:
get_scores(forest_reg_new, diamond_prep_new, diamond_labels, 5)

## 12: Prediction Time

In [None]:
final_model = forest_rand_search.best_estimator_

X_test = test.drop('price', axis = 1)
Y_test = test['price'].copy()

X_test_prep = full_pipeline.transform(X_test)


In [None]:
final_predictions = final_model.predict(X_test_prep)
final_rmse = np.sqrt(mean_squared_error(final_predictions,Y_test))
final_rmse

In [None]:
# see confidence interval

def rmse_confidence_interval(predictions, targets, confidence = 0.95):
    """Return the (low, high) t-interval for the RMSE of ``predictions``.

    The interval is computed around the mean of the squared errors and then
    square-rooted, so both bounds are expressed in the units of the target.

    Parameters
    ----------
    predictions : array-like of predicted values.
    targets : array-like of true values, same length as ``predictions``.
    confidence : confidence level of the interval (0.95 by default).
    """
    # Convert to plain arrays so the subtraction is positional, not index-aligned.
    squared_errors = (np.asarray(predictions) - np.asarray(targets)) ** 2
    return np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                                    loc = squared_errors.mean(),
                                    scale = stats.sem(squared_errors)))

squared_errors = (final_predictions - Y_test)**2
rmse_confidence_interval(final_predictions, Y_test)

# the interval now lives in a function so it can be reused next time

# the 95% confidence interval is about a $20 difference

In [None]:
plt.scatter(final_predictions, Y_test)

# here we can see the graph of the actual vs predicted