In [1]:
import pandas as pd
# Load the diamonds table from the relative input directory.
# NOTE(review): if the CSV carries a leading unnamed index column it will load as
# "Unnamed: 0", and every non-price column is later scored as a candidate
# feature -- TODO confirm whether index_col=0 is wanted here.
diamonds = pd.read_csv("../input/diamonds.csv")
# Leave the preview as the cell's last expression so Jupyter renders it as a table
# instead of the plain-text dump that print() produces.
diamonds.head()


## Replace cut, color, and clarity columns with numerical rankings.

In [2]:
# Ordinal rank for each cut label: 1 is the lowest grade, 5 the highest.
replace_dict_cut = {"Fair":1,"Good":2, "Very Good":3, "Premium":4, "Ideal":5}
# Map only while the column still holds the raw labels. Running .map() a second
# time on the numeric ranks would match no dictionary key and turn every value
# into NaN, so the guard keeps this cell safe to re-run.
if not pd.api.types.is_numeric_dtype(diamonds["cut"]):
    diamonds["cut"] = diamonds["cut"].map(replace_dict_cut)
# Last expression -> rich table output instead of print().
diamonds.head()

In [3]:
# Ordinal rank for each color letter: "J" maps to the lowest rank (1), "D" to the highest (7).
replace_dict_color = {"J":1, "I":2, "H":3, "G":4, "F":5, "E":6, "D":7}
# Map only while the column still holds the raw letters. Running .map() a second
# time on the numeric ranks would match no dictionary key and turn every value
# into NaN, so the guard keeps this cell safe to re-run.
if not pd.api.types.is_numeric_dtype(diamonds["color"]):
    diamonds["color"] = diamonds["color"].map(replace_dict_color)
# Last expression -> rich table output instead of print().
diamonds.head()

In [4]:
# Ordinal rank for each clarity grade: "I1" maps to the lowest rank (1), "IF" to the highest (8).
replace_dict_clarity = {"I1":1,"SI2":2, "SI1":3, "VS2":4, "VS1":5, "VVS2":6, "VVS1":7, "IF":8}
# Map only while the column still holds the raw grades. Running .map() a second
# time on the numeric ranks would match no dictionary key and turn every value
# into NaN, so the guard keeps this cell safe to re-run.
if not pd.api.types.is_numeric_dtype(diamonds["clarity"]):
    diamonds["clarity"] = diamonds["clarity"].map(replace_dict_clarity)
# Last expression -> rich table output instead of print().
diamonds.head()

## The cut, color, and clarity scales are now numeric rankings: 1 is the lowest grade, and each successively better grade increases the rank by 1.


In [5]:
# Columns to cast to float: the three rank columns plus price.
columns = ["cut", "color", "clarity", "price"]
# One vectorised cast replaces the per-column Python loop; assigning through a
# list of column labels replaces each column, so the new float dtype sticks.
diamonds[columns] = diamonds[columns].astype(float)
# Last expression -> rich table output instead of print().
diamonds.head()

## Now the cut, color, clarity, and price columns have all been converted to floats.

In [6]:
# Frequency of each distinct carat value. value_counts() already returns the
# counts in descending order, so the extra sort_values(ascending=False) was redundant.
diamonds["carat"].value_counts()

## It looks like most of the diamonds have a lower carat value. Let's take a look at price to explore the data.

In [8]:
# Frequency of each distinct price. value_counts() already returns the counts
# in descending order, so the extra sort_values(ascending=False) was redundant.
diamonds["price"].value_counts()

## Again it looks like there are more diamonds in the 500-900 dollar range than more expensive diamonds in our dataset.

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
diamonds.describe() 

## The mean price is very high, as is the standard deviation of the price column, suggesting that our dataset is heavily skewed by the few larger values for price. Between the 75th percentile and the max value, there is a large jump in value from 5,324 to 18,823 dollars. It seems that it would be a good idea to split this dataframe into separate tables based on price in order to examine it in more detail and find any strong correlations between features and price that hold true across both the lower range and the higher range of prices. (Oh, and by the way, X, Y, and Z represent the length, width, and depth, in that order, of the diamonds in millimeters (mm).)

In [11]:
# Order the diamonds from most to least expensive.
diamonds_price = diamonds.sort_values('price', ascending=False)
# Show the 100 highest-priced rows as the cell's rich output rather than
# printing a plain-text dump; head(100) selects the same rows as [0:100].
diamonds_price.head(100)


In [12]:
# Split the price-sorted frame into three tiers using the hard-coded dollar
# cut-offs 950 and 5324.
sorted_price = diamonds_price['price']

# Low tier: price at or below 950.
diamonds_low = diamonds_price.loc[sorted_price <= 950]
# Everything above 950; intermediate frame from which the medium tier is carved.
diamonds_m = diamonds_price.loc[sorted_price > 950]
# Medium tier: above 950 and strictly below 5324.
diamonds_medium = diamonds_m.loc[diamonds_m['price'] < 5324]
# High tier: price at or above 5324.
diamonds_high = diamonds_price.loc[sorted_price >= 5324]

# Preview the first rows of each tier, low to high.
for tier_frame in (diamonds_low, diamonds_medium, diamonds_high):
    print(tier_frame.head())

## Now I have separated the original dataframe into three separate tables based on the price of the diamonds. The diamonds_low dataframe represents the lowest 25% of the data (prices at or below the 25th percentile). The diamonds_medium dataframe represents the 25th to 75th percentiles, or the middle 50% of the data. The diamonds_high dataframe represents the diamonds in the top 25%, based on price.

In [13]:
# Summary statistics for the low-price tier.
diamonds_low.describe() 

In [14]:
# Summary statistics for the medium-price tier.
diamonds_medium.describe() 

In [15]:
# Summary statistics for the high-price tier.
diamonds_high.describe() 

## Separating the original dataframe in this way allows me to see some patterns in the data. First of all, the mean length, width, and depth of the diamonds steadily increase with price. The diamonds costing the least have a mean length and width of 4 mm. The medium-priced diamonds have a mean length and width of 5 mm. The highest-priced diamonds have a mean length and width of 7 mm. The mean value for carats also steadily increases with price. The most surprising result is that cut, color, and clarity were not good indicators of price. In fact, cut, color, and clarity were ranked better for cheaper diamonds.

In [16]:
# Count the missing values in each column of the low-price tier.
diamonds_low.isnull().sum()



In [17]:
# Count the missing values in each column of the medium-price tier.
diamonds_medium.isnull().sum() 

In [18]:
# Count the missing values in each column of the high-price tier.
diamonds_high.isnull().sum() 

In [19]:
# Min-max scale every feature column to [0, 1]; price is the prediction target
# and keeps its original dollar values.
price_col = diamonds['price']
features = diamonds.drop(columns='price')

feature_min = features.min()
# A constant column has max == min; dividing by that zero range would fill the
# column with NaN. Substituting 1 for a zero range maps such a column to 0.0.
feature_range = (features.max() - feature_min).replace(0, 1)

diamonds_scaled = (features - feature_min) / feature_range
diamonds_scaled['price'] = price_col
# Restore the original column order (price was appended last above).
diamonds = diamonds_scaled[diamonds.columns]
diamonds.head(40)

## I just normalized all the data in the original dataframe (except the price column) so that I can continue with k-nearest neighbors (KNN) regression on the four best features I have found to predict price: x, y, z (length, width, depth) and carat. We will find out which of these features predicts the price of a diamond the best.

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

def knn_train_test(train_col, target_col, df):
    """Fit a default KNeighborsRegressor on half of ``df`` and return the test RMSE.

    The rows of ``df`` are shuffled with a fixed seed, the first half is used
    for training and the remaining rows for testing.

    Parameters
    ----------
    train_col : label, or list / pandas.Index / numpy.ndarray of labels
        Feature column(s) to train on. A single label (the original calling
        convention) trains a one-feature model; a list-like of labels trains
        on all of them together.
    target_col : label
        Column holding the values to predict.
    df : pandas.DataFrame
        Frame containing both the feature column(s) and the target column.

    Returns
    -------
    float
        Root-mean-squared error of the predictions on the held-out half.

    Notes
    -----
    NumPy's global random state is re-seeded with 1 on every call, so the
    shuffle (and therefore the split) is the same for every call on the same frame.
    """
    # Normalise the feature argument to a list of labels. Only list, Index and
    # ndarray are unpacked; any other value (str, int, tuple label, ...) is
    # treated as a single column label, exactly as before.
    if isinstance(train_col, (list, pd.Index, np.ndarray)):
        feature_cols = list(train_col)
    else:
        feature_cols = [train_col]

    knn = KNeighborsRegressor()
    np.random.seed(1)

    # Shuffle the row labels, then reorder the frame to match.
    shuffled_index = np.random.permutation(df.index)
    rand_df = df.reindex(shuffled_index)

    # 50/50 split: first half trains, the rest tests.
    last_train_row = int(len(rand_df) / 2)
    train_df = rand_df.iloc[0:last_train_row]
    test_df = rand_df.iloc[last_train_row:]

    knn.fit(train_df[feature_cols], train_df[target_col])
    predicted_labels = knn.predict(test_df[feature_cols])

    mse = mean_squared_error(test_df[target_col], predicted_labels)
    rmse = np.sqrt(mse)
    return rmse

# Score every non-price column on its own as a single-feature predictor of price.
train_cols = diamonds.columns.drop('price')
rmse_results = {
    feature: knn_train_test(feature, 'price', diamonds)
    for feature in train_cols
}

# Rank the features from lowest RMSE to highest.
rmse_results_series = pd.Series(rmse_results)
rmse_results_series.sort_values()

## As predicted, the x, y, z, and carat columns were the best predictors of price. Specifically, the width (y) of a diamond was the best predictor of price across the entire original dataset.