In [None]:
import pandas as pd
import seaborn as sns

In [None]:
# Load the diamonds CSV, then discard every row that has a missing value.
data = pd.read_csv("../input/diamonds/diamonds.csv")
data = data.dropna()
data.head()

First, let's make this data fully numerical.

In [None]:
# Distinct labels of each categorical column. The position of a label in
# its list is later used as that label's numeric code.
cuts, colors, clarities = (
    list(data[column].unique()) for column in ("cut", "color", "clarity")
)

In [None]:
def markClass(Value="", Class=None):
    """Return the position of ``Value`` within ``Class``.

    Args:
        Value: The label to look up.
        Class: Sequence of known labels. ``None`` (the default) is treated
            as an empty sequence.

    Returns:
        The zero-based index of the first element equal to ``Value``, or
        ``None`` when no element matches.
    """
    # A ``None`` default avoids sharing one mutable list object across calls.
    if Class is None:
        return None
    for position, label in enumerate(Class):
        if label == Value:
            return position
    # Explicit "not found" result; callers receive None exactly as before.
    return None

In [None]:
# Empty float-typed frame with the same row index and columns as ``data``.
# The dtype is requested at construction time: calling ``astype`` without
# keeping its return value (as was done before) leaves the frame unchanged.
newData = pd.DataFrame(index=data.index, columns=data.columns, dtype="float")
newData.head()

In [None]:
# Fill ``newData`` column by column: categorical columns are replaced by the
# position of each label in its label list, every other column is copied
# as-is. Whole-column assignment avoids chained indexing
# (``newData[col][row] = ...``), which pandas does not guarantee to write
# through, and avoids a Python-level loop over every cell.
category_levels = {"cut": cuts, "color": colors, "clarity": clarities}
for column in data.columns:
    if column in category_levels:
        # label -> integer code (index of the label in its list)
        codes = {
            label: code for code, label in enumerate(category_levels[column])
        }
        newData[column] = data[column].map(codes)
    else:
        newData[column] = data[column]
newData.head()

In [None]:
# Rebind ``data`` to the encoded frame; from here on both names refer to
# the same DataFrame object.
data = newData

In [None]:
# Keep only the columns used below, in this order ("color" is not selected).
data = data.loc[
    :,
    ["carat", "cut", "clarity", "depth", "table", "price", "x", "y", "z"],
]
data.head()

OK, we now have fully numerical data. **Let's begin analyzing the data**.

Numerical data.

In [None]:
# Print the mean and standard deviation of "carat", then draw its histogram.
print("carat")
print(f"mean: {data['carat'].mean()}")
print(f"standart deviation: {data['carat'].std()}")
data["carat"].plot(kind="hist", bins=15)

In [None]:
# Print the mean and standard deviation of "depth", then draw its histogram.
print("depth")
print(f"mean: {data['depth'].mean()}")
print(f"standart deviation: {data['depth'].std()}")
data["depth"].plot(kind="hist", bins=25)

In [None]:
# Print the mean and standard deviation of "table", then draw its histogram.
print("table")
print(f"mean: {data['table'].mean()}")
print(f"standart deviation: {data['table'].std()}")
data["table"].plot(kind="hist", bins=25)

In [None]:
# Print the mean and standard deviation of "price", then draw its histogram.
print("price")
print(f"mean: {data['price'].mean()}")
print(f"standart deviation: {data['price'].std()}")
data["price"].plot(kind="hist", bins=20)

In [None]:
# Print the mean and standard deviation of "x", then draw its histogram.
print("x")
print(f"mean: {data['x'].mean()}")
print(f"standart deviation: {data['x'].std()}")
data["x"].plot(kind="hist", bins=20)

In [None]:
# Print the mean and standard deviation of "y", then draw its histogram.
print("y")
print("mean: "+str(data["y"].mean()))
print("standart deviation: "+str(data["y"].std()))
# Plot the "y" column itself; this cell previously plotted "carat" (a
# copy-paste slip), so the histogram did not match the printed statistics.
data["y"].plot.hist(bins=20)

Categorical data.

In [None]:
# Show how many rows carry each encoded "cut" value, as text and as a bar chart.
print("cut")
print(data["cut"].value_counts())
data["cut"].value_counts().plot(kind="bar")

In [None]:
# Show how many rows carry each encoded "clarity" value, as text and as a
# bar chart.
print("clarity")
print(data["clarity"].value_counts())
data["clarity"].value_counts().plot(kind="bar")

General analysis.

In [None]:
# Draw the pairwise correlation matrix of all current columns as a heatmap.
print("Correlation matrix")
sns.heatmap(data=data.corr())

As we can see, the parameters "x", "y" and "z" are strongly correlated with each other. So let's replace these parameters with a single parameter, "x+y+z".

In [None]:
# Collapse the three dimension columns into a single summed feature.
x_y_z = data["x"]+data["y"]+data["z"]

# Take an explicit copy of the selected columns so that adding a column
# below writes to an independent frame instead of triggering pandas'
# SettingWithCopyWarning on a frame derived by selection.
data = data[ ["carat", "cut", "clarity", "depth", "table", "price"] ].copy()
data["x+y+z"] = x_y_z
data.head()

In [None]:
# Redraw the correlation heatmap for the reduced column set.
print("Correlation matrix")
sns.heatmap(data=data.corr())

In [None]:
# ``Series.corr`` returns the Pearson correlation coefficient r by default,
# not the coefficient of determination R^2, so the label says "Pearson r".
print("Correlation (Pearson r): carat-x+y+z")
print( data["carat"].corr(data["x+y+z"]) )

As we can see, the parameters "carat" and "x+y+z" are highly correlated. To build a better model for predicting price, it would be good to drop one of these parameters. *To decide which parameter to drop, let's build two regression models (in the first model we will drop the "carat" parameter and in the second model we will drop the "x+y+z" parameter) and compare these models. In the end **we will select the better model**.*

In [None]:
# Two candidate feature sets for the same target ("price"):
# X1 leaves out "carat", X2 leaves out "x+y+z".
X1 = data.drop(columns=["carat", "price"])
X2 = data.drop(columns=["x+y+z", "price"])
y1 = data["price"]
y2 = data["price"]

In [None]:
from sklearn import model_selection
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression

In [None]:
# Expand each feature set with all polynomial terms up to degree 3, using a
# fresh transformer per feature set.
X1, X2 = (
    preprocessing.PolynomialFeatures(degree=3).fit_transform(features)
    for features in (X1, X2)
)

In [None]:
# Hold out 25% of the rows for testing. Both calls use the same fixed
# ``random_state``, so the two feature sets (which have the same number of
# rows) are split on the same rows and the run is reproducible. Without a
# shared seed each model would be scored on a different random test set,
# which makes the comparison between them unreliable.
X1_train, X1_test, y1_train, y1_test = model_selection.train_test_split(
    X1, y1, test_size=0.25, random_state=0
)
X2_train, X2_test, y2_train, y2_test = model_selection.train_test_split(
    X2, y2, test_size=0.25, random_state=0
)
# The unsplit arrays are no longer needed; drop the names to free memory.
del X1
del y1
del X2
del y2

In [None]:
# Fit one ordinary least-squares model per feature set.
predModel1, predModel2 = (
    LinearRegression().fit(features, target)
    for features, target in ((X1_train, y1_train), (X2_train, y2_train))
)

OK, we now have two regression models for predicting price. *The first model uses the "cut", "clarity", "depth", "table" and "x+y+z" parameters to predict price. The second model uses the "carat", "cut", "clarity", "depth" and "table" parameters to predict price*. **Let's compare the quality of these models**.

In [None]:
from sklearn import metrics

In [None]:
# Mean squared error of each linear model on its held-out test rows.
print(
    "MSE for first model: "
    f"{metrics.mean_squared_error(y1_test, predModel1.predict(X1_test))}"
)
print(
    "MSE for second model: "
    f"{metrics.mean_squared_error(y2_test, predModel2.predict(X2_test))}"
)

In [None]:
# R^2 score of each linear model on its held-out test rows.
print(
    "R2 for first model: "
    f"{metrics.r2_score(y1_test, predModel1.predict(X1_test))}"
)
print(
    "R2 for second model: "
    f"{metrics.r2_score(y2_test, predModel2.predict(X2_test))}"
)

As we can see, both metrics show that the second regression model is better at predicting price. However, judging by the MSE, this prediction model is still quite poor. So let's try building a neural network to predict price and compare its quality.

In [None]:
from sklearn.neural_network import MLPRegressor

In [None]:
# Small multi-layer perceptron (two hidden layers of 3 units, tanh
# activation) trained on the second feature set.
# NOTE(review): the inputs are passed in unscaled here; scikit-learn
# recommends scaling features for MLP models. Consider standardizing
# before fitting.
NW = MLPRegressor(hidden_layer_sizes=(3, 3), activation="tanh", max_iter=10000)
NW = NW.fit(X2_train, y2_train)

In [None]:
# Compare test-set mean squared error: linear model vs. neural network.
print(
    "MSE for regression model: "
    f"{metrics.mean_squared_error(y2_test, predModel2.predict(X2_test))}"
)
print(
    "MSE for neural network: "
    f"{metrics.mean_squared_error(y2_test, NW.predict(X2_test))}"
)

In [None]:
# Compare test-set R^2 score: linear model vs. neural network.
print(
    "R2 for regression model: "
    f"{metrics.r2_score(y2_test, predModel2.predict(X2_test))}"
)
print(
    "R2 for neural network: "
    f"{metrics.r2_score(y2_test, NW.predict(X2_test))}"
)

As we can see, the neural network, built using the same data, gives a worse result. So there is reason to look for other models to predict price, or to apply other transformations to the data, in order to get a better result.