In [None]:
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# read the diamonds CSV file (path relative to the working directory)
# and keep it in memory as a data frame
df = pd.read_csv("diamonds.csv")

# Data analysis and preprocessing

In [None]:
# report how many rows and columns were loaded
shape = df.shape
n_rows, n_columns = shape
print(f"The dataset has {n_rows} rows and {n_columns} columns.")

In [None]:
# preview the first rows (head() shows 5 rows by default)
df.head()

In [None]:
# data type of every column
df.dtypes

In [None]:
# summary statistics of the numerical columns (count, mean, std, min, quartiles, max)
df.describe()

In [None]:
# list the column names before renaming
df.columns

In [None]:
# rename columns to descriptive names
column_renames = {
    'depth': 'total_depth_percentage',
    'x': 'length',
    'y': 'width',
    'z': 'depth'
}
# Only rename once. After the first run 'z' has become 'depth'; running the
# rename again would map that new 'depth' onto 'total_depth_percentage' too,
# leaving two columns with the same name and no 'depth' column at all.
if 'total_depth_percentage' not in df.columns:
    df = df.rename(columns=column_renames)
df.columns

In [None]:
# flag every row that has an identical twin (all columns compared,
# keep=False marks all copies, not just the repeats)
is_duplicated = df.duplicated(keep=False)
df[is_duplicated]

In [None]:
# drop repeated rows, keeping the first occurrence of each
df = df.drop_duplicates(keep='first')

In [None]:
# a row counts as incomplete if any of its columns is NaN
has_missing_value = df.isna().any(axis='columns')
rows_with_missing_values = has_missing_value.sum()
print(rows_with_missing_values)

In [None]:
# (rows, columns) of the data frame at this point
df.shape

# Visualization

In [None]:
# pyplot is the supported plotting interface; matplotlib.pylab is a legacy
# module whose use matplotlib discourages
import matplotlib.pyplot as plt

# select stylesheet for matplotlib
plt.style.use("ggplot")
# apply tight_layout automatically to every figure created from here on
# (calling plt.tight_layout() here would only create and lay out an empty figure)
plt.rcParams["figure.autolayout"] = True

In [None]:
# price against weight: one point per diamond
df.plot(x='carat', y='price', kind='scatter')
plt.show()

In [None]:
# one 30-bin histogram per column on a large canvas
df.hist(figsize=(20, 15), bins=30)
plt.show()

# Linear Regression with one variable

Linear model that predicts the price of a diamond from its weight in carats.

In [None]:
# separate the data into the feature (carat) and the target (price);
# the double brackets keep both as 2-D arrays
X, y = df[['carat']].values, df[['price']].values

In [None]:
# hold out 25% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# create a linear model and fit it to the training data
linear_model = LinearRegression()
linear_model.fit(X=X_train, y=y_train)

In [None]:
# predict on the held-out rows and compare with the true prices (R2-score)
y_test_pred = linear_model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_test_pred)
print(f"R-squared (R2): {r2}")

# Prepare data for linear regression with multiple variables and polynomial regression 

In [None]:
# keep only the target and the three size measurements
df_filtered_for_size = df[[
    'price',
    'length',
    'width',
    'depth',
]]
df_filtered_for_size.head(n=5)

In [None]:
# features: every column except the price; target: the price column (kept 2-D)
X = df_filtered_for_size.drop(columns=['price']).values
y = df_filtered_for_size[['price']].values

In [None]:
# split the data into train and test data (25% held out)
# random_state is fixed so the split -- and therefore every score computed
# from it -- is the same on each run, matching the single-variable split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)

# Linear model with multiple independent variables

Predict price based on the length, width and depth.

In [None]:
# create a linear model and fit it on multiple independent variables
linear_model = LinearRegression()
linear_model.fit(X=X_train, y=y_train)

In [None]:
# score the multi-variable model on the held-out rows
y_test_pred = linear_model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_test_pred)
print(f"R-squared (R2): {r2}")

# Polynomial regression with length, width and depth

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# degree-2 polynomial feature expansion; include_bias=False leaves out the
# constant column
pf = PolynomialFeatures(degree=2, include_bias=False)

In [None]:
# fit the polynomial expansion on the training data only, then apply the
# already-fitted transformer to the test data (the test set must never be
# used for fitting a preprocessing step)
X_train_transformed = pf.fit_transform(X_train)
X_test_transformed = pf.transform(X_test)

In [None]:
# linear regression on the polynomial features = polynomial regression
model_polynomial = LinearRegression()
model_polynomial.fit(X=X_train_transformed, y=y_train)

In [None]:
# R2-score of the polynomial model on the transformed test data
r2 = model_polynomial.score(X=X_test_transformed, y=y_test)
print(f"R-squared (R2): {r2}")