In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)


In [None]:
# Location of the diamonds CSV, relative to the notebook's working directory.
filepath = '../input/diamonds/diamonds.csv'
# The file's first column is used as the row index rather than as a data column.
diamond_data = pd.read_csv(filepath_or_buffer=filepath, index_col=0)

In [None]:
# Show the column labels of the loaded frame.
diamond_data.columns

In [None]:
# A notebook cell only renders its LAST expression, so the preview has to be
# displayed explicitly; otherwise its output is silently discarded.
display(diamond_data.head())

# Number of missing values in each column.
diamond_data.isnull().sum()

Next, we look at the qualitative (categorical) columns.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns whose dtype is 'object' are treated as the qualitative features.
dataTypes = diamond_data.dtypes.eq('object')
categories = dataTypes[dataTypes].index.tolist()

# Encode on a copy so the original frame keeps its text labels.
diamond_data_cat = diamond_data.copy()
label_encoder = LabelEncoder()

# The single encoder is re-fitted for each column in turn, so after the loop it
# only remembers the classes of the last column processed.
# NOTE(review): LabelEncoder numbers labels in sorted order; if these categories
# have a natural ranking, the integer codes may not follow it - confirm.
for category in categories:
    encoded_column = label_encoder.fit_transform(diamond_data_cat[category])
    diamond_data_cat[category] = encoded_column

diamond_data_cat.head()

In [None]:
# Pearson correlation between every pair of columns of the label-encoded frame.
data_correlation = diamond_data_cat.corr(method='pearson')

# Boolean mask covering the upper triangle (diagonal included), which only
# mirrors the lower triangle. The builtin `bool` is used because the `np.bool`
# alias was removed in NumPy 1.24 and raises AttributeError there.
mask = np.triu(np.ones_like(data_correlation, dtype=bool))

cmap = sns.diverging_palette(220, 10, as_cmap=True)
plt.figure(figsize=(30, 10))
sns.heatmap(
    data_correlation,
    mask=mask,  # cells where the mask is True are not drawn
    cmap=cmap,
    vmax=.3,
    center=0,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=True,
)

In [None]:
# Flatten the absolute correlation matrix into one Series keyed by column pair.
absolute_correlation = data_correlation.abs()
unstacked_correlation = absolute_correlation.unstack()

# Strongest relationships first.
sorted_correlation = unstacked_correlation.sort_values(ascending=False, kind='quicksort')

# Keep only the pairs whose first key is 'price'.
sorted_correlation['price']

In [None]:
# Wide canvas for the scatter of price against carat.
plt.figure(figsize=(20, 10))
# Scatter plot with a fitted linear regression line.
sns.regplot(x='carat', y='price', data=diamond_data)


In [None]:
# Wide canvas for the scatter of price against the 'x' column.
plt.figure(figsize=(20, 10))
# Scatter plot with a fitted linear regression line.
sns.regplot(x='x', y='price', data=diamond_data)

In [None]:
plt.figure(figsize=(20, 10))

# Natural log of both variables, taken before plotting.
# NOTE(review): np.log returns -inf for zeros; confirm 'x' and 'price' contain none.
log_x = np.log(diamond_data['x'])
log_price = np.log(diamond_data['price'])

# Scatter plot of log(price) against log(x) with a fitted linear regression line.
sns.regplot(x=log_x, y=log_price)

Coding the Model:

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# Feature and target column names; edit these two lists to try other inputs.
X_columns = ['carat', 'x', 'y', 'z']
y_columns = ['price']

# Features are read from the label-encoded frame, the target from the raw frame.
X = diamond_data_cat[X_columns]
y = diamond_data[y_columns]

# Hold out 33% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)

In [None]:
# Fit on the natural log of the training prices, so the model's outputs are
# log-prices and must be exponentiated before comparison with real prices.
model = LinearRegression()
model.fit(X_train, np.log(y_train))

predictions = model.predict(X_test)

# Show the held-out feature rows the predictions correspond to.
X_test

In [None]:
# Test features alongside the true price and the model's prediction.
# The model was fit on log(price), so np.exp maps predictions back to price units.
stats_df = X_test.copy().assign(
    price=y_test['price'].copy(),
    predictions=np.exp(predictions.copy()),
)

# Order by carat to make the fit easy to scan from small to large stones.
stats_df.sort_values(by='carat')

In [None]:
from sklearn.metrics import mean_squared_error
import math

# Finding the RMSE of the model.
# `predictions` are in log space because the model was fit on np.log(price).
# They must be exponentiated back to price units before being compared with
# the raw test prices; comparing log values to prices gives a meaningless error.
price_predictions = np.ravel(np.exp(predictions))
mse = mean_squared_error(y_test['price'], price_predictions)
rmse = math.sqrt(mse)
rmse

Fitting the model on the log of price and then exponentiating its predictions removes negative predicted values. However, it inflates the overall predictions, making the model less accurate in general. The code below fits on the raw price instead: it gives a lower RMSE, but some predictions in the table are negative.

In [None]:
# Baseline: the same features, fit directly on the untransformed price.
second_model = LinearRegression()
second_model.fit(X_train, y_train)
second_predictions = second_model.predict(X_test)

# RMSE of the baseline, in price units.
second_metric = math.sqrt(mean_squared_error(y_test['price'], second_predictions))
second_metric

In [None]:
# Test features alongside the true price and the raw-price model's prediction.
# These predictions are already in price units, so no inverse transform is
# applied (the former self-assignment of the 'predictions' column did nothing).
second_stats_df = pd.DataFrame(X_test.copy())
second_stats_df['price'] = y_test['price'].copy()
second_stats_df['predictions'] = second_predictions.copy()

# Order by carat to make the fit easy to scan from small to large stones.
second_stats_df.sort_values(by='carat')

Personal Skills Gained:
- As someone new to this, I familiarized myself with categorical feature labelling and feature selection. Using the correlation heatmap helped me find relevant features more effectively.

My Takeaways:
- Linear regression fits the data "ok", but it does not predict accurately at the extremes of each feature's range.

Adjustments that can be done:
- Remove rows whose carat value is less than 0.5.
