In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree lazily and print the full path of every file found.
input_paths = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing relevant libraries

In [None]:
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
sns.set()
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Loading Data

In [None]:
# Location of the diamonds dataset inside the Kaggle input directory.
DIAMONDS_CSV = '/kaggle/input/diamonds/diamonds.csv'

# Load the raw table and preview its first rows.
raw_data = pd.read_csv(DIAMONDS_CSV)
raw_data.head(5)

In [None]:
# Print column names, dtypes and non-null counts of the raw table.
raw_data.info()

### Dropping additional index column

In [None]:
# Remove the 'Unnamed: 0' column, which only duplicates the row index.
# errors='ignore' keeps the cell re-runnable: raw_data is rebound here, so on a
# second execution the column is already gone and a plain drop would raise KeyError.
raw_data = raw_data.drop(columns=['Unnamed: 0'], errors='ignore')
raw_data.head()

# EDA and Preprocessing

### Exploring descriptive statistics of the data

In [None]:
# Summary statistics for every column; include='all' also covers the non-numeric columns.
raw_data.describe(include='all')

**Inferences based on descriptive statistics**
* The count is the same for all variables, hence there are no null values.
* However, the min values for 'x', 'y' and 'z' are 0, which is physically impossible.
* The mean of 'carat' is 0.79 and 75% of values are under 1.04, while the max is 5.01. This implies that there might be some outliers.
* Similarly, the max values for 'x', 'y' and 'z' are much higher than their respective 75% quantile values, which affects the mean and implies that we might have outliers.

### Dealing with 0 in 'x', 'y' and 'z'

In [None]:
# Number of rows whose 'x', 'y' or 'z' value is exactly 0 in the raw data.
zero_counts = {dim: (raw_data[dim] == 0).sum() for dim in ('x', 'y', 'z')}
print('Count of x=0:', zero_counts['x'], '\nCount of y=0:', zero_counts['y'],
      '\nCount of z=0:', zero_counts['z'])

The total number of rows with a value of 0 for 'x', 'y' or 'z' is very low compared to the number of observations in the original dataset. Hence, we drop the observations having a value of 0.

In [None]:
# Flag every row in which at least one of the dimensions 'x', 'y', 'z' is exactly 0 ...
has_zero_dimension = (raw_data[['x', 'y', 'z']] == 0).any(axis=1)
# ... and keep the remaining rows as an independent frame (original index labels are preserved).
data_no_null = raw_data.loc[~has_zero_dimension].copy()

In [None]:
# Re-count the zero-valued dimensions after the drop.
remaining_zeros = {dim: (data_no_null[dim] == 0).sum() for dim in ('x', 'y', 'z')}
print('Count of x=0:', remaining_zeros['x'], '\nCount of y=0:', remaining_zeros['y'],
      '\nCount of z=0:', remaining_zeros['z'])

In [None]:
# Show the smallest remaining value of each dimension after the zero rows were dropped.
# The labels previously read "Min of x=0:" (copied from the count cell), which
# mislabelled the printed minimum. Series.min() replaces the builtin min(): it is
# vectorised and skips NaN instead of depending on element order.
print('Min of x:', data_no_null['x'].min(), '\nMin of y:', data_no_null['y'].min(),
      '\nMin of z:', data_no_null['z'].min())

### Exploring PDFs of all the numerical variables

### Checking and Relaxing OLS assumptions for Linear Regression

**1. Linearity**

In [None]:
# Scatter every remaining column against price; the three categorical columns
# and the target itself are excluded from the x axis.
excluded_columns = ['clarity', 'cut', 'color', 'price']
numeric_features = data_no_null.drop(columns=excluded_columns).columns
sns.pairplot(data_no_null, x_vars=numeric_features, y_vars=['price'], kind='scatter')

The relation between carat-->price and x-->price is exponential. Hence, we'll perform logarithmic transformation on price to obtain linearity.

In [None]:
# Natural log of price, stored as an extra column next to the original price.
log_price = np.log(data_no_null['price'])
data_no_null['log_price'] = log_price

# Compare each feature against both the raw and the log-transformed target.
excluded_columns = ['clarity', 'cut', 'color', 'price', 'log_price']
numeric_features = data_no_null.drop(columns=excluded_columns).columns
sns.pairplot(data_no_null,
             x_vars=numeric_features,
             y_vars=['price', 'log_price'],
             kind='scatter')

In [None]:
# From here on only the log-transformed target is kept; the raw price column is removed.
data_linear = data_no_null.drop(columns='price')

**2. No Endogeneity**

No endogeneity means that the error term must not be correlated with the independent variables. We should take care of Omitted Variable Bias (OVB), as any omitted variable ends up in the error term, resulting in correlation between the error term and the independent variables.

**3. Normality and Homoscedasticity**

* Normality is assumed for big samples following Central Limit Theorem
* Zero mean of distribution of error is accomplished by the intercept
* We can verify normality of the error term and homoscedasticity later, while predicting the results

**4. No auto-correlation**

The observations are not coming from a Time Series or panel data, hence there is no auto-correlation.
Each observation represents a different diamond and hence they are not correlated.

**5. No Multi-Collinearity**

### Creating Dummy Variables

In [None]:
# One-hot encode the non-numeric columns; drop_first=True omits the first level
# of each encoded column.
data_with_dummies = pd.get_dummies(data_linear, drop_first=True)
data_with_dummies.head()

In [None]:
# Rows were dropped earlier, so the index has gaps; replace it with a fresh
# 0..n-1 index and discard the old labels.
data_preprocessed = data_with_dummies.reset_index(drop=True)

# Linear Regression

### Declaring dependent and independent variables

In [None]:
# Target: log-transformed price. Features: every other preprocessed column.
target_column = 'log_price'
y = data_preprocessed[target_column]
x = data_preprocessed.drop(columns=target_column)

### Scaling the data

In [None]:
# Standardise every feature column.
# NOTE(review): the scaler is fitted on all rows of x, so x_scaled is scaled
# with statistics that include rows later used for testing.
scaler = StandardScaler().fit(x)
x_scaled = scaler.transform(x)

### Splitting the data in train-test

In [None]:
# Split the *unscaled* features first, then fit the scaler on the training rows
# only. Fitting the scaler on the full dataset before splitting leaks the test
# set's mean/variance into the training features and makes the test score
# optimistic. With the same random_state and row count the rows assigned to
# train/test are unchanged.
x_train_unscaled, x_test_unscaled, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=420)

scaler = StandardScaler().fit(x_train_unscaled)
# As before, x_train / x_test are standardised NumPy arrays.
x_train = scaler.transform(x_train_unscaled)
x_test = scaler.transform(x_test_unscaled)

### Regression

In [None]:
# Ordinary least squares on the training split; fit() returns the estimator
# itself, which is then shown as the cell output.
reg = LinearRegression().fit(x_train, y_train)
reg

In [None]:
# Coefficient of determination measured on the training split.
train_r_squared = reg.score(x_train, y_train)
print('R-squared =', train_r_squared)

In [None]:
# Fitted bias term of the regression.
intercept = reg.intercept_
print('Intercept =', intercept)

In [None]:
# Pair every feature name with its fitted coefficient in one table.
weights = pd.DataFrame({'Feature': x.columns, 'Weight': reg.coef_})
print('Weight of each variable is:')
weights

# Prediction and Inferences based on Train Data

In [None]:
# Model predictions for the training rows; the model was fitted on log_price,
# so these are on the log scale.
y_hat_train = reg.predict(x_train)

**Ideally we want the predicted values to be same as the actual observation. Hence, a scatter plot of actual values VS predicted values should be formed around a 45° line.**

In [None]:
# Actual vs. predicted price on the training split. Both series are passed
# through np.exp, so the axes show prices, not log prices; the labels now say so.
plt.scatter(np.exp(y_train), np.exp(y_hat_train), alpha=0.2)
plt.xlabel('Actual Price', size=20)
plt.ylabel('Predicted Price', size=20)
plt.ylim(0,30000)
plt.xlim(0,20000)
# Reference line of perfect predictions (predicted == actual).
plt.plot([0,30000], [0,30000], c='orange', label='45° line')
# Without a legend the label given to the reference line is never displayed.
plt.legend()
plt.show()

The variance of the error should be constant throughout. Here the error seems to be heteroscedastic, which violates the homoscedasticity assumption of the OLS algorithm we are using for Linear Regression. We'll need to work further on the model to resolve this.

**The residuals should be normally distributed with a mean of 0.**

In [None]:
# Distribution of the training residuals (on the log scale) with a KDE overlay.
train_residuals = y_train - y_hat_train
sns.displot(train_residuals, kde=True)

# Testing of the test data

In [None]:
# Give the test targets a fresh 0..n-1 index so they line up with the
# positional array returned by predict().
y_test = y_test.reset_index(drop=True)

# Back-transform both series from log price to price and collect them side by side.
test = pd.DataFrame({
    'Actual': np.exp(y_test),
    'Predicted': np.exp(reg.predict(x_test)),
})
test

In [None]:
# Actual vs. predicted price on the test split. The columns of `test` hold
# back-transformed prices, so the axes are labelled as prices rather than log prices.
plt.scatter(test['Actual'], test['Predicted'], alpha=0.2)
plt.xlabel('Actual Price', size=20)
plt.ylabel('Predicted Price', size=20)
# ylim was previously set twice and xlim never; give both axes the same range
# so the reference line really is the diagonal of the plot.
plt.xlim(0,25000)
plt.ylim(0,25000)
# Reference line of perfect predictions (predicted == actual).
plt.plot([0,25000], [0,25000], c='orange', label='45° line')
# Without a legend the label given to the reference line is never displayed.
plt.legend()
plt.show()

In [None]:
# Distribution of the test residuals (actual minus predicted price) with a KDE overlay.
test_residuals = test['Actual'].sub(test['Predicted'])
sns.displot(test_residuals, kde=True)

Our model is over-estimating values, as we can see negative residuals, while there seems to be no issue of under-estimation for the model.

In [None]:
# Display settings: show every row and print floats with two decimals.
# The full option name 'display.max_rows' is used instead of the abbreviation
# 'display.max_row'; pandas resolves abbreviations by pattern matching and
# documents that they may break when similarly named options are added.
# NOTE: max_rows=None makes the table below render every test row.
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', '{:.2f}'.format)

# Residual in price units and its absolute size as a percentage of the actual price.
test['Residual'] = test['Actual'] - test['Predicted']
test['Difference%'] = (test['Residual'] / test['Actual'] * 100).abs()
test.sort_values(by='Difference%')

In [None]:
# Mean absolute percentage difference across all test rows.
test['Difference%'].mean()

The Difference% is relatively low for the lower observed values of price but gradually increases as the observed price increases. We'll need to work on this to improve the model.