In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as st
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# Seed NumPy's global random number generator with a fixed value so that
# NumPy-based randomness starts from the same state on every fresh run.
np.random.seed(42)

  import pandas.util.testing as tm


In [2]:
# Read the training set from a CSV file addressed relative to the working
# directory, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer='diamonds_train.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,0.32,Ideal,D,VS2,61.9,55.0,4.43,4.39,2.73,972
1,1,0.4,Premium,F,VS2,59.7,58.0,4.79,4.83,2.87,842
2,2,0.54,Very Good,D,VS2,61.1,59.0,5.25,5.29,3.22,1583
3,3,0.51,Ideal,G,VS1,61.7,55.0,5.17,5.14,3.18,1781
4,4,1.01,Very Good,F,SI2,62.8,60.0,6.33,6.4,4.0,4416


In [3]:
# Remove the 'Unnamed: 0' column (it appears to be a row index that was saved
# into the CSV). Reassigning instead of using inplace=True, together with
# errors='ignore', keeps this cell safe to re-run: a second execution no longer
# raises KeyError because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')


In [4]:
# Display every row whose `x` column equals 0.
df.loc[df['x'] == 0]

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
6115,0.71,Good,F,SI2,64.1,60.0,0.0,0.0,0.0,2130
8105,2.25,Premium,H,SI2,62.8,59.0,0.0,0.0,0.0,18034
16830,1.0,Very Good,H,VS2,63.3,53.0,0.0,0.0,0.0,5139
20877,1.14,Fair,G,VS1,57.5,67.0,0.0,0.0,0.0,6381
30276,1.07,Ideal,F,SI2,61.6,56.0,0.0,6.62,0.0,4954
33129,1.56,Ideal,G,VS2,62.2,54.0,0.0,0.0,0.0,12800


In [5]:
# Mark zeros in the x / y / z columns as missing values.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0 and raises
# AttributeError there.
dimension_cols = ['x', 'y', 'z']
df[dimension_cols] = df[dimension_cols].replace(0, np.nan)


In [6]:
def outliers(var, data=None):
    """Return the values of column ``var`` with IQR outliers replaced by NaN.

    A value counts as an outlier when it lies more than 1.5 times the
    interquartile range above the third quartile or below the first quartile.
    Values that are already NaN stay NaN.

    Parameters
    ----------
    var : column label
        Name of the numeric column to process.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level ``df``
        is used, which matches the original one-argument call.

    Returns
    -------
    list
        One entry per row, in the original row order.
    """
    frame = df if data is None else data
    values = frame[var]

    q1 = values.quantile(.25)
    q3 = values.quantile(.75)
    iqr = q3 - q1
    ulim = float(q3 + (1.5 * iqr))
    llim = float(q1 - (1.5 * iqr))

    # Vectorised replacement for the per-element loop. Comparisons against NaN
    # evaluate to False, so missing values pass through unchanged.
    is_outlier = (values > ulim) | (values < llim)
    return values.mask(is_outlier, np.nan).tolist()

# Replace IQR outliers with NaN in every non-object column.
# NOTE(review): the selection is by dtype only, so the target column `price`
# is masked as well -- confirm that altering the target is intended.
numeric_columns = df.select_dtypes(exclude='object').columns
for name in numeric_columns:
    df[name] = outliers(name)

In [7]:
# Fill the remaining gaps in each non-object column with that column's mean.
# NOTE(review): the means are taken over the full frame, before the
# train/test split further down -- confirm this is acceptable.
numeric_cols = df.select_dtypes(exclude='object').columns
for name in numeric_cols:
    column_mean = df[name].mean()
    df[name] = df[name].fillna(column_mean)

In [8]:
# Keep only the object-typed columns, then show how often each `cut` value
# occurs.
df_cat = df.select_dtypes(include=['object'])
df_cat.loc[:, 'cut'].value_counts()

Ideal        16089
Premium      10333
Very Good     9025
Good          3693
Fair          1205
Name: cut, dtype: int64

In [9]:
# Turn every categorical column into integer codes.
# The same encoder instance is re-fitted on each column in turn, so once this
# cell finishes `le` reflects only the last column it was fitted on.
le = LabelEncoder()
df_cat = df_cat.apply(lambda column: le.fit_transform(column))
df_cat

Unnamed: 0,cut,color,clarity
0,2,0,5
1,3,2,5
2,4,0,5
3,2,3,4
4,4,2,3
...,...,...,...
40340,0,1,2
40341,2,4,4
40342,2,0,2
40343,4,1,4


In [10]:
# Remove the original categorical columns from `df`; the columns to drop are
# named explicitly via df_cat's column labels.
df = df.drop(columns=list(df_cat.columns))


In [11]:
# Append the columns of df_cat to df, side by side.
df = pd.concat(objs=[df, df_cat], axis='columns')

In [12]:
# Target: the `price` column. Features: every other column.
y = df['price']
X = df.drop(columns=['price'])

In [13]:
# Variance inflation factor of each feature column, in column order.
# NOTE(review): X is passed without an intercept column; VIFs computed this
# way are often inflated -- confirm whether a constant should be added first.
feature_matrix = X.values
vif = [
    variance_inflation_factor(feature_matrix, position)
    for position in range(X.shape[1])
]

In [14]:
# Tabulate each VIF next to the name of the column it belongs to.
pd.DataFrame(data={'vif': vif, 'cols': X.columns})

Unnamed: 0,vif,cols
0,22.436189,carat
1,719.539252,depth
2,719.39541,table
3,7410.587253,x
4,7540.343231,y
5,1736.117242,z
6,7.825685,cut
7,3.605161,color
8,6.291823,clarity


In [15]:
# Rebuild the feature matrix and target from `df`: all columns are kept as
# features except `price`, which is the target.
y = df['price']
X = df.drop(columns=['price'])

In [16]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)


In [23]:
# Fit a random forest on the training split and predict the held-out rows.
# random_state is set explicitly: without it the forest draws from NumPy's
# global RNG, so the fitted model (and the scores below) would depend on
# which cells happened to run before this one.
rf = RandomForestRegressor(
    n_estimators=250,
    min_samples_split=5,
    min_samples_leaf=1,
    max_depth=50,
    random_state=42,
)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)


In [24]:
import math

In [25]:
# Root mean squared error of the predictions on the test split.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
math.sqrt(mse)

821.8544636996465

In [22]:
# Coefficient of determination (R^2) of the predictions on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.9035533272805882