In [112]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

%matplotlib inline

In [113]:
diamondproject3 = pd.read_csv('datasets/diamonds.csv')
diamondproject3.head(200)

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
5,6,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48
6,7,0.24,Very Good,I,VVS1,62.3,57.0,336,3.95,3.98,2.47
7,8,0.26,Very Good,H,SI1,61.9,55.0,337,4.07,4.11,2.53
8,9,0.22,Fair,E,VS2,65.1,61.0,337,3.87,3.78,2.49
9,10,0.23,Very Good,H,VS1,59.4,61.0,338,4.00,4.05,2.39


In [114]:
diamondproject3.shape

(53940, 11)

In [115]:
diamondproject3.keys()

Index(['Unnamed: 0', 'carat', 'cut', 'color', 'clarity', 'depth', 'table',
       'price', 'x', 'y', 'z'],
      dtype='object')

In [116]:
diamondproject3.dtypes

Unnamed: 0      int64
carat         float64
cut            object
color          object
clarity        object
depth         float64
table         float64
price           int64
x             float64
y             float64
z             float64
dtype: object

In [117]:
pd.isnull(diamondproject3)

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,False


In [118]:
pd.notnull(diamondproject3)

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,True,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,True
5,True,True,True,True,True,True,True,True,True,True,True
6,True,True,True,True,True,True,True,True,True,True,True
7,True,True,True,True,True,True,True,True,True,True,True
8,True,True,True,True,True,True,True,True,True,True,True
9,True,True,True,True,True,True,True,True,True,True,True


In [119]:
diamondproject3

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
5,6,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48
6,7,0.24,Very Good,I,VVS1,62.3,57.0,336,3.95,3.98,2.47
7,8,0.26,Very Good,H,SI1,61.9,55.0,337,4.07,4.11,2.53
8,9,0.22,Fair,E,VS2,65.1,61.0,337,3.87,3.78,2.49
9,10,0.23,Very Good,H,VS1,59.4,61.0,338,4.00,4.05,2.39


In [120]:
frame = pd.DataFrame(diamondproject3)

In [121]:
frame

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
5,6,0.24,Very Good,J,VVS2,62.8,57.0,336,3.94,3.96,2.48
6,7,0.24,Very Good,I,VVS1,62.3,57.0,336,3.95,3.98,2.47
7,8,0.26,Very Good,H,SI1,61.9,55.0,337,4.07,4.11,2.53
8,9,0.22,Fair,E,VS2,65.1,61.0,337,3.87,3.78,2.49
9,10,0.23,Very Good,H,VS1,59.4,61.0,338,4.00,4.05,2.39


In [122]:
diamondsubset = diamondproject3.drop(['Unnamed: 0', 'x', 'y', 'z'], axis=1)
diamondsubset

Unnamed: 0,carat,cut,color,clarity,depth,table,price
0,0.23,Ideal,E,SI2,61.5,55.0,326
1,0.21,Premium,E,SI1,59.8,61.0,326
2,0.23,Good,E,VS1,56.9,65.0,327
3,0.29,Premium,I,VS2,62.4,58.0,334
4,0.31,Good,J,SI2,63.3,58.0,335
5,0.24,Very Good,J,VVS2,62.8,57.0,336
6,0.24,Very Good,I,VVS1,62.3,57.0,336
7,0.26,Very Good,H,SI1,61.9,55.0,337
8,0.22,Fair,E,VS2,65.1,61.0,337
9,0.23,Very Good,H,VS1,59.4,61.0,338


In [123]:
print (diamondsubset.columns)

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price'], dtype='object')


In [124]:
diamondsubset.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price
0,0.23,Ideal,E,SI2,61.5,55.0,326
1,0.21,Premium,E,SI1,59.8,61.0,326
2,0.23,Good,E,VS1,56.9,65.0,327
3,0.29,Premium,I,VS2,62.4,58.0,334
4,0.31,Good,J,SI2,63.3,58.0,335


In [125]:
diamondsubset.tail()

Unnamed: 0,carat,cut,color,clarity,depth,table,price
53935,0.72,Ideal,D,SI1,60.8,57.0,2757
53936,0.72,Good,D,SI1,63.1,55.0,2757
53937,0.7,Very Good,D,SI1,62.8,60.0,2757
53938,0.86,Premium,H,SI2,61.0,58.0,2757
53939,0.75,Ideal,D,SI2,62.2,55.0,2757


In [126]:
diamondsubset['cut'].unique()

array(['Ideal', 'Premium', 'Good', 'Very Good', 'Fair'], dtype=object)

In [127]:
diamondsubset['color'].unique()

array(['E', 'I', 'J', 'H', 'F', 'G', 'D'], dtype=object)

In [128]:
diamondsubset['clarity'].unique()

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'], dtype=object)

In [129]:
diamondsubset['cut'] = diamondsubset['cut'].map({'Ideal':0, 'Premium':1, 'Good':2, 'Very Good':3, 'Fair':4})
diamondsubset.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price
0,0.23,0,E,SI2,61.5,55.0,326
1,0.21,1,E,SI1,59.8,61.0,326
2,0.23,2,E,VS1,56.9,65.0,327
3,0.29,1,I,VS2,62.4,58.0,334
4,0.31,2,J,SI2,63.3,58.0,335


In [130]:
diamondsubset['color'] = diamondsubset['color'].map({'E':0, 'I':1, 'J':2, 'H':3, 'F':4, 'G':5,'D':6})
diamondsubset.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price
0,0.23,0,0,SI2,61.5,55.0,326
1,0.21,1,0,SI1,59.8,61.0,326
2,0.23,2,0,VS1,56.9,65.0,327
3,0.29,1,1,VS2,62.4,58.0,334
4,0.31,2,2,SI2,63.3,58.0,335


In [131]:
diamondsubset['clarity'] = diamondsubset['clarity'].map({'SI2':0, 'SI1':1, 'VS1':2, 'VS2':2, 'VVS2':3, 'VVS1':4, 
                                                         'I1':5, 'IF':6})
diamondsubset.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price
0,0.23,0,0,0,61.5,55.0,326
1,0.21,1,0,1,59.8,61.0,326
2,0.23,2,0,2,56.9,65.0,327
3,0.29,1,1,2,62.4,58.0,334
4,0.31,2,2,0,63.3,58.0,335


In [132]:
from sklearn.linear_model import LinearRegression
X = diamondsubset.drop('cut', axis = 1)

#assign linear regression function to a variable
lm = LinearRegression()
lm

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [133]:
lm.fit(X, diamondsubset.cut)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [134]:
print('Estimated intercept coefficient:', lm.intercept_)

Estimated intercept coefficient: -30.0483406321


In [135]:
df.groupby(['carat', 'cut', 'clarity', 'depth', 'table'])['price'].mean()

carat  cut        clarity  depth  table
0.20   Ideal      VS2      59.7   55.0       367.0
                           61.5   57.0       367.0
                           62.2   57.0       367.0
       Premium    SI2      60.2   62.0       345.0
                  VS2      59.0   60.0       367.0
                           59.7   62.0       367.0
                           59.8   62.0       367.0
                           61.1   59.0       367.0
                           61.7   60.0       367.0
                           62.3   60.0       367.0
                           62.6   59.0       367.0
       Very Good  VS2      63.4   59.0       367.0
0.21   Premium    SI1      59.8   61.0       326.0
                  SI2      61.9   56.0       394.0
                  VS2      58.3   59.0       386.0
                           59.1   62.0       386.0
                           59.6   56.0       386.0
                           60.5   59.0       386.0
                           60.6   60.0    

In [136]:
y = diamondsubset['clarity']

In [137]:
z = diamondsubset.drop(['clarity'], axis=1)

In [138]:
X_train, X_test, y_train, y_test = train_test_split(z, y, test_size=0.2, random_state=15)

In [139]:
LogReg = LogisticRegression()
LogReg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [140]:
LogReg.score(X_train, y_train)

0.44489247311827956

In [141]:
y_pred = LogReg.predict(X_test)

In [146]:
#look at true and false predictions​pd.DataFrame(
pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    columns=['carat', 'cut', 'color', 'clarity','depth','table','price'],
    index=['carat', 'cut', 'color', 'clarity','depth','table','price'],
)

Unnamed: 0,carat,cut,color,clarity,depth,table,price
carat,928,19,903,0,0,0,0
cut,398,22,2169,0,0,1,0
color,221,19,3820,1,0,0,0
clarity,11,2,1054,1,0,0,0
depth,1,0,716,3,0,0,0
table,89,1,24,0,0,36,0
price,0,0,342,7,0,0,0


In [147]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.56      0.50      0.53      1850
          1       0.35      0.01      0.02      2590
          2       0.42      0.94      0.58      4061
          3       0.08      0.00      0.00      1068
          4       0.00      0.00      0.00       720
          5       0.97      0.24      0.39       150
          6       0.00      0.00      0.00       349

avg / total       0.36      0.45      0.32     10788



  'precision', 'predicted', average, warn_for)


In [148]:
diamondsubset['carat'].value_counts()

0.30    2604
0.31    2249
1.01    2242
0.70    1981
0.32    1840
1.00    1558
0.90    1485
0.41    1382
0.40    1299
0.71    1294
0.50    1258
0.33    1189
0.51    1127
0.34     910
1.02     883
0.52     817
1.51     807
1.50     793
0.72     764
0.53     709
0.42     706
0.38     670
0.35     667
1.20     645
0.54     625
0.36     572
0.91     570
1.03     523
0.55     496
0.73     492
        ... 
2.58       3
2.55       3
2.63       3
1.92       2
3.04       2
2.75       2
2.80       2
4.01       2
2.68       2
3.24       1
2.67       1
3.65       1
4.00       1
3.11       1
2.64       1
4.50       1
3.50       1
2.59       1
3.02       1
3.51       1
2.65       1
3.05       1
2.71       1
3.22       1
4.13       1
2.70       1
3.67       1
5.01       1
2.77       1
3.40       1
Name: carat, Length: 273, dtype: int64

In [151]:
from sklearn import tree
from sklearn.metrics import accuracy_score

In [152]:
model = tree.DecisionTreeClassifier()

In [153]:
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [154]:
y_predict = model.predict(X_test)

In [155]:
accuracy_score(y_test, y_predict)

0.70337411939191696

In [158]:
#look at true and false predictions​pd.DataFrame(
pd.DataFrame(
    confusion_matrix(y_test, y_predict),
    columns=['carat', 'cut', 'color', 'clarity','depth','table','price'],
    index=['carat', 'cut', 'color', 'clarity','depth','table','price'],
)

Unnamed: 0,carat,cut,color,clarity,depth,table,price
carat,1314,409,80,5,0,42,0
cut,363,1741,442,28,14,0,2
color,103,455,3152,234,91,1,25
clarity,2,34,255,617,119,0,41
depth,1,11,80,131,443,0,54
table,34,8,0,0,0,108,0
price,0,3,26,42,65,0,213
