In [3]:
import numpy as np
import pandas as pd
import ggplot
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [5]:
# load and view the diamonds data
dfDiamonds = ggplot.diamonds
dfDiamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58,335,4.34,4.35,2.75


In [6]:
# plot a histogram with a line marking $12,000
plt.figure()
plt.hist(dfDiamonds['price'])
plt.axvline(x=12000)
plt.show()

In [7]:
dfDiamonds['Expensive'] = 0
dfDiamonds.ix[dfDiamonds['price'] >= 12000, 'Expensive'] = 1
dfDiamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,Expensive
0,0.23,Ideal,E,SI2,61.5,55,326,3.95,3.98,2.43,0
1,0.21,Premium,E,SI1,59.8,61,326,3.89,3.84,2.31,0
2,0.23,Good,E,VS1,56.9,65,327,4.05,4.07,2.31,0
3,0.29,Premium,I,VS2,62.4,58,334,4.2,4.23,2.63,0
4,0.31,Good,J,SI2,63.3,58,335,4.34,4.35,2.75,0


In [8]:
dfDiamonds.drop(['price'], axis=1, inplace = True)

In [9]:
dfDiamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,Expensive
0,0.23,Ideal,E,SI2,61.5,55,3.95,3.98,2.43,0
1,0.21,Premium,E,SI1,59.8,61,3.89,3.84,2.31,0
2,0.23,Good,E,VS1,56.9,65,4.05,4.07,2.31,0
3,0.29,Premium,I,VS2,62.4,58,4.2,4.23,2.63,0
4,0.31,Good,J,SI2,63.3,58,4.34,4.35,2.75,0


In [10]:
dfCutDummies = pd.get_dummies(dfDiamonds.cut)
dfColorDummies = pd.get_dummies(dfDiamonds.color)
dfClarityDummies = pd.get_dummies(dfDiamonds.clarity)

In [12]:
cols = dfDiamonds.columns.tolist()
cols = cols[-1:] + cols[:-1]
dfDiamonds = dfDiamonds[cols]
dfDiamonds.drop(['cut', 'color', 'clarity'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [13]:
dfDiamonds = pd.merge(dfDiamonds, dfCutDummies, left_index=True, right_index=True)
dfDiamonds = pd.merge(dfDiamonds, dfColorDummies, left_index=True, right_index=True)
dfDiamonds = pd.merge(dfDiamonds, dfClarityDummies, left_index=True, right_index=True)

In [14]:
dfDiamonds['is_train'] = np.random.uniform(0, 1, len(dfDiamonds)) <= .75
train, test = dfDiamonds[dfDiamonds['is_train']==True], dfDiamonds[dfDiamonds['is_train']==False]

In [15]:
features = dfDiamonds.columns[1:]
clf = RandomForestClassifier(n_jobs=2)
y, _ = pd.factorize(train['Expensive'])
clf.fit(train[features], y)

RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
            min_samples_split=2, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=None, verbose=0)

In [20]:
expected = y
predicted = clf.predict(train[features])

In [21]:
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00     37882
          1       1.00      0.99      0.99      2594

avg / total       1.00      1.00      1.00     40476

[[37878     4]
 [   28  2566]]
