In [1]:
import warnings

# Silence all warnings for the rest of the notebook.
# A preceding filterwarnings('always') call was dropped: the 'ignore' filter is
# inserted in front of it and matches every warning first, so it had no effect.
warnings.filterwarnings('ignore')

In [12]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
import matplotlib
from scipy.stats import skew
from scipy.stats import boxcox
import seaborn as sns
sns.set_style('darkgrid')
matplotlib.rc('font', size=10)
matplotlib.rc('axes', titlesize=10)
matplotlib.rc('axes', labelsize=10)
matplotlib.rc('xtick', labelsize=10)
matplotlib.rc('ytick', labelsize=10)
matplotlib.rc('legend', fontsize=10)
matplotlib.rc('figure', titlesize=10)


from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import StratifiedKFold
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Read the CSV into a DataFrame and display its (rows, columns) shape.
df = pd.read_csv("data/glass.csv")
df.shape

(214, 10)

In [None]:
# Detect observations with more than two outlying features
from collections import Counter
def outlier_hunt(df, max_outliers=2, iqr_factor=1.5):
    """
    Return the index labels of rows that are outliers in too many features.

    A value is an outlier for its column when it lies outside
    [Q1 - iqr_factor * IQR, Q3 + iqr_factor * IQR] (Tukey's fences), where
    Q1/Q3 are the column's 25th/75th percentiles and IQR = Q3 - Q1.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature columns only; every column is inspected.
    max_outliers : int, default 2
        A row is reported when it is an outlier in MORE than this many columns.
    iqr_factor : float, default 1.5
        Multiplier applied to the IQR to build the fences.

    Returns
    -------
    list
        Index labels of the offending rows, in order of first detection.
    """
    if df.shape[0] == 0:
        return []

    outlier_counts = Counter()

    for col in df.columns:
        values = df[col]

        # nanpercentile ignores missing values; plain np.percentile returns NaN
        # as soon as one NaN is present, which silently disables the column.
        q1, q3 = np.nanpercentile(values, [25, 75])
        step = iqr_factor * (q3 - q1)

        # NaN compares False on both sides, so missing values are never flagged.
        is_outlier = (values < q1 - step) | (values > q3 + step)

        # Tally one hit per row for this column.
        outlier_counts.update(df.index[is_outlier.to_numpy()])

    return [label for label, hits in outlier_counts.items() if hits > max_outliers]

In [None]:
# Every column except the last one is treated as a feature.
features = list(df.columns[:-1])
n_flagged = len(outlier_hunt(df[features]))
print('The dataset contains %d observations with more than 2 outliers' % n_flagged)

In [None]:
# Drop every row flagged by outlier_hunt and renumber the index from 0.
# NOTE(review): this cell has no execution count and the shapes printed further
# down still report 214 rows, so the saved results appear to include the
# outliers. The cell also overwrites `df`, so re-running it repeats the
# detection on the already-filtered frame.
outlier_indices = outlier_hunt(df[features])
df = df.drop(outlier_indices).reset_index(drop=True)
print(df.shape)

In [4]:
# Feature matrix: every column except "Type"; target vector: the "Type" column.
X = df.drop("Type", axis = 1)
y = df["Type"]

In [5]:
# Display the (rows, columns) shape of the feature matrix.
X.shape

(214, 9)

In [6]:
# Display the shape of the target vector; its length should match X's row count.
y.shape

(214,)

In [7]:
# Preview the first five feature rows.
X.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0


### Classify using Box-Cox-transformed (unskewed) data

In [None]:
# Box-Cox-transform every feature, store the result in X_u, and plot the
# distribution and skewness before (left) and after (right) the transform.
# NOTE(review): the Box-Cox lambda and the scaling are estimated on all rows of X,
# i.e. before any train/test split is made.
classes = X.columns.values  # feature (column) names, despite the variable name
X_u = pd.DataFrame()
for c in classes:
    # Standardized original feature: the baseline for the comparison.
    scaled = preprocessing.scale(X[c])

    # Box-Cox requires strictly positive input, so shift by max(|x|) + 1 first.
    # boxcox returns (transformed_data, lambda); only the data is kept.
    shift = np.max(np.abs(X[c]) + 1)
    boxcox_scaled = preprocessing.scale(boxcox(X[c] + shift)[0])
    X_u[c] = boxcox_scaled

    skness = skew(scaled)
    boxcox_skness = skew(boxcox_scaled)

    fig, (ax_before, ax_after) = plt.subplots(1, 2)

    # Left panel: data that was only standardized (previously mislabelled "Transformed").
    ax_before.hist(scaled, facecolor='blue', alpha=0.5)
    ax_before.set_xlabel(c + " - Scaled")
    ax_before.set_title("Skewness: {0:.2f}".format(skness))

    # Right panel: Box-Cox-transformed data (previously unlabelled).
    ax_after.hist(boxcox_scaled, facecolor='red', alpha=0.5)
    ax_after.set_xlabel(c + " - Box-Cox transformed")
    ax_after.set_title("Skewness: {0:.2f}".format(boxcox_skness))
    plt.show()

In [None]:
# Hold out 25% of the transformed data, then grid-search the forest's
# hyper-parameters with 5-fold cross-validation on the training part.
X_u_train, X_u_test, y_train, y_test = train_test_split(X_u, y, test_size=0.25, random_state=42)

# 'sqrt' is what 'auto' meant for classifiers; 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, while 'sqrt' is accepted by every version.
rf = RandomForestClassifier(max_features='sqrt', n_jobs=-1, random_state=1)
params = {
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [1, 5, 10],
    "min_samples_split": [2, 4, 10, 12, 16],
    "n_estimators": [100, 125, 200],
}
GS = GridSearchCV(estimator=rf, param_grid=params, cv=5, n_jobs=-1)
GS = GS.fit(X_u_train, y_train)
print(GS.best_score_)
print(GS.best_params_)

In [None]:
# Entropy-criterion forest trained and scored on the transformed split.
# NOTE(review): min_samples_leaf=4 is not among the values tried by the grid
# search ([1, 5, 10]) — confirm this setting is intentional.
rf = RandomForestClassifier(
    criterion='entropy',
    n_estimators=100,
    min_samples_leaf=4,
    min_samples_split=2,
    random_state=1,
    n_jobs=-1,
).fit(X_u_train, y_train)  # fit() returns the estimator itself
pred = rf.predict(X_u_test)
rf.score(X_u_test, y_test)

In [None]:
# Gini-criterion forest trained and scored on the transformed split.
rf = RandomForestClassifier(
    criterion='gini',
    n_estimators=100,
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=1,
    n_jobs=-1,
).fit(X_u_train, y_train)  # fit() returns the estimator itself
pred = rf.predict(X_u_test)
rf.score(X_u_test, y_test)

### Classify using the original (skewed) data

In [8]:
# Hold out 25% of the untransformed data, then grid-search the forest's
# hyper-parameters with 5-fold cross-validation on the training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# 'sqrt' is what 'auto' meant for classifiers; 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, while 'sqrt' is accepted by every version.
rf = RandomForestClassifier(max_features='sqrt', n_jobs=-1, random_state=1)
params = {
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [1, 5, 10],
    "min_samples_split": [2, 4, 10, 12, 16],
    "n_estimators": [100, 125, 200],
}
GS = GridSearchCV(estimator=rf, param_grid=params, cv=5, n_jobs=-1)
GS = GS.fit(X_train, y_train)
print(GS.best_score_)
print(GS.best_params_)

0.76875
{'criterion': 'entropy', 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 125}


In [9]:
# Entropy-criterion forest trained and scored on the untransformed split.
# NOTE(review): the grid search reported n_estimators=125 as best, but 120 is
# used here — confirm this is intentional.
rf = RandomForestClassifier(
    criterion='entropy',
    n_estimators=120,
    min_samples_leaf=1,
    min_samples_split=4,
    random_state=1,
    n_jobs=-1,
).fit(X_train, y_train)  # fit() returns the estimator itself
pred = rf.predict(X_test)
rf.score(X_test, y_test)

0.7777777777777778

In [10]:
# Gini-criterion forest trained and scored on the untransformed split.
# This is the last assignment to `rf`, so later cells that use `rf` evaluate
# this model when the notebook is run top to bottom.
rf = RandomForestClassifier(
    criterion='gini',
    n_estimators=120,
    min_samples_leaf=1,
    min_samples_split=4,
    random_state=1,
    n_jobs=-1,
).fit(X_train, y_train)  # fit() returns the estimator itself
pred = rf.predict(X_test)
rf.score(X_test, y_test)

0.7962962962962963

In [14]:
# Predict the held-out set with the current `rf`, then print the confusion
# matrix followed by the overall accuracy.
y_pred = rf.predict(X_test)

test_confusion = confusion_matrix(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)
print(test_confusion)
print(test_accuracy)

[[14  0  0  0  0  0]
 [ 6 13  0  1  1  0]
 [ 2  0  2  0  0  0]
 [ 0  1  0  3  0  0]
 [ 0  0  0  0  3  0]
 [ 0  0  0  0  0  8]]
0.7962962962962963


In [15]:
# Per-class precision, recall and F1 for the predictions in y_pred.
print(classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          1       0.64      1.00      0.78        14
          2       0.93      0.62      0.74        21
          3       1.00      0.50      0.67         4
          5       0.75      0.75      0.75         4
          6       0.75      1.00      0.86         3
          7       1.00      1.00      1.00         8

avg / total       0.85      0.80      0.79        54



In [17]:
# Compare accuracy on the training and held-out data to gauge overfitting.
train_acc = rf.score(X_train, y_train)
test_acc = rf.score(X_test, y_test)
print('The accuracy of the RF classifier is {:.4f} on training data'.format(train_acc))
print('The accuracy of the RF classifier is {:.4f} on test data'.format(test_acc))

The accuracy of the RF classifier is 1.0000 on training data
The accuracy of the RF classifier is 0.7963 on test data


____

## Skewed Data - StandardScaler

In [None]:
# Standardize every feature with StandardScaler.
# NOTE(review): the scaler is fitted on all rows of X, i.e. before the
# train/test split made in a later cell.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_sc = sc.fit_transform(X)

In [None]:
# Preview only the first five scaled rows instead of echoing the whole array.
X_sc[:5]

In [None]:
# Split the scaled features 75/25 with a fixed seed, then display the shapes
# of the four resulting pieces.
X_sc_train, X_sc_test, y_train, y_test = train_test_split(X_sc, y, test_size = 0.25, random_state=42)
X_sc_train.shape, y_train.shape, X_sc_test.shape, y_test.shape

In [None]:
# Default-parameter forest on the standardized features.
# random_state is fixed (same seed as the other forests in this notebook) so
# the reported confusion matrix and accuracy are reproducible across runs.
clf = RandomForestClassifier(random_state=1)
clf.fit(X_sc_train, y_train)
predict = clf.predict(X_sc_test)

In [None]:
# Confusion matrix and overall accuracy for the predictions stored in `predict`.
print(confusion_matrix(y_test,predict))
print(accuracy_score(y_test,predict))