In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures
from csvLoader import get_data
from Heuristics import get_labels
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
from matplotlib.ticker import FormatStrFormatter
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RandomizedSearchCV

# Few common functions

In [36]:
def get_grid():
    param_grid = {
    'max_depth': [3, 5,10,20],
    'min_samples_leaf': [5, 10,30,50,100, 150],
    'min_samples_split': [10,20,30,50,100],
    'n_estimators': [10,50,100,150,200,300],
    #[int(x) for x in np.linspace(start=2, stop=20, num=1)],
    'max_features': ['auto', 'sqrt'],
    'bootstrap': [True, False]
}
    rf = RandomForestClassifier(n_jobs=-1)

    grid_rf = RandomizedSearchCV(rf,
                         param_grid,
                          verbose=1,
                           n_jobs=-1,
                            cv=3,
                            n_iter=300,
                            random_state=11)

    return grid_rf

def evaluate(rfS, X_Train, X_Test,y_Train, y_Test):
    testPreds = rfS.predict(X_Test)
    trainPreds = rfS.predict(X_Train)
    
    classReportTest = classification_report(y_Test,testPreds, output_dict=True, target_names= ["face", "non-face"])
    classReportTrain = classification_report(y_Train,trainPreds, output_dict=True, target_names= ["face", "non-face"])
    
    return classReportTest,classReportTrain
    

def _filter_cols( type=[["ag"],["nr"]]):
    filterCols = list()

    cols = get_labels()    
    abbrev = {"a":"accel","g":"gyro", "r":"r", "n":"n"}
    typeCriteria = [[abbrev[tC] for tC in list(t[0])] for t in type]

    # check and add resultant first
    for sensor in typeCriteria[0]:
        for axis in typeCriteria[1]:
            if axis == "r":
                filterCols.extend([c for c in cols if sensor in c.split("_") and axis in c.split("_")])
            else:
                filterCols.extend([c for c in cols if sensor in c.split("_") and "r" not in c.split("_")])
    return filterCols

def get_train_test_data(csvCode = "50_8", type=[["ag"],["rn"]]):

    csvPATH =   "../data/dataset/statFeatures/stat{}.csv".format(csvCode)
    data = get_data(PATH=csvPATH)
    features, pids, descriptions, labels = data.features, data.pids, data.descriptions, data.labels
    
    filterCols = _filter_cols(type=type)
    features = pd.DataFrame(features, columns = filterCols)
    #features = features[filterCols]

    X_Train, X_Test, y_Train, y_Test = train_test_split(features.values,
                                    np.array(list(zip(pids,descriptions,labels))),
                                    test_size=0.2,
                                    shuffle=True,
                                    random_state=11)

    return filterCols, X_Train, X_Test, y_Train, y_Test


def get_rcParams(plt):
    plt.rcParams['font.size'] = 12

    plt.rcParams['hatch.linewidth'] = 0.25

    plt.rcParams['font.family'] = 'serif'
    plt.rcParams['font.serif'] = 'Times New Roman'
    plt.rcParams['figure.dpi'] = 600
    plt.rcParams["lines.linewidth"] = 1
    plt.rcParams['hatch.linewidth'] = 0.15

    return plt

In [31]:
a,b,c,d,e = get_train_test_data(csvCode="20_8", type=[["ag"],["rn"]])

############################################################
../data/dataset/statFeatures/stat20_8.csv
Data loaded from ../data/dataset/statFeatures/stat20_8.csv


In [32]:
colNames, X_Train, X_Test, y_Train,y_Test = get_train_test_data(csvCode="80_8", type = [["ag"],["nr"]])


############################################################
../data/dataset/statFeatures/stat80_8.csv
Data loaded from ../data/dataset/statFeatures/stat80_8.csv


In [33]:
np.take(y_Train,2,axis=-1)

array(['0', '0', '1', ..., '1', '1', '0'], dtype='<U7')

# What is the effect of window size on non-polynomial features ?

In [34]:
testAccuracies = list()
trainAccuracies = list()
cvResults = list()
for csvCode in ["20_8","30_8","40_8","50_8","60_8","70_8","80_8"]:
    
    colNames, X_Train, X_Test, y_Train,y_Test = get_train_test_data(csvCode=csvCode, type = [["ag"],["nr"]])
    y_Trainl = np.take(y_Train,2,axis=-1)
    y_Testl = np.take(y_Test,2,axis=-1)

    grid_rf = get_grid()    
    
    #cvResult = cross_validate(rf, features, labels,cv=5)
    #cvResults.append(cvResult)
    grid_rf.fit(X_Train, y_Trainl)
    rfBest = grid_rf.best_estimator_

    classReportTest,classReportTrain =evaluate(rfBest, X_Train,X_Test, y_Trainl, y_Testl)

    testAccuracies.append(classReportTest["accuracy"])
    trainAccuracies.append(classReportTrain["accuracy"])

############################################################
../data/dataset/statFeatures/stat20_8.csv
Data loaded from ../data/dataset/statFeatures/stat20_8.csv
Fitting 3 folds for each of 300 candidates, totalling 900 fits
############################################################
../data/dataset/statFeatures/stat30_8.csv
Data loaded from ../data/dataset/statFeatures/stat30_8.csv
Fitting 3 folds for each of 300 candidates, totalling 900 fits
############################################################
../data/dataset/statFeatures/stat40_8.csv
Data loaded from ../data/dataset/statFeatures/stat40_8.csv
Fitting 3 folds for each of 300 candidates, totalling 900 fits
############################################################
../data/dataset/statFeatures/stat50_8.csv
Data loaded from ../data/dataset/statFeatures/stat50_8.csv
Fitting 3 folds for each of 300 candidates, totalling 900 fits
############################################################
../data/dataset/statFeatures/stat60_8.c

In [35]:
testAccuracies

[0.8596491228070176,
 0.8912280701754386,
 0.9122807017543859,
 0.8957845433255269,
 0.9063231850117096,
 0.9249706916764361,
 0.9025821596244131]

In [None]:
print("Test accuracy for non-poly features is {} ".format(accuracy_score(y_Test, testPreds)*100))
print("Train accuracy for non-poly features is {} ".format(accuracy_score(y_Train, trainPreds)*100))

In [None]:
rfBest = grid_rf.best_estimator_
rfBest.fit(X_Train[::,list(impDfSelect10.index)], y_Train)
testPreds = rfBest.predict(X_TestPoly[::,list(impDfSelect10.index)])
trainPreds = rfBest.predict(X_TrainPoly[::,list(impDfSelect10.index)])
print(classification_report(y_Test, testPreds))
print("Test accuracy for top 10 poly features is {} ".format(accuracy_score(y_Test, testPreds)*100))
print("Train accuracy for top 10 poly features is {} ".format(accuracy_score(y_Train, trainPreds)*100))

In [None]:
cols = get_labels()
impDataSelect = np.array(list(zip(rfBestNorm.feature_importances_, cols)))

impDfSelect = pd.DataFrame(impDataSelect, columns = ["val","feat"]).sort_values(by="val", ascending=False)

impDfSelect10 = impDfSelect.sort_values(by="val", ascending=False)[:10]

In [None]:
rfBestNorm.fit(X_Train[::,impDfSelect10.index],y_Train)
testPreds = rfBestNorm.predict(X_Test[::,list(impDfSelect10.index)])
trainPreds = rfBestNorm.predict(X_Train[::,list(impDfSelect10.index)])

print("Test accuracy for top 10 non-poly features is {} ".format(accuracy_score(y_Test, testPreds)*100))
print("Train accuracy for top 10 non-poly features is {} ".format(accuracy_score(y_Train, trainPreds)*100))

In [None]:
scores = list()
for selectItems in np.array(impDfSelectNorm.index).reshape(11,8):
    rfNorm.fit(X_Train[::,selectItems],y_Train)
    scores.append(accuracy_score(y_Test, rfNorm.predict(X_Test[::,selectItems])))

In [None]:
[np.mean(r["train_score"]) for r in cvResults]

In [None]:


#fig, axs = plt.subplots(1,2, figsize = (4,2), dpi = 600)

x = ["0.2", "0.3", "0.4", "0.5", "0.6","0.7"]
plt.plot(x,np.array(testAccuracies)*100, color = "tab:blue")
plt.scatter(x,np.array(testAccuracies)*100)

#plt.plot(x,testAccuracies, label="test")
plt.xlabel("Time in seconds")
plt.ylabel("Accuracy (%)")
plt.yticks([80,82,84,86,88,90],labels=["80%","82%","84%","86%","88%","90%"])
#plt.plot(x,trainAccuracies, label = "train")
#plt.legend()
plt.savefig("windowSelect.png")

In [None]:
poly  = PolynomialFeatures(2)
poly = poly.fit(X_Train)

X_TrainPoly = poly.transform(X_Train)
X_TestPoly = poly.transform(X_Test)

rf = RandomForestClassifier(n_jobs=-1, verbose=0)
param_grid = {
    'max_depth': [3, 5,10,20],
    'min_samples_leaf': [5, 10,30,50,100, 150],
    'min_samples_split': [10,20,30,50,100],
    'n_estimators': [10,50,100,150,200,300],
    #[int(x) for x in np.linspace(start=2, stop=20, num=1)],
    'max_features': ['auto', 'sqrt'],
    'bootstrap': [True, False]
}

# param_grid['pca__n_components'] = hyper['pca__n_components']
#pipe = Pipeline([('poly',poly), ('rf', rf)])

grid_rf = RandomizedSearchCV(rf,
                         param_grid,
                          verbose=1,
                           n_jobs=-1,
                            cv=3,
                            n_iter=30)
grid_rf.fit(X_TrainPoly, y_Train)
print(grid_rf.best_params_)

In [None]:
rfBest = grid_rf.best_estimator_
rfBest.fit(X_TrainPoly,y_Train)
testPreds = rfBest.predict(X_TestPoly)
print(classification_report(y_Test, testPreds))
print("Test accuracy for poly features is {} ".format(accuracy_score(y_Test, testPreds)*100))
print("Train accuracy for poly features is {} ".format(accuracy_score(y_Train, rfBest.predict(X_TrainPoly))*100))

In [None]:
from sklearn.metrics import plot_roc_curve
plot_roc_curve(rfBest, X_TestPoly, y_Test)
#plot_roc_curve(rfBestNorm,X_Test,y_Test)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.savefig("rocnonpoly.png")

In [None]:
cols = get_labels()
impDataSelect = np.array(list(zip(rfBest.feature_importances_, 
                        poly.get_feature_names(cols))))

impDfSelect = pd.DataFrame(impDataSelect, columns = ["val","feat"]).sort_values(by="val", ascending=False)

impDfSelect10 = impDfSelect.sort_values(by="val", ascending=False)[:10]

# What is the accuracy when rf is fit with top 10 polynomial features

In [None]:
rfBest = grid_rf.best_estimator_
rfBest.fit(X_TrainPoly[::,list(impDfSelect10.index)], y_Train)
testPreds = rfBest.predict(X_TestPoly[::,list(impDfSelect10.index)])
trainPreds = rfBest.predict(X_TrainPoly[::,list(impDfSelect10.index)])
print(classification_report(y_Test, testPreds))
print("Test accuracy for top 10 poly features is {} ".format(accuracy_score(y_Test, testPreds)*100))
print("Train accuracy for top 10 poly features is {} ".format(accuracy_score(y_Train, trainPreds)*100))

In [None]:
import tqdm
scores = list()
featuresInFocus = list()

totalSpace = list(np.array(impDfSelect.index[:4000]).reshape(400,10))
totalSpace.append(impDfSelect.index[4000:4005])

for selectItems in tqdm.tqdm(totalSpace):
    featuresInFocus.extend(selectItems)
    rf.fit(X_TrainPoly[::,featuresInFocus],y_Train)
    scores.append(accuracy_score(y_Test, rf.predict(X_TestPoly[::,featuresInFocus])))

In [None]:
plt.rcParams['font.size'] = 12

plt.rcParams['hatch.linewidth'] = 0.25

plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = 'Times New Roman'
plt.rcParams['figure.dpi'] = 600
plt.rcParams["lines.linewidth"] = 1
plt.rcParams['hatch.linewidth'] = 0.15

#fig, axs = plt.subplots(1,2, figsize = (4,2), dpi = 600)

#x = ["0.2", "0.3", "0.4", "0.5", "0.6","0.7"]
cFeats = list(np.linspace(10,4000,400))
cFeats.append(4005)

ax = plt.subplot()
ax.plot(cFeats[0:], (np.array(scores)*100)[0:], color = "tab:blue")
#plt.scatter(cFeats,np.array(scores)*100)

#plt.plot(x,testAccuracies, label="test")
ax.set_xlabel("Features used")
ax.set_ylabel("Accuracy (%)")
#yl = [86,87,88,89,90,91,92]
#plt.yticks(yl,labels=[str(l) + "%" for l in yl])
#plt.plot(x,trainAccuracies, label = "train")
#plt.legend()
#ax.set_ylabel(ax.yaxis.majorTicks)
def format_func(value,ticknumber):
    return str(value) + "%"

ax.yaxis.set_major_formatter(plt.FuncFormatter(format_func))

plt.savefig("selectPoly10.png")

In [None]:
ax.yaxis.majorTicks[0].__dict__.keys()


In [None]:
rfSmall = RandomForestClassifier(max_depth = 10, n_estimators = 200)
rfSmall.fit(X_TrainPoly[::,impDfSelect10.index],y_Train)

In [None]:
rfSmall.score(X_TestPoly[::,impDfSelect10.index],y_Test)

In [None]:
# rf = RandomForestClassifier(n_estimators= 100, 
#                             min_samples_split= 10,
#                              min_samples_leaf= 5, 
#                              max_features= 'auto',
#                               max_depth= 10, 
#                               bootstrap= False)
# rf.fit(X_TrainPoly, y_Train)

In [None]:
plt.rcParams['font.size'] = 12

plt.rcParams['hatch.linewidth'] = 0.25

plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = 'Times New Roman'
plt.rcParams['figure.dpi'] = 600
plt.rcParams["lines.linewidth"] = 1
plt.rcParams['hatch.linewidth'] = 0.15
# plt.rcParams.update({'font.size': 20})
plt.figure(figsize= (7.0,3))

ax = plt.subplot()
ax.barh(impDfSelect["feat"].values,impDfSelect["val"].values)
ax.xaxis.set_major_formatter(FormatStrFormatter('%.2f'))
plt.tight_layout()
plt.savefig("rfImportancePoly.png")

In [None]:
scores = list()
featuresInFocus = list()
for selectItems in np.array(impDfSelect.index).reshape(4005,100):
    featuresInFocus.extend(selectItems)
    rf.fit(X_TrainPoly[::,featuresInFocus],y_Train)
    scores.append(accuracy_score(y_Test, rf.predict(X_TestPoly[::,featuresInFocus])))

In [None]:
rfNorm = RandomForestClassifier(n_estimators= 100, 
                            min_samples_split= 10,
                             min_samples_leaf= 5, 
                             max_features= 'auto',
                              max_depth= 10, 
                              bootstrap= False)

In [None]:
rfNorm.fit(X_Train, y_Train)

In [None]:
print(classification_report(rf.predict(X_Test), y_Test))

In [None]:
impData = np.array(list(zip(rfNorm.feature_importances_, cols)))
impDfNorm = pd.DataFrame(impData, columns = ["val","feat"])
#impDfSelectNorm= impDfNorm.sort_values(by="val", ascending=False)[:10]
impDfSelectNorm= impDfNorm.sort_values(by="val", ascending=False)

In [None]:
scores = list()
for selectItems in np.array(impDfSelectNorm.index).reshape(11,8):
    rfNorm.fit(X_Train[::,selectItems],y_Train)
    scores.append(accuracy_score(y_Test, rfNorm.predict(X_Test[::,selectItems])))

In [None]:
plt.plot(scores)

In [None]:
ax = plt.subplot()
ax.barh(impDfSelect["feat"].values,impDfSelect["val"].values)
#.xticks(FormatStrFormatter('%.2f'))
ax.xaxis.set_major_formatter(FormatStrFormatter('%.2f'))

In [None]:
pipe['poly'].get_feature_names(cols)

In [None]:
from Heuristics import get_labels

In [None]:
cols = get_labels()