## Code to add new features
Choose the classifier which gave very good accuracies in the previous cross-validation stage.
Train that classifier on Set P and test it on Set Q
Improve recall using false positives

In [283]:
import arff, numpy as np
# Load Train dataset
dataset = arff.load(open('../datasets/features_I/features_I.arff'))
train = np.array(dataset['data'])
#Load Test dataset
dataset = arff.load(open('../datasets/features_J/features_J.arff'))
test = np.array(dataset['data'])

In [284]:
# Number of features
length = len(train[0]);
#Feature vector without last 3 i.e. dish name, file path and class label for train data
train_input = train[:,0:(length-3)]
# Class labels for train dataset
train_labels = train[:,(length-1)] # this is the class column

#Feature vector without last 3 i.e. dish name, file path and class label for test data
test_input = test[:,0:(length-3)]
# Class labels for test dataset
test_labels = test[:,(length-1)] # this is the class column

In [285]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
lb = preprocessing.LabelBinarizer()
# Create merged labels of train and test to fit binarizer
merged_labels =[]
merged_labels.extend(train_labels)
merged_labels.extend(test_labels)
lb.fit(merged_labels)
train_labels = np.array([number[0] for number in lb.transform(train_labels)])
test_labels = np.array([number[0] for number in lb.transform(test_labels)])

for i in range(length-3):
    # Create merged dataset of train and test to fit encoder as scikit works for numeric type only
    merged_data = []
    merged_data.extend(train[:,i])
    merged_data.extend(test[:,i])
    le.fit(merged_data)
    # Create transformed dataset of train and test
    train_input[:,i] = le.transform(train[:,i])
    test_input[:,i] = le.transform(test[:,i])

In [286]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

name = "ADA Boost"
clf = AdaBoostClassifier()
# Create merged input to create a scalar to normalize data
input = []
input.extend(train_input)
input.extend(test_input)

scaler = StandardScaler()
scaler.fit(input)

#Transform train and test data
train_input = scaler.transform(train_input)
test_input = scaler.transform(test_input)

#Train classifier on training set
clf.fit(train_input, train_labels)

#Get Predicted labels for test set
predicted_labels = clf.predict(test_input)

#Print report i.e. precision, recall, F1 for different classes
print(name)
print(classification_report(test_labels, predicted_labels, target_names = ['negative','positive']))

# Print mis-classified labels
print("Predicted Label: Negative Actual Label: Positive")
for i in range(len(test_labels)):
    print
    if predicted_labels[i] != test_labels[i] and predicted_labels[i] == 0:
        print(test[i])

print("Predicted Label: Positive Actual Label: Negative")
for i in range(len(test_labels)):
    if predicted_labels[i] != test_labels[i] and predicted_labels[i] == 1:
        print(test[i])



ADA Boost
             precision    recall  f1-score   support

   negative       0.87      0.98      0.92      2400
   positive       0.92      0.65      0.76      1015

avg / total       0.89      0.88      0.87      3415

Predicted Label: Negative Actual Label: Positive
['0' '0' '0' '0' '1' '2' '0' '0' '0' '0' '0' '30' '0' '0' '0' '0' '0'
 'houseground balls' 'datasets/Set_J_TEST/review-079.txt' 'positive']
['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 'salmon' 'datasets/Set_J_TEST/review-079.txt' 'positive']
['0' '0' '0' '0' '1' '92' '0' '0' '0' '0' '0' '141' '0' '0' '0' '0' '0'
 'polenta' 'datasets/Set_J_TEST/review-079.txt' 'positive']
['0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 'baby arugula and apple' 'datasets/Set_J_TEST/review-079.txt' 'positive']
['0' '0' '1' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0' '0' '0' '0' '0'
 'doublebarreled ravioli' 'datasets/Set_J_TEST/review-082.txt' 'positive']
['0' '0' '0' '0' '0' '0' '0' '0' '0' '