# Binary Classification

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# Reading csv files 

In [None]:
# Folder that holds every competition CSV; change it here if the dataset is mounted elsewhere
DATA_DIR = "/kaggle/input/brighton-a-memorable-city"

# Imports the csv files which contain the data of the training and testing photos
data_train = pd.read_csv(DATA_DIR + "/training.csv")
data_test = pd.read_csv(DATA_DIR + "/testing.csv")

# Auxiliary files shipped with the competition data
confidence = pd.read_csv(DATA_DIR + "/annotation_confidence.csv")
proportion = pd.read_csv(DATA_DIR + "/test_proportions.csv")
validSubmission = pd.read_csv(DATA_DIR + "/sample_valid_submission.csv")

# Extra training rows; the cell that prepares them fills their missing values
additional_train = pd.read_csv(DATA_DIR + "/additional_training.csv")

# Importing all used libraries

In [None]:
# Used for the preprocessing and feature selection
from sklearn.model_selection import train_test_split # Using scikit-learn to split data into training and testing sets
from sklearn.impute import SimpleImputer # Used when filling in missing values in the additional training data
from sklearn.feature_selection import chi2 # Chi-squared score function used for feature selection
from sklearn.feature_selection import SelectKBest # Keeps the k features with the highest scores
from imblearn.under_sampling import RandomUnderSampler # Used to balance the training data
from imblearn.over_sampling import RandomOverSampler # Used to balance the training data
from collections import Counter # To see class distribution

# Classifier libraries
from sklearn.neural_network import MLPClassifier # Multi-layer perceptron classifier
from sklearn.ensemble import RandomForestClassifier # Random-forest classifier
from sklearn.linear_model import LogisticRegression # Logistic-regression classifier
from sklearn import neighbors # kNN classifier
from sklearn.svm import SVC # Support vector classifier
from sklearn.linear_model import SGDClassifier # Linear classifier trained with stochastic gradient descent

from sklearn.metrics import accuracy_score # to find the accuracy of the model

# Used to build the link that downloads the csv file onto the local machine
from IPython.display import HTML
import pandas as pd # already imported in the first cell; repeated here
import numpy as np # already imported in the first cell; repeated here
import base64 # encodes the csv text so it can be embedded in the download link

# Creating the features and labels used

Retrieving the additional data, keeping only the CNN features (the first 4097 columns) and estimating the missing values with the mean of each column. Then retrieving the labelled data, again keeping only the CNN features (the first 4097 columns), and concatenating the two sets together.

In [None]:
# Separate the additional training data into features and labels.
# Only the first 4097 columns are kept as features (the GIST features are not used).
addfeatures = additional_train.iloc[:, :-1].values
addfeatures = addfeatures[:, 0:4097]
addLabels = additional_train.iloc[:, -1].values

# Labels are class ids, not measurements: averaging them would invent a fractional
# "class" that does not exist, so rows with a missing label are dropped instead of
# being mean-imputed. When no label is missing this keeps every row.
has_label = ~pd.isna(addLabels)
addfeatures = addfeatures[has_label]
addLabels = addLabels[has_label]

# Estimate the missing feature values using the mean of each column.
# The imputer is fitted on the features only, so `imp` stays usable for feature data.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
addfeatures = imp.fit_transform(addfeatures)

In [None]:
# Fully labelled data from data_train: the last column is the label, the rest are features.
# Only the first 4097 feature columns (the CNN part) are kept.
# NOTE(review): if column 0 is an ID column it is being used as a feature here - confirm.
labelled_features = data_train.iloc[:, :-1].values[:, 0:4097]
labelled_targets = data_train.iloc[:, -1].values

# Stack the labelled rows on top of the additional rows: one array of features, one of labels
features = np.concatenate((labelled_features, addfeatures))
labels = np.concatenate((labelled_targets, addLabels))

# Balance the class distribution

Use both oversampling and undersampling to balance the class distribution

In [None]:
print('Old class distriution:', Counter(labels))

# Both samplers pick rows at random; a fixed seed (the same value the train-test
# split uses) makes the balanced data set identical on every run.

# Oversampling strategy: duplicate minority rows until minority/majority = 0.2
over = RandomOverSampler(sampling_strategy=0.2, random_state=0)
# fit and apply the transform
X, y = over.fit_resample(features, labels)

# Undersampling strategy: drop majority rows until minority/majority = 0.8
under = RandomUnderSampler(sampling_strategy=0.8, random_state=0)
# fit and apply the transform
features, labels = under.fit_resample(X, y)

print('New class distriution:', Counter(labels))

# Filter Method - Chi-squared

Using the filter to reduce the high dimensionality of the feature set

In [None]:
# Chi-squared filter that keeps the 1000 highest-scoring feature columns.
# NOTE(review): the selector is fitted on all rows before the train-test split,
# so the held-out rows also influence which columns are kept.
chi = SelectKBest(chi2, k=1000)
# fit() returns the fitted selector, so fitting and reducing can be chained
features = chi.fit(features, labels).transform(features)

# Model selection - Train-test split

Used to evaluate the classifiers

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=0,
)

# Report the size of each part of the split
for caption, part in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(caption, part.shape)

# Evaluate each classifier

Trains and tests each classifier to evaluate the accuracy

In [None]:
# (name, estimator) pairs to compare, all with default hyperparameters.
# The estimators that draw random numbers while training (MLP, random forest, SGD)
# get a fixed seed so the printed accuracies are the same on every run.
models = [
    ('MLP', MLPClassifier(random_state=0)),
    ('RF', RandomForestClassifier(random_state=0)),
    ('LR', LogisticRegression()),
    ('kNN', neighbors.KNeighborsClassifier()),
    ('SV', SVC()),
    ('SGD', SGDClassifier(random_state=0)),
]

In [None]:
# Fit every candidate on the training split and report its accuracy on the held-out split.
# NOTE(review): the classes were resampled before the split, so duplicated minority
# rows can appear in both splits and make these scores optimistic.
predictions = []
for name, model in models:
    clf = model
    clf.fit(train_features, train_labels)
    # Predict once and reuse the result: clf.score() would run a second full
    # prediction pass only to compute the same accuracy.
    predicted = clf.predict(test_features)
    predictions.append(predicted)
    accuracy = accuracy_score(test_labels, predicted)
    print(name, accuracy)

# Testing
Transforms the testing features so they are the same as the training features and retrieves the predictions from the model

In [None]:
# Take the first 4097 columns of the test data (the CNN part, as for training)
test_cnn = data_test.values[:, 0:4097]
# Reduce them with the chi-squared selector that was fitted on the training features
test = chi.transform(test_cnn)

**Parameter tune the chosen classifier**

Using `RandomizedSearchCV` to retrieve the hyperparameters:

```python
max_depth = [int(x) for x in np.linspace(10, 110, num = 20)]
max_depth.append(None)
random_grid = {'n_estimators': [int(x) for x in np.linspace(start = 50, stop = 550, num = 100)],
               'max_features': ['auto', 'sqrt'],
               'max_depth': max_depth,
               'min_samples_split': [2, 5],
               'min_samples_leaf': [1, 2, 4]}
rf_random = RandomizedSearchCV(RandomForestClassifier(), random_grid, cv = 5, n_jobs = -1)
rf_random.fit(train_features, train_labels)
print(rf_random.score(test_features, test_labels))
print(rf_random.best_params_)
```

The best parameters are used to initialise the classifier then train it on the features and labels

In [None]:
# Random forest configured with the best hyperparameters found by the randomised search.
# A fixed seed makes the trained forest, and therefore the submitted predictions,
# identical on every run.
clf = RandomForestClassifier(
    n_estimators=337,
    min_samples_split=5,
    min_samples_leaf=1,
    max_features='sqrt',
    max_depth=104,
    random_state=0,
)
# Train on all the balanced, feature-selected data (not only the training split)
clf.fit(features, labels)
testPre = clf.predict(test) # Predict the test data labels

Turn the predictions into a valid submission

In [None]:
# Cast every predicted label to a plain Python int
predics = [int(label) for label in testPre]
# IDs are 1-based and follow the order of the predictions
row_ids = np.arange(1, int(len(testPre)+1))
df = pd.DataFrame(data={'ID': row_ids, 'prediction': predics})

Create a download link so the predictions can be saved as a csv file

In [None]:
def create_download_link(df, title = "Download CSV file", filename = "predictions.csv"):
    """Build an HTML link that downloads `df` as a csv file.

    The frame is written to csv text (without the index), base64-encoded and
    embedded in the link's `href` as a data URI.

    Parameters:
        df: DataFrame to export.
        title: text shown for the link.
        filename: name given to the downloaded file.

    Returns:
        An IPython `HTML` object wrapping the `<a>` element.
    """
    payload = base64.b64encode(df.to_csv(index=False).encode()).decode()
    template = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    return HTML(template.format(payload=payload, title=title, filename=filename))

# Left as the cell's last expression so the notebook renders the returned link
create_download_link(df)