# GIK2FB - Labb 2 - Uppgift 2

## Data set:

[Wireless Indoor Localization Data Set](https://archive.ics.uci.edu/ml/datasets/Wireless+Indoor+Localization)

## Abstract

Collected in indoor space by observing signal strengths of seven WiFi signals visible on a smartphone. The decision variable is one of the four rooms.

## Data Set Information:

Collected to perform experimentation on how wifi signal strengths can be used to determine one of the indoor locations.

## Attribute Information:

Each attribute is wifi signal strength observed on smartphone.

## Credits

1. Rajen Bhatt, 'Fuzzy-Rough Approaches for Pattern Classification: Hybrid measures, Mathematical analysis, Feature selection algorithms, Decision tree algorithms, Neural learning, and Applications', Amazon Books 
2. Jayant G Rohra, Boominathan Perumal, Swathi Jamjala Narayanan, Priya Thakur, and Rajen B Bhatt, 'User Localization in an Indoor Environment Using Fuzzy Hybrid of Particle Swarm Optimization & Gravitational Search Algorithm with Neural Networks', in Proceedings of Sixth International Conference on Soft Computing for Problem Solving,2017, pp. 286-295.

# Import the data set and create dataframes

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Column names are kept in variables so that switching to another data set
# only requires changes in this one place
columns = ['Agent 1', 'Agent 2', 'Agent 3', 'Agent 4', 'Agent 5', 'Agent 6', 'Agent 7', 'Location']
attributes = columns[:-1]  # every column except the last one is an input attribute
target = columns[-1]       # the last column is the value to predict

# Import the data set and create the data frames.
# sep=r'\s+' splits on any run of whitespace; it replaces delim_whitespace=True,
# which is deprecated since pandas 2.2.
dataframe = pd.read_csv(
    'wifi_localization.txt',
    sep=r'\s+',
    header=None,
    usecols=list(range(len(columns))),
    names=columns,
)
data = dataframe.copy()         # untouched copy used for the "original data" runs
data_scaled = dataframe.copy()  # copy whose attribute columns are normalized below

# Create a scaler object
scaler = MinMaxScaler()

# Normalize the attribute columns of data_scaled; the target column is left as is.
# data_scaled stays a DataFrame because only its columns are reassigned.
# NOTE: the scaler is fitted on the full data set, i.e. before any train/test split.
data_scaled[attributes] = scaler.fit_transform(data_scaled[attributes])

# Do some exploratory data analysis

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Summary statistics, rounded to one decimal, for both versions of the data
print('Original data:')
display(data.describe().round(1))

print('Scaled data:')
display(data_scaled.describe().round(1))

# Pairwise scatter plots of all columns, coloured by the target column.
# pairplot returns a grid object; its figure gets a title placed slightly
# above the top edge (y=1.02) so that it does not overlap the plots.
ax1 = sns.pairplot(data, hue=target)
ax1.fig.suptitle("Original data", y=1.02)
ax2 = sns.pairplot(data_scaled, hue=target)
ax2.fig.suptitle("Normalized data", y=1.02)

## Single run with original data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Separate the input attributes from the label column
x = data.drop(columns=target)
y = data[target]

# Hold out a random 25 % of the rows for testing, train on the remaining 75 %
x_training_data, x_test_data, y_training_data, y_test_data = train_test_split(
    x,
    y,
    test_size=0.25,
)

# Fit a decision tree (no fixed seed) on the training part
model = DecisionTreeClassifier()
model.fit(x_training_data, y_training_data)

# Classify the held-out rows and show the predicted labels as the cell output
predictions = model.predict(x_test_data)
predictions

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import tree

# Plot the fitted decision tree. The nodes are labelled with the column names
# and the class labels so the splits can be read without looking up indices.
tree.plot_tree(
    model,
    feature_names=list(x_training_data.columns),
    class_names=[str(label) for label in model.classes_],
)
plt.show()

# Print the classification report for the test set
print('\nClassification report:\n', classification_report(y_test_data, predictions))

# Print the confusion matrix for the test set
print('\nConfusion matrix:\n', confusion_matrix(y_test_data, predictions))

## Single run with normalized data

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Split the normalized data into training and test sets.
# This cell must read data_scaled (not data), otherwise it only repeats the
# run on the original data.
x = data_scaled.drop(target, axis=1)
y = data_scaled[target]
x_training_data, x_test_data, y_training_data, y_test_data = train_test_split(x, y, test_size=0.25)

# Train the decision tree model
model = DecisionTreeClassifier(random_state=None)
model.fit(x_training_data, y_training_data)

# Predict the test set results and show them as the cell output
predictions = model.predict(x_test_data)
predictions

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import tree

# Plot the fitted decision tree. The nodes are labelled with the column names
# and the class labels so the splits can be read without looking up indices.
tree.plot_tree(
    model,
    feature_names=list(x_training_data.columns),
    class_names=[str(label) for label in model.classes_],
)
plt.show()

# Print the classification report for the test set
print('\nClassification report:\n', classification_report(y_test_data, predictions))

# Print the confusion matrix for the test set
print('\nConfusion matrix:\n', confusion_matrix(y_test_data, predictions))

## 100 runs with original data and 75/25 split for training/test

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import tree

# Per-run results; position i in both lists belongs to run i
accuracy_log = []
confusion_matrix_log = []

# The attributes and the labels are the same for every run, so build them once
x = data.drop(target, axis=1)
y = data[target]

# Do 100 runs, each with a fresh random 75/25 training/test split
for _ in range(100):
    x_training_data, x_test_data, y_training_data, y_test_data = train_test_split(x, y, test_size=0.25)

    # Train the decision tree model
    model = DecisionTreeClassifier(random_state=None)
    model.fit(x_training_data, y_training_data)

    # Predict the test set results
    predictions = model.predict(x_test_data)

    # Get the classification report as a dict. The values are not rounded in
    # this form ('digits' is ignored together with output_dict=True).
    report = classification_report(y_test_data, predictions, output_dict=True)

    # Save the accuracy and the confusion matrix of this run
    accuracy_log.append(report['accuracy'])
    confusion_matrix_log.append(confusion_matrix(y_test_data, predictions))


# Get the index of the run with the highest and of the run with the lowest
# accuracy (the first such run if several runs tie)
max_accuracy_index = accuracy_log.index(max(accuracy_log))
min_accuracy_index = accuracy_log.index(min(accuracy_log))

# Print the highest accuracy and the corresponding confusion matrix for that run
print('---Best run---\nAccuracy:\n', accuracy_log[max_accuracy_index],'\n')
print('Confusion matrix:\n', confusion_matrix_log[max_accuracy_index],'\n\n')

# Print the lowest accuracy and the corresponding confusion matrix for that run
print('---Worst run---\nAccuracy:\n', accuracy_log[min_accuracy_index],'\n')
print('Confusion matrix:\n', confusion_matrix_log[min_accuracy_index],'\n\n')

## 100 runs with original data and 5/95 split for training/test

Making minor changes to the split didn't affect the results much, so let's take it to the extreme and give the model only a very small share of the data for training. I believe the small effect might, in part, be due to this data set being large compared to the other recommended options.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import tree

# Per-run results; position i in both lists belongs to run i
accuracy_log = []
confusion_matrix_log = []

# The attributes and the labels are the same for every run, so build them once
x = data.drop(target, axis=1)
y = data[target]

# Do 100 runs, each with a fresh random split that keeps only 5 % of the rows
# for training and uses the remaining 95 % for testing
for _ in range(100):
    x_training_data, x_test_data, y_training_data, y_test_data = train_test_split(x, y, test_size=0.95)

    # Train the decision tree model
    model = DecisionTreeClassifier(random_state=None)
    model.fit(x_training_data, y_training_data)

    # Predict the test set results
    predictions = model.predict(x_test_data)

    # Get the classification report as a dict. The values are not rounded in
    # this form ('digits' is ignored together with output_dict=True).
    report = classification_report(y_test_data, predictions, output_dict=True)

    # Save the accuracy and the confusion matrix of this run
    accuracy_log.append(report['accuracy'])
    confusion_matrix_log.append(confusion_matrix(y_test_data, predictions))


# Get the index of the run with the highest and of the run with the lowest
# accuracy (the first such run if several runs tie)
max_accuracy_index = accuracy_log.index(max(accuracy_log))
min_accuracy_index = accuracy_log.index(min(accuracy_log))

# Print the highest accuracy and the corresponding confusion matrix for that run
print('---Best run---\nAccuracy:\n', accuracy_log[max_accuracy_index],'\n')
print('Confusion matrix:\n', confusion_matrix_log[max_accuracy_index],'\n\n')

# Print the lowest accuracy and the corresponding confusion matrix for that run
print('---Worst run---\nAccuracy:\n', accuracy_log[min_accuracy_index],'\n')
print('Confusion matrix:\n', confusion_matrix_log[min_accuracy_index],'\n\n')

## 100 runs with normalized data and 75/25 split for training/test

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import tree

# Per-run results; position i in both lists belongs to run i
accuracy_log = []
confusion_matrix_log = []

# The attributes and the labels are the same for every run, so build them once
# (this experiment uses the normalized copy of the data)
x = data_scaled.drop(target, axis=1)
y = data_scaled[target]

# Do 100 runs, each with a fresh random 75/25 training/test split
for _ in range(100):
    x_training_data, x_test_data, y_training_data, y_test_data = train_test_split(x, y, test_size=0.25)

    # Train the decision tree model
    model = DecisionTreeClassifier(random_state=None)
    model.fit(x_training_data, y_training_data)

    # Predict the test set results
    predictions = model.predict(x_test_data)

    # Get the classification report as a dict. The values are not rounded in
    # this form ('digits' is ignored together with output_dict=True).
    report = classification_report(y_test_data, predictions, output_dict=True)

    # Save the accuracy and the confusion matrix of this run
    accuracy_log.append(report['accuracy'])
    confusion_matrix_log.append(confusion_matrix(y_test_data, predictions))


# Get the index of the run with the highest and of the run with the lowest
# accuracy (the first such run if several runs tie)
max_accuracy_index = accuracy_log.index(max(accuracy_log))
min_accuracy_index = accuracy_log.index(min(accuracy_log))

# Print the highest accuracy and the corresponding confusion matrix for that run
print('---Best run---\nAccuracy:\n', accuracy_log[max_accuracy_index],'\n')
print('Confusion matrix:\n', confusion_matrix_log[max_accuracy_index],'\n\n')

# Print the lowest accuracy and the corresponding confusion matrix for that run
print('---Worst run---\nAccuracy:\n', accuracy_log[min_accuracy_index],'\n')
print('Confusion matrix:\n', confusion_matrix_log[min_accuracy_index],'\n\n')

## 100 runs with normalized data and 5/95 split for training/test

Making minor changes to the split didn't affect the results much, so let's take it to the extreme and give the model only a very small share of the data for training. I believe the small effect might be due to this data set being large compared to the other recommended options.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn import tree

# Per-run results; position i in both lists belongs to run i
accuracy_log = []
confusion_matrix_log = []

# The attributes and the labels are the same for every run, so build them once
# (this experiment uses the normalized copy of the data)
x = data_scaled.drop(target, axis=1)
y = data_scaled[target]

# Do 100 runs, each with a fresh random split that keeps only 5 % of the rows
# for training and uses the remaining 95 % for testing
for _ in range(100):
    x_training_data, x_test_data, y_training_data, y_test_data = train_test_split(x, y, test_size=0.95)

    # Train the decision tree model
    model = DecisionTreeClassifier(random_state=None)
    model.fit(x_training_data, y_training_data)

    # Predict the test set results
    predictions = model.predict(x_test_data)

    # Get the classification report as a dict. The values are not rounded in
    # this form ('digits' is ignored together with output_dict=True).
    report = classification_report(y_test_data, predictions, output_dict=True)

    # Save the accuracy and the confusion matrix of this run
    accuracy_log.append(report['accuracy'])
    confusion_matrix_log.append(confusion_matrix(y_test_data, predictions))


# Get the index of the run with the highest and of the run with the lowest
# accuracy (the first such run if several runs tie)
max_accuracy_index = accuracy_log.index(max(accuracy_log))
min_accuracy_index = accuracy_log.index(min(accuracy_log))

# Print the highest accuracy and the corresponding confusion matrix for that run
print('---Best run---\nAccuracy:\n', accuracy_log[max_accuracy_index],'\n')
print('Confusion matrix:\n', confusion_matrix_log[max_accuracy_index],'\n\n')

# Print the lowest accuracy and the corresponding confusion matrix for that run
print('---Worst run---\nAccuracy:\n', accuracy_log[min_accuracy_index],'\n')
print('Confusion matrix:\n', confusion_matrix_log[min_accuracy_index],'\n\n')