In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Recursively list every file available under the Kaggle input directory,
# printing one full path per line.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#import statements
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import neighbors, tree
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

from sklearn.linear_model import SGDRegressor, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error

# Dataset Information

This data set measures the running time of a matrix-matrix product A · B = C, where all matrices have size 2048 x 2048, using a parameterizable SGEMM GPU kernel with 241600 possible parameter combinations. For each tested combination, 4 runs were performed and their results are reported as the last 4 columns. All times are measured in milliseconds.

The experiment was run on a desktop workstation running Ubuntu 16.04 Linux with an Intel Core i5 (3.5GHz), 16GB RAM, and a NVidia Geforce GTX 680 4GB GF580 GTX-1.5GB GPU.

#### Attribute Information:

Independent variables:
- 1-2. MWG, NWG: per-matrix 2D tiling at workgroup level: {16, 32, 64, 128} (integer)
- 3. KWG: inner dimension of 2D tiling at workgroup level: {16, 32} (integer)
- 4-5. MDIMC, NDIMC: local workgroup size: {8, 16, 32} (integer)
- 6-7. MDIMA, NDIMB: local memory shape: {8, 16, 32} (integer)
- 8. KWI: kernel loop unrolling factor: {2, 8} (integer)
- 9-10. VWM, VWN: per-matrix vector widths for loading and storing: {1, 2, 4, 8} (integer)
- 11-12. STRM, STRN: enable stride for accessing off-chip memory within a single thread: {0, 1} (categorical)
- 13-14. SA, SB: per-matrix manual caching of the 2D workgroup tile: {0, 1} (categorical)

Output:
- 15-18. Run1, Run2, Run3, Run4: performance times in milliseconds for 4 independent runs using the same parameters. They range between 13.25 and 3397.08.

# Exploratory Data Analysis

Read data from our csv file.

In [None]:
# Load the SGEMM benchmark table into the working frame `df`.
df = pd.read_csv(
    '/kaggle/input/gpu-runtime/sgemm_product.csv'
)
# Switch matplotlib to seaborn's default look for every plot that follows.
sns.set()

First of all we will check if there are any nulls in our dataset.

In [None]:
# Count missing values per column, then report the frame's (rows, columns).
null_counts = df.isnull().sum()
print("Number of nulls for each column:")
print(null_counts)
print(df.shape)

We see that we have no nulls, so it will not be necessary to do any null value treatment.

We opt to collapse the target into a single column, the mean of the four runs, because all four runs measure the same program with the same parameters.

In [None]:
# The four repeated measurements of each configuration.
run_columns = ['Run1 (ms)', 'Run2 (ms)', 'Run3 (ms)', 'Run4 (ms)']

# Replace the four run columns by their row-wise mean, stored as 'Runtime'.
df['Runtime'] = df[run_columns].mean(axis=1)
df = df.drop(columns=run_columns)
df.head()

We can also check the type of our features using the function info from pandas.

In [None]:
# Print column names, non-null counts and dtypes of the working frame.
df.info()

We see as checked before that there are no nulls and also every feature is an int, except our target that is float, so it should not be necessary to do any special feature treatment. 

We can also visualize the distribution of each feature.

In [None]:
# Re-apply the seaborn default theme, then draw one histogram per column of df
# on a 14x16 inch figure.
sns.set()
df.hist(figsize=(14,16))

And as expected, we can see that every int variable is categorical while the target is continuous. We also realize that the target variable has some values that are worth studying in case they were outliers.

Additionally, we can see some more basic stats using the function describe from pandas.

In [None]:
# Summary statistics, transposed so that each column of df becomes one row.
df.describe().T

# Preprocessing

### Outliers

We see that although the Runtime mean is 217.57 and the median just 69.79, its maximum value is 3341.51, so it is worth checking if there are any outliers in our data.

Let's check with a boxplot and remove the outliers if necessary.

<img src="https://miro.medium.com/max/8000/1*0MPDTLn8KoLApoFvI0P2vQ.png" width="500" align="left">

In [None]:
# Boxplot of Runtime; the trailing ';' suppresses the Axes repr in the cell output.
sns.boxplot(x=df['Runtime']);

We opt to not take into account those values that form part of the outliers.

In [None]:
# Tukey fences: anything more than 1.5 * IQR outside the quartiles is treated
# as an outlier.
Q1, Q3 = (df['Runtime'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
MIN = Q1 - 1.5 * IQR
MAX = Q3 + 1.5 * IQR

# Keep only the rows strictly inside the fences.
inside_fences = (df['Runtime'] > MIN) & (df['Runtime'] < MAX)
df = df[inside_fences]
df.describe().T

In [None]:
# Same boxplot of Runtime, drawn again on the filtered frame; the trailing ';'
# suppresses the Axes repr in the cell output.
sns.boxplot(x=df['Runtime']);

Now, without the outliers, we can check the distribution of our target.

In [None]:
# Distribution plot of Runtime.
# NOTE(review): seaborn documents distplot as deprecated (since 0.11) in favour
# of histplot/displot — confirm the installed seaborn version before migrating.
sns.distplot(df['Runtime'])

### Target transformation

As the Runtime histogram shows, the distribution is strongly right-skewed, so it is a good idea to apply a logarithmic transformation.

In [None]:
# Natural log of the mean runtime becomes the modelling target.
log_runtime = np.log(df['Runtime'])
df['target'] = log_runtime
# Plot the distribution of the transformed target.
sns.distplot(df['target'])

This way the distribution of our target variable becomes much closer to a normal distribution.

As a result, now we can visualize a heatmap that will show us the correlation between our features and our target variable.

In [None]:
# Pairwise correlation of every column in df, drawn as an annotated heatmap.
correlations = df.corr()
plt.figure(figsize=(14, 14))
ax = sns.heatmap(
    correlations,
    annot=True,
    linewidths=.5,
    cmap=plt.cm.Blues,
)
plt.title('Variable Correlation')

We can also visualize the correlation with our target variable in a sorted and clearer way.

In [None]:
# Correlation of every column with 'target', sorted from most positive to most
# negative, on a fixed [-1, 1] colour scale.
target_correlation = df.corr()[['target']].sort_values(by='target', ascending=False)

plt.figure(figsize=(8, 12))
heatmap = sns.heatmap(
    target_correlation,
    vmin=-1,
    vmax=1,
    annot=True,
    cmap='BrBG',
)
heatmap.set_title('Features Correlating with target', fontdict={'fontsize':18}, pad=16);

### Normalization

To normalize our data and bring all the variables to the same range we should use a scaler. Since the outliers have been removed, we should have no problem using MinMaxScaler from sklearn. MinMaxScaler rescales the data set such that all feature values are in the range [0,1].

First of all we have to split our data into features (X) and target (Y).

In [None]:
# Features: every column except the raw runtime and its log-transformed target.
df_features = df.drop(columns=['target', 'Runtime'])
X = df_features.to_numpy()

# Target: the 'target' column flattened to a 1-D array.
df_target = df[['target']]
Y = df_target.to_numpy().ravel()

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Fit the min-max scaler on the feature matrix and replace X by its scaled
# version; `scaler` is reused by later cells.
scaler = MinMaxScaler()
X = scaler.fit(X).transform(X)
print(X)

# Regression

We will apply a regression model using Stochastic Gradient Descent from sklearn.

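### Regularization strength (alpha)

First of all we are going to study the effect of the parameter alpha. Note that in scikit-learn's SGDRegressor, alpha is the constant that multiplies the regularization term, not the learning rate; the learning rate is controlled separately by the `learning_rate` and `eta0` parameters.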

In [None]:
# Hold out 25% of the rows once, so that every alpha is scored on the same
# test set. The fixed random_state makes the split, and therefore the curve
# below, reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)
list_alpha = [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
list_MSE = []

for alpha in list_alpha:
    # Seed the estimator as well: SGD shuffles the training data, so without
    # a seed the reported MSE changes from run to run.
    regr = SGDRegressor(alpha=alpha, random_state=42)
    regr.fit(X_train, Y_train)
    Y_pred = regr.predict(X_test)
    MSE = mean_squared_error(Y_test, Y_pred)
    list_MSE.append(MSE)
    print(f'MSE with alpha={alpha}: {MSE}  /  N iterations to converge: {regr.n_iter_}')

# Test MSE as a function of alpha.
plt.plot(list_alpha, list_MSE)
plt.xlabel('Alpha')
plt.ylabel('MSE')
plt.show()

As we can see, the lower the alpha, the lower the MSE, but the model usually takes more iterations to converge.

### Feature selection

So far we have trained our model with every feature we have, now we will try to train it with 8 random features, using 0.0001 as alpha. 

Additionally, the following experiments will be implemented using cross validation, with K-fold=5, so as to prevent possible overfitting and get more reliable results.

In [None]:
# Score ten random 8-feature subsets with 5-fold cross validation.
for subset_seed in range(0, 10):
    # The sampling seed selects which 8 feature columns are drawn.
    X = df_features.sample(axis=1, random_state=subset_seed, n=8)
    print(f'Using features: {X.columns}')
    X = X.to_numpy()
    X = scaler.fit_transform(X)

    # Same random_state on every iteration: each subset is evaluated on the
    # same training rows, so MSE differences come from the features and not
    # from a different random split.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

    regr = SGDRegressor(alpha=0.0001, random_state=42)
    # The scorer returns negated MSE; abs() turns it back into a positive error.
    MSE = cross_val_score(regr, X_train, Y_train, cv=5, scoring="neg_mean_squared_error").mean()
    MSE = abs(MSE)
    print(f'MSE: {MSE}')

We see that some feature sets have a higher error than others, so now we are going to study the relevance of each feature by training our model from the most correlated feature to the least.

Firstly we will train it with just the most correlated feature, and in each iteration we will add the next most correlated until we have the whole dataset.

In [None]:
# Feature names in the order used by the incremental experiments below
# (most correlated with the target first, according to the notebook text).
best_features = [
    'MWG', 'SA', 'NWG', 'VWM', 'MDIMC', 'NDIMC', 'SB',
    'STRM', 'NDIMB', 'VWN', 'KWI', 'MDIMA', 'KWG', 'STRN',
]

In [None]:
list_MSE = []

# One experiment per prefix of best_features. The range is derived from the
# list length: the previous hard-coded range(1, 16) ran 15 iterations over a
# 14-element list, so the last point repeated the full feature set and the
# x-axis showed a feature count that does not exist.
feature_counts = range(1, len(best_features) + 1)

for n_features in feature_counts:
    X = df_features[best_features[:n_features]]
    print(f'Using features: {X.columns}')
    X = X.to_numpy()
    X = scaler.fit_transform(X)
    # Fixed random_state: every prefix is scored on the same training rows, so
    # the curve reflects the added feature and not a different random split.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

    regr = SGDRegressor(alpha=0.0001, random_state=42)
    # The scorer returns negated MSE; abs() turns it back into a positive error.
    MSE = cross_val_score(regr, X_train, Y_train, cv=5, scoring="neg_mean_squared_error").mean()
    MSE = abs(MSE)
    list_MSE.append(MSE)
    print(f'MSE: {MSE}')

# Cross-validated MSE against the number of features used.
plt.plot(feature_counts, list_MSE)
plt.xlabel('N features')
plt.ylabel('MSE')
plt.show()

As we can see once we reach the seven most correlated features together we get the lowest error, and the rest of the features do not affect the performance. So now we could opt just to train our model with less features.

Furthermore, to prove that the seven most correlated features are the ones that imply the best performance we can also try the same experiment but beginning with the least correlated.

In [None]:
# Work on a reversed copy instead of reversing best_features in place:
# in-place reversal made this cell non-idempotent (running it twice restored
# the original order) and silently changed the list for any cell run later.
reversed_features = best_features[::-1]
list_MSE = []

# Derived from the list length; the previous hard-coded range(1, 16) ran one
# iteration more than there are features.
feature_counts = range(1, len(reversed_features) + 1)

for n_features in feature_counts:
    X = df_features[reversed_features[:n_features]]
    print(f'Using features: {X.columns}')
    X = X.to_numpy()
    X = scaler.fit_transform(X)
    # Fixed random_state: every prefix is scored on the same training rows.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

    regr = SGDRegressor(alpha=0.0001, random_state=42)
    # The scorer returns negated MSE; abs() turns it back into a positive error.
    MSE = cross_val_score(regr, X_train, Y_train, cv=5, scoring="neg_mean_squared_error").mean()
    MSE = abs(MSE)
    list_MSE.append(MSE)
    print(f'MSE: {MSE}')

# Cross-validated MSE against the number of features used.
plt.plot(feature_counts, list_MSE)
plt.xlabel('N features')
plt.ylabel('MSE')
plt.show()

And as proven before we see that it is when we begin to use the most correlated features that the error gets lower.

Unfortunately we do not achieve a lower error than the one obtained with every feature, but with fewer features to train our model we may reduce our training time.

# Classification

First of all we have to convert this problem into a multi-classification problem, we will assign each value to the quartile it is part of.

In [None]:
# Rescale the full feature table for the classification experiments.
X = scaler.fit_transform(df_features)
# Bin the log-runtime target into 4 quantile classes; labels=False returns the
# integer bin index (0..3) instead of interval labels.
df_target = pd.qcut(df['target'].values, q=4, labels=False)
Y = df_target.ravel()
# Fixed random_state so that every classifier below is trained and tested on
# the same rows each time the notebook is re-run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

We should check if our dataset is balanced so as to know which metrics will be more reliable.

In [None]:
# Number of samples in each class label of Y.
unique, counts = np.unique(Y, return_counts=True)
class_sizes = {label: size for label, size in zip(unique, counts)}
print(class_sizes)

### Model selection

In the following experiment we will evaluate the following models:
- Decision Tree
- Nearest Neighbor
- Logistic Regression
- Random Forest

#### Decision Tree

In [None]:
# Decision tree: 5-fold cross validation on the training split, followed by an
# evaluation on the held-out test split. The seed makes the tree reproducible.
tree_classifier = tree.DecisionTreeClassifier(random_state=42)
tree_score = cross_val_score(tree_classifier, X_train, Y_train, cv=5, scoring="accuracy")
score = tree_score.mean()
print(f'Cross validation accuracy with K-fold=5: {score}')

tree_classifier.fit(X_train, Y_train)
Y_pred = tree_classifier.predict(X_test)

print(f'Test accuracy: {metrics.accuracy_score(Y_test, Y_pred)}')

# metrics.plot_confusion_matrix was removed in scikit-learn 1.2. Build the
# display from the predictions already computed instead, which also avoids a
# second predict() call.
disp = metrics.ConfusionMatrixDisplay(
    confusion_matrix=metrics.confusion_matrix(Y_test, Y_pred, labels=tree_classifier.classes_),
    display_labels=tree_classifier.classes_,
)
disp.plot(cmap=plt.cm.Blues)
disp.ax_.set_title('Test Results')

#### Nearest Neighbor

In [None]:
# 5-nearest-neighbours: 5-fold cross validation on the training split,
# followed by an evaluation on the held-out test split.
knn_classifier = neighbors.KNeighborsClassifier(n_neighbors=5)
knn_score = cross_val_score(knn_classifier, X_train, Y_train, cv=5, scoring="accuracy")
score = knn_score.mean()
print(f'Cross validation accuracy with K-fold=5: {score}')

knn_classifier.fit(X_train, Y_train)
Y_pred = knn_classifier.predict(X_test)

print(f'Test accuracy: {metrics.accuracy_score(Y_test, Y_pred)}')

# metrics.plot_confusion_matrix was removed in scikit-learn 1.2. Build the
# display from the predictions already computed instead; for k-NN this also
# avoids repeating the expensive neighbour search over the test set.
disp = metrics.ConfusionMatrixDisplay(
    confusion_matrix=metrics.confusion_matrix(Y_test, Y_pred, labels=knn_classifier.classes_),
    display_labels=knn_classifier.classes_,
)
disp.plot(cmap=plt.cm.Blues)
disp.ax_.set_title('Test Results')

We suppose that Nearest Neighbor performs worse because of the well-known "curse of dimensionality": with a high number of features, distances between points become less informative, and since every feature contributes equally to the distance, features that are not important for classification still influence the result.

Furthermore, Nearest Neighbor is a distance-based algorithm, so when working with large datasets its performance degrades.

#### Logistic Regression

In [None]:
# Logistic regression: 5-fold cross validation on the training split,
# followed by an evaluation on the held-out test split.
logistic_classifier = LogisticRegression()
logistic_score = cross_val_score(logistic_classifier, X_train, Y_train, cv=5, scoring="accuracy")
score = logistic_score.mean()
print(f'Cross validation accuracy with K-fold=5: {score}')

logistic_classifier.fit(X_train, Y_train)
Y_pred = logistic_classifier.predict(X_test)

print(f'Test accuracy: {metrics.accuracy_score(Y_test, Y_pred)}')

# metrics.plot_confusion_matrix was removed in scikit-learn 1.2. Build the
# display from the predictions already computed instead.
disp = metrics.ConfusionMatrixDisplay(
    confusion_matrix=metrics.confusion_matrix(Y_test, Y_pred, labels=logistic_classifier.classes_),
    display_labels=logistic_classifier.classes_,
)
disp.plot(cmap=plt.cm.Blues)
disp.ax_.set_title('Test Results')

We also see that Logistic Regression performance is the worst so far.

Logistic Regression only estimates a linear decision boundary, so when the labels are not linearly separable it can fail badly. As a result, we can infer that our classes are not linearly separable.

#### Random Forest

In [None]:
# Random forest: 5-fold cross validation on the training split, followed by an
# evaluation on the held-out test split. The seed makes the forest reproducible.
random_classifier = RandomForestClassifier(random_state=42)
random_score = cross_val_score(random_classifier, X_train, Y_train, cv=5, scoring="accuracy")
score = random_score.mean()
print(f'Cross validation accuracy with K-fold=5: {score}')

random_classifier.fit(X_train, Y_train)
Y_pred = random_classifier.predict(X_test)

print(f'Test accuracy: {metrics.accuracy_score(Y_test, Y_pred)}')

# metrics.plot_confusion_matrix was removed in scikit-learn 1.2. Build the
# display from the predictions already computed instead.
disp = metrics.ConfusionMatrixDisplay(
    confusion_matrix=metrics.confusion_matrix(Y_test, Y_pred, labels=random_classifier.classes_),
    display_labels=random_classifier.classes_,
)
disp.plot(cmap=plt.cm.Blues)
disp.ax_.set_title('Test Results')

In [None]:
# Per-fold cross-validation accuracies of each model, keyed by the tick label
# shown on the plot ('Logisitic' was a typo in the displayed label).
my_dict = {'Tree': tree_score, '5-NN': knn_score, 'Logistic': logistic_score, 'Forest': random_score}

fig, ax = plt.subplots()
# Pass real lists instead of dict views: boxplot expects a sequence of arrays,
# and view objects are not handled uniformly across matplotlib versions.
ax.boxplot(list(my_dict.values()))
ax.set_xticklabels(list(my_dict.keys()))
plt.ylabel('Accuracy')
We can conclude that Decision Tree and Random Forest give the best accuracy because every feature is categorical.

Even though Random Forest is an ensemble method built from Decision Trees, its performance is not better, so we will opt to perform a hyperparameter search over Random Forest to find out whether it can achieve a better accuracy.

### Hyperparameter Search

In [None]:
# Search grid for the random forest: split criterion, fraction of features
# considered at each split, and number of trees (50, 100, 150, 200).
parameters = dict(
    criterion=['gini', 'entropy'],
    max_features=[0.1, 0.3, 0.5],
    n_estimators=range(50, 201, 50),
)

In [None]:
# Exhaustive 5-fold grid search over `parameters`, scored by accuracy.
# random_state makes the forests, and so the selected parameters, reproducible.
# n_jobs=-1 spreads the fits (24 combinations x 5 folds) over all CPU cores
# without changing the result.
rf = RandomForestClassifier(random_state=42)
clf = GridSearchCV(estimator=rf, param_grid=parameters, scoring='accuracy', cv=5, verbose=0, n_jobs=-1)
# fit() returns the fitted search object itself, so grid_search and clf refer
# to the same instance.
grid_search = clf.fit(X_train, Y_train)

# best score achieved during the GridSearchCV
print('GridSearch CV best score : {:.4f}\n'.format(grid_search.best_score_))

# print parameters that give the best results
print('Parameters that give the best results :','\n', (grid_search.best_params_))

# print estimator that was chosen by the GridSearch
print('\n\nEstimator that was chosen by the search :','\n', (grid_search.best_estimator_))