In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.impute import SimpleImputer


# read in our data
data = pd.read_csv("ot.csv")

# tidy up column names
data.columns = data.columns.str.strip()

# replace , with .
data = data.stack().str.replace(',','.').unstack()

# drop columns with missing values
data = data.dropna(axis=1)

In [2]:
data

Unnamed: 0,Spatial Distribution,Temporal Distribution,T/R,Processor Utilization,Channel Waiting Time,Input Waiting Time,Network Response Time,Channel Utilization
0,UN,Client-Server,0.1,0.839546,1.974686,308.491814,700.514102,0.352431
1,UN,Client-Server,0.2,0.827412,9.556437,291.037663,864.599227,0.506302
2,UN,Client-Server,0.3,0.802605,27.027618,264.928002,839.372851,0.638516
3,UN,Client-Server,0.4,0.723403,61.848511,235.776888,1256.053108,0.767051
4,UN,Client-Server,0.5,0.72121,121.085884,189.680044,1343.875577,0.807812
...,...,...,...,...,...,...,...,...
635,PS,Asynchronous,0.6,0.79234,532.737732,363.094043,1214.057768,0.961042
636,PS,Asynchronous,0.7,0.707199,751.67511,224.638088,1653.736882,0.981944
637,PS,Asynchronous,0.8,0.61823,905.326843,206.922631,1421.304936,0.9625
638,PS,Asynchronous,0.9,0.564482,1048.269897,154.713952,2371.043062,0.979375


In [3]:
data['Spatial Distribution'].unique()

array(['UN', 'HR', 'BR', 'PS'], dtype=object)

In [4]:
data['Temporal Distribution'].unique()

array(['Client-Server', 'Asynchronous'], dtype=object)

In [5]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
data['Temporal Distribution'] = le.fit_transform(data['Temporal Distribution'])

In [6]:
dic = {'UN':0, 'HR':1, 'BR':2, 'PS':3}
data['Spatial Distribution']  = data['Spatial Distribution'].map(dic)

In [7]:
data

Unnamed: 0,Spatial Distribution,Temporal Distribution,T/R,Processor Utilization,Channel Waiting Time,Input Waiting Time,Network Response Time,Channel Utilization
0,0,1,0.1,0.839546,1.974686,308.491814,700.514102,0.352431
1,0,1,0.2,0.827412,9.556437,291.037663,864.599227,0.506302
2,0,1,0.3,0.802605,27.027618,264.928002,839.372851,0.638516
3,0,1,0.4,0.723403,61.848511,235.776888,1256.053108,0.767051
4,0,1,0.5,0.72121,121.085884,189.680044,1343.875577,0.807812
...,...,...,...,...,...,...,...,...
635,3,0,0.6,0.79234,532.737732,363.094043,1214.057768,0.961042
636,3,0,0.7,0.707199,751.67511,224.638088,1653.736882,0.981944
637,3,0,0.8,0.61823,905.326843,206.922631,1421.304936,0.9625
638,3,0,0.9,0.564482,1048.269897,154.713952,2371.043062,0.979375


In [8]:
data = data.reindex(columns=['Temporal Distribution', 'T/R',
       'Processor Utilization', 'Channel Waiting Time', 'Input Waiting Time',
       'Network Response Time', 'Channel Utilization','Spatial Distribution'])

In [9]:
data

Unnamed: 0,Temporal Distribution,T/R,Processor Utilization,Channel Waiting Time,Input Waiting Time,Network Response Time,Channel Utilization,Spatial Distribution
0,1,0.1,0.839546,1.974686,308.491814,700.514102,0.352431,0
1,1,0.2,0.827412,9.556437,291.037663,864.599227,0.506302,0
2,1,0.3,0.802605,27.027618,264.928002,839.372851,0.638516,0
3,1,0.4,0.723403,61.848511,235.776888,1256.053108,0.767051,0
4,1,0.5,0.72121,121.085884,189.680044,1343.875577,0.807812,0
...,...,...,...,...,...,...,...,...
635,0,0.6,0.79234,532.737732,363.094043,1214.057768,0.961042,3
636,0,0.7,0.707199,751.67511,224.638088,1653.736882,0.981944,3
637,0,0.8,0.61823,905.326843,206.922631,1421.304936,0.9625,3
638,0,0.9,0.564482,1048.269897,154.713952,2371.043062,0.979375,3


In [10]:
data.columns

Index(['Temporal Distribution', 'T/R', 'Processor Utilization',
       'Channel Waiting Time', 'Input Waiting Time', 'Network Response Time',
       'Channel Utilization', 'Spatial Distribution'],
      dtype='object')

In [11]:
X = ['Temporal Distribution', 'T/R',
       'Processor Utilization', 'Channel Waiting Time', 'Input Waiting Time',
       'Network Response Time', 'Channel Utilization']
X = data[X].values
y = data['Spatial Distribution']

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 640 entries, 0 to 639
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Temporal Distribution  640 non-null    int32 
 1   T/R                    640 non-null    object
 2   Processor Utilization  640 non-null    object
 3   Channel Waiting Time   640 non-null    object
 4   Input Waiting Time     640 non-null    object
 5   Network Response Time  640 non-null    object
 6   Channel Utilization    640 non-null    object
 7   Spatial Distribution   640 non-null    int64 
dtypes: int32(1), int64(1), object(6)
memory usage: 42.5+ KB


In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state = 42)

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Random Forest

In [15]:
pipeline1 = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=6)),
    ('classifier', RandomForestClassifier())
])

pipeline1.fit(X_train, y_train)

y_pred = pipeline1.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6510416666666666


In [33]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=5)),
    ('classifier', RandomForestClassifier())
])

# Define the parameter grid to search over
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [5, 10, None],
    'classifier__min_samples_leaf': [1, 2, 4],
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Evaluate the best model on the testing set
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Best parameters:", best_params)
print("Best score:", best_score)
print("Testing accuracy:", accuracy)

Best parameters: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__n_estimators': 100}
Best score: 0.6162546816479401
Testing accuracy: 0.625


# Svc

In [55]:
pipe = Pipeline([
    ('scaler', StandardScaler()),                      #first it will standardize the data then 
    ('pca', PCA(n_components = 5)),                                    #it will reduce the dimensions
    ('classifier', SVC())                              #then it will apply svm taking too much time
])
param_grid = {
    'pca__n_components': [3, 6, 7],
    'classifier__C': [1, 4, 17],
    'classifier__kernel': ['linear','rbf']
}

# Create a random forest classifier
# rf = RandomForestClassifier()

# Use grid search cross-validation to find the best hyperparameters
grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=2)
grid_search.fit(X, y)
y_pred = grid_search.predict(X_test)
print("Testing Accuracy score:", accuracy_score(y_test, y_pred)*100)
y_pred_train = grid_search.predict(X_train)

# Compute the accuracy score for the training set
train_accuracy = accuracy_score(y_train, y_pred_train)
print("Training accuracy score:", train_accuracy*100)

Testing Accuracy score: 86.45833333333334
Training accuracy score: 85.04464285714286


# KNeighborsClassifier

In [56]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=6)),
    ('classifier', KNeighborsClassifier())
])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.515625


In [57]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=6)),
    ('classifier', KNeighborsClassifier())
])

# Define the parameter grid to search over
param_grid = {
    'classifier__n_neighbors': [3, 5, 7],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__leaf_size': [10, 20, 30],
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best parameters and score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Evaluate the best model on the testing set
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Best parameters:", best_params)
print("Best score:", best_score)
print("Testing accuracy:", accuracy)

Best parameters: {'classifier__leaf_size': 10, 'classifier__n_neighbors': 7, 'classifier__weights': 'distance'}
Best score: 0.5066167290886392
Testing accuracy: 0.5208333333333334


In [58]:
grid_search.predict([[0,0.6,0.79234,532.737732,363.094043,1214.057768,0.961042]])


array([3], dtype=int64)