# Instance Family Type Selection Model

In [29]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as md
import numpy as np
from datetime import date
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import svm, datasets
import statsmodels.api as sm
from scipy import stats
from sklearn.utils import shuffle
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [9]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)
df_family = pd.read_csv('/content/gdrive/My Drive/Research/AWS/Instance_Family.csv')
df_family.head()

Mounted at /content/gdrive


Unnamed: 0,Instance_Family,Processing_Power,Memory_Usage,Network,Storage,Graphical_Processing
0,General Purpose,12,32.0,6,Low,Low
1,General Purpose,2,0.5,3,Low,Low
2,General Purpose,2,1.0,3,Low,Low
3,General Purpose,2,2.0,3,Low,Low
4,General Purpose,2,4.0,3,Low,Low


In [10]:
df_family.dtypes

Instance_Family          object
Processing_Power          int64
Memory_Usage            float64
Network                   int64
Storage                  object
Graphical_Processing     object
dtype: object

In [11]:
df_family['Storage'].value_counts()

Low       524
Normal    431
High      231
Name: Storage, dtype: int64

In [12]:
storage_types = {
                 'Low':1,
                 'Normal':2,
                 'High':3
                 }  
graphical_types = {
                 'Low':1,
                 'Normal':2,
                 'High':3
                 }                  

df_family['Storage'] = df_family.Storage.map (storage_types)
df_family['Graphical_Processing'] = df_family.Graphical_Processing.map (graphical_types)
df_family

Unnamed: 0,Instance_Family,Processing_Power,Memory_Usage,Network,Storage,Graphical_Processing
0,General Purpose,12,32.0,6,1,1
1,General Purpose,2,0.5,3,1,1
2,General Purpose,2,1.0,3,1,1
3,General Purpose,2,2.0,3,1,1
4,General Purpose,2,4.0,3,1,1
...,...,...,...,...,...,...
1181,Accelerated Computing,32,244.0,7,2,3
1182,Accelerated Computing,64,488.0,10,2,3
1183,Accelerated Computing,8,122.0,5,2,3
1184,Accelerated Computing,16,244.0,5,2,3


In [13]:
import sklearn.model_selection as ms
from sklearn import linear_model
from sklearn import preprocessing

In [14]:
label_encoder = preprocessing.LabelEncoder()
df_family['Instance_Family'] = label_encoder.fit_transform(df_family['Instance_Family'])
df_family.head()

Unnamed: 0,Instance_Family,Processing_Power,Memory_Usage,Network,Storage,Graphical_Processing
0,2,12,32.0,6,1,1
1,2,2,0.5,3,1,1
2,2,2,1.0,3,1,1
3,2,2,2.0,3,1,1
4,2,2,4.0,3,1,1


In [15]:
y = np.asarray(df_family['Instance_Family'])
print(y)

X = df_family.iloc[:,1:].values
X

[2 2 2 ... 0 0 0]


array([[1.20e+01, 3.20e+01, 6.00e+00, 1.00e+00, 1.00e+00],
       [2.00e+00, 5.00e-01, 3.00e+00, 1.00e+00, 1.00e+00],
       [2.00e+00, 1.00e+00, 3.00e+00, 1.00e+00, 1.00e+00],
       ...,
       [8.00e+00, 1.22e+02, 5.00e+00, 2.00e+00, 3.00e+00],
       [1.60e+01, 2.44e+02, 5.00e+00, 2.00e+00, 3.00e+00],
       [6.40e+01, 9.76e+02, 1.00e+01, 2.00e+00, 3.00e+00]])

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [17]:
from sklearn.metrics import classification_report
from sklearn import metrics

In [18]:
rbf = svm.SVC(kernel='rbf', gamma='auto', C=4, decision_function_shape='ovo').fit(X_train, y_train)
rbf_pred = rbf.predict(X_test)
print(classification_report(y_test, rbf_pred))
print("Accuracy: " , metrics.accuracy_score(y_test, rbf_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        69
           1       1.00      1.00      1.00        37
           2       1.00      0.91      0.96        47
           3       0.91      1.00      0.95        40
           4       1.00      1.00      1.00        45

    accuracy                           0.98       238
   macro avg       0.98      0.98      0.98       238
weighted avg       0.98      0.98      0.98       238

Accuracy:  0.9831932773109243


In [19]:
poly = svm.SVC(kernel='poly', degree=5, C=1, decision_function_shape='ovo').fit(X_train, y_train)
poly_pred = poly.predict(X_test)
print(classification_report(y_test, poly_pred))
print("Accuracy: " , metrics.accuracy_score(y_test, poly_pred))

              precision    recall  f1-score   support

           0       0.29      1.00      0.46        69
           1       0.00      0.00      0.00        37
           2       0.00      0.00      0.00        47
           3       1.00      0.10      0.18        40
           4       0.00      0.00      0.00        45

    accuracy                           0.31       238
   macro avg       0.26      0.22      0.13       238
weighted avg       0.25      0.31      0.16       238

Accuracy:  0.3067226890756303


  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
linear = svm.SVC(kernel='linear', C=1, decision_function_shape='ovo').fit(X_train, y_train)
linear_pred = linear.predict(X_test)
print(classification_report(y_test, linear_pred))
print("Accuracy: " , metrics.accuracy_score(y_test, linear_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        69
           1       1.00      1.00      1.00        37
           2       1.00      1.00      1.00        47
           3       1.00      1.00      1.00        40
           4       1.00      1.00      1.00        45

    accuracy                           1.00       238
   macro avg       1.00      1.00      1.00       238
weighted avg       1.00      1.00      1.00       238

Accuracy:  1.0


In [21]:
sig = svm.SVC(kernel='sigmoid', C=1, decision_function_shape='ovo').fit(X_train, y_train)
sig_pred = sig.predict(X_test)
print(classification_report(y_test, sig_pred))
print("Accuracy: " , metrics.accuracy_score(y_test, sig_pred))

              precision    recall  f1-score   support

           0       0.48      0.14      0.22        69
           1       0.33      0.19      0.24        37
           2       0.21      0.40      0.28        47
           3       0.19      0.50      0.28        40
           4       0.00      0.00      0.00        45

    accuracy                           0.24       238
   macro avg       0.24      0.25      0.20       238
weighted avg       0.26      0.24      0.20       238

Accuracy:  0.23529411764705882


  _warn_prf(average, modifier, msg_start, len(result))


According to Result, I can select Linear kernal is best prediction model for selecting Family Type

In [49]:
family_predict = linear_pred
x1 = [[2, 15, 5, 1,2]]
cus_pred2 = linear.predict(x1)
family_type = cus_pred2[0]

********************************************************************************************************************************************************************************************************************************************************************************************************************************
Next, We predict what is the best model and prediction result for Family type. 
So, we need to create 5 models for 5 Family types 
Lets begin,

# General Purpose Family Type


In [33]:
df_general = pd.read_csv('/content/gdrive/My Drive/Research/AWS/modify_date_general_purpose.csv')
df_general = df_general.loc[df_general.Instance_Family == "General purpose"]
# dummy data for newly added Instance types
df_dummy = pd.read_csv('/content/gdrive/My Drive/Research/AWS/Instance_Type_02.csv')
df_general_table = df_general.copy()
df_general.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,EffectiveDates,PricePerUnit,Project_Type,Project_Size,LeaseContractLength,PurchaseOption,Unnamed: 6,OfferingClass,Location,Instance_Type,Current_Generation,Unnamed: 11,Instance_Family,vCPU,Clock_Speed,Memory,Storage,Network_Performance,Operating_System,Tenancy,usageType,Dedicated EBS Throughput,Normalization Size Factor,Processor Features
0,4/30/2017,0.01,,medium,1,No Upfront,,standard,US East (Ohio),m4.xlarge,Yes,,General purpose,4.0,2.4 GHz,16.0,EBS,High,RHEL,Dedicated,USE2-DedicatedUsage:m4.xlarge,750 Mbps,8.0,Intel AVX; Intel AVX2; Intel Turbo
1,3/31/2018,0.01,,medium,3,Partial Upfront,,convertible,US East (N. Virginia),m4.large,Yes,,General purpose,2.0,2.4 GHz,8.0,EBS,Moderate,Linux,Dedicated,DedicatedUsage:m4.large,450 Mbps,4.0,Intel AVX; Intel AVX2; Intel Turbo
4,7/1/2019,0.01,,medium,3,All Upfront,,standard,US West (N. California),m3.xlarge,No,,General purpose,4.0,2.5 GHz,15.0,SSD,High,RHEL,Shared,USW1-BoxUsage:m3.xlarge,,8.0,Intel AVX; Intel Turbo
5,3/1/2019,0.01,,medium,3,All Upfront,,convertible,US East (N. Virginia),m5ad.2xlarge,Yes,,General purpose,8.0,2.5 GHz,32.0,SSD,Up to 10,Windows,Shared,BoxUsage:m5ad.2xlarge,Upto 2120 Mbps,16.0,"AVX, AVX2, AMD Turbo"
8,10/1/2018,0.01,,medium,1,AllUpfront,,standard,US West (Oregon),a1,Yes,,General purpose,16.0,2.3 GHz,,EBS,,Linux,Reserved,USW2-ReservedHostUsage:a1,,,


In [34]:
df_dummy

Unnamed: 0,Instance_Type,Project_Type,Project_Size,vCPU,Memory,Network_Performance
0,AWS_T4g,1,small,2,0.5,3
1,AWS_T4g,2,small,2,1.0,3
2,AWS_T4g,5,small,2,2.0,3
3,AWS_T4g,7,small,2,4.0,3
4,AWS_T4g,8,small,2,8.0,3
...,...,...,...,...,...,...
1511,AWS_M5a,12,medium,16,64.0,5
1512,AWS_M5a,12,medium,32,128.0,5
1513,AWS_M5a,12,large,48,192.0,7
1514,AWS_M5a,12,large,64,256.0,8


In [35]:
df_general["Memory"] = df_general["Memory"].astype("float64")
df_general["EffectiveDates"] = df_general["EffectiveDates"].astype("datetime64")
df_general["Project_Type"] = df_general["Project_Type"].astype(str)
df_general["Storage"] = df_general["Storage"].astype(str)
df_general["Processor_Archi"] = df_general["Storage"].astype(str)

columns = ['Instance_Type', 'Project_Type', 'Project_Size', 'vCPU', 'Memory', 'Network_Performance']
df_general = df_general.reindex(columns=columns)
df1 = df_general[columns]

df1['Instance_Type'] = df1['Instance_Type'].replace({'m4.xlarge': 'AWS_M4', 'm4.large': 'AWS_M4', 'm4.2xlarge': 'AWS_M4', 'm4.4xlarge': 'AWS_M4', 'm4.10xlarge': 'AWS_M4', 'm4.16xlarge': 'AWS_M4', 'm4': 'AWS_M4'})

df1['Instance_Type'] = df1['Instance_Type'].replace({'m3.xlarge': 'AWS_M4', 'm3.large': 'AWS_M4', 'm3.2xlarge': 'AWS_M4', 'm3.4xlarge': 'AWS_M4', 'm3.10xlarge': 'AWS_M4', 'm3.16xlarge': 'AWS_M4', 'm3': 'AWS_M4', 'm3.medium': 'AWS_M4'})

df1['Instance_Type'] = df1['Instance_Type'].replace({'m5n.large': 'AWS_M5n', 'm5n.xlarge': 'AWS_M5n','m5n.2xlarge': 'AWS_M5n', 'm5n.4xlarge': 'AWS_M5n','m5n.8large': 'AWS_M5n', 'm5n.12xlarge': 'AWS_M5n', 'm5n.16xlarge': 'AWS_M5n','m5n.24large': 'AWS_M5n', 'm5dn.large': 'AWS_M5n', 'm5dn.xlarge': 'AWS_M5n', 'm5dn.2xlarge': 'AWS_M5n', 'm5dn.4xlarge': 'AWS_M5n', 'm5dn.8xlarge': 'AWS_M5n', 'm5dn.12xlarge': 'AWS_M5n', 'm5dn.16xlarge': 'AWS_M5n', 'm5dn.24xlarge': 'AWS_M5n'})


df1['Instance_Type'] = df1['Instance_Type'].replace({'m5a.large': 'AWS_M5a', 'm5a.xlarge': 'AWS_M5a','m5a.2xlarge': 'AWS_M5a', 'm5a.4xlarge': 'AWS_M5a','m5a.8xlarge': 'AWS_M5a', 'm5a.12xlarge': 'AWS_M5a', 'm5a.16xlarge': 'AWS_M5a','m5a.24xlarge': 'AWS_M5a', 'm5ad.large': 'AWS_M5a', 'm5ad.xlarge': 'AWS_M5a', 'm5ad.2xlarge': 'AWS_M5a', 'm5ad.4xlarge': 'AWS_M5a', 'm5ad.8xlarge': 'AWS_M5a', 'm5ad.12xlarge': 'AWS_M5a', 'm5ad.16xlarge': 'AWS_M5a', 'm5ad.24xlarge': 'AWS_M5a'})


df1['Instance_Type'] = df1['Instance_Type'].replace({'m5.large': 'AWS_M5', 'm5.xlarge': 'AWS_M5','m5.2xlarge': 'AWS_M5', 'm5.4xlarge': 'AWS_M5','m5.8xlarge': 'AWS_M5', 'm5.12xlarge': 'AWS_M5', 'm5.16xlarge': 'AWS_M5','m5.24xlarge': 'AWS_M5a', 'm5.metal': 'AWS_M5', 'm5d.large': 'AWS_M5', 'm5d.xlarge': 'AWS_M5', 'm5d.2xlarge': 'AWS_M5', 'm5d.4xlarge': 'AWS_M5', 'm5d.8xlarge': 'AWS_M5', 'm5d.12xlarge': 'AWS_M5', 'm5d.16xlarge': 'AWS_M5', 'm5d.24xlarge': 'AWS_M5', 'm5d.metal': 'AWS_M5', 'm5': 'AWS_M5', 'm5d': 'AWS_M5'})


df1['Instance_Type'] = df1['Instance_Type'].replace({'t2.nano': 'AWS_T2', 't2.micro': 'AWS_T2','t2.small': 'AWS_T2', 't2.medium': 'AWS_T2','t2.large': 'AWS_T2', 't2.xlarge': 'AWS_T2', 't2.2xlarge': 'AWS_T2'})

df1['Instance_Type'] = df1['Instance_Type'].replace({'t3a.nano': 'AWS_T3a', 't3a.micro': 'AWS_T3a','t3a.small': 'AWS_T3a', 't3a.medium': 'AWS_T3a','t3a.large': 'AWS_T3a', 't3a.xlarge': 'AWS_T3a', 't3a.2xlarge': 'AWS_T3a'})


df1['Instance_Type'] = df1['Instance_Type'].replace({'t3.nano': 'AWS_T3', 't3.micro': 'AWS_T3','t3.small': 'AWS_T3', 't3.medium': 'AWS_T3','t3.large': 'AWS_T3', 't3.xlarge': 'AWS_T3', 't3.2xlarge': 'AWS_T3'})


df1['Instance_Type'] = df1['Instance_Type'].replace({'t4g.nano': 'AWS_T4g', 't4g.micro': 'AWS_T4g','t4g.small': 'AWS_T4g', 't4g.medium': 'AWS_T4g','t4g.large': 'AWS_T4g', 't4g.xlarge': 'AWS_T4g', 't4g.2xlarge': 'AWS_T4g'})


df1['Instance_Type'] = df1['Instance_Type'].replace({'a1.medium': 'AWS_A1', 'a1.large': 'AWS_A1','a1.xlarge': 'AWS_A1', 'a1.2xlarge': 'AWS_A1','a1.4xlarge': 'AWS_A1', 'a1.metal': 'AWS_A1', 'a1': 'AWS_A1'})

df1['Instance_Type'] = df1['Instance_Type'].replace({'m6g.medium': 'AWS_M6g', 'm6g.large': 'AWS_M6g', 'm6g.xlarge': 'AWS_M6g','m6g.2xlarge': 'AWS_M6g', 'm6g.4xlarge': 'AWS_M6g','m6gd.8xlarge': 'AWS_M6g', 'm6gd.12xlarge': 'AWS_M6g', 'm6gd.16xlarge': 'AWS_M6g','m6gd.metal': 'AWS_M6g','m6gd.medium': 'AWS_M6g', 'm6gd.large': 'AWS_M6g', 'm6gd.xlarge': 'AWS_M6g', 'm6gd.2xlarge': 'AWS_M6g', 'm6gd.4xlarge': 'AWS_M6g', 'm6gd.8xlarge': 'AWS_M6g', 'm6gd.12xlarge': 'AWS_M6g', 'm6gd.16xlarge': 'AWS_M6g', 'm6gd.24xlarge': 'AWS_M6g', 'm6gd.metal': 'AWS_M6g', 'm1.small': 'AWS_M6g', 'm1.medium': 'AWS_M6g', 'm1.large': 'AWS_M6g', 'm1.xlarge': 'AWS_M6g'})

df1.head()

Unnamed: 0,Instance_Type,Project_Type,Project_Size,vCPU,Memory,Network_Performance
0,AWS_M4,,medium,4.0,16.0,High
1,AWS_M4,,medium,2.0,8.0,Moderate
4,AWS_M4,,medium,4.0,15.0,High
5,AWS_M5a,,medium,8.0,32.0,Up to 10
8,AWS_A1,,medium,16.0,,


In [37]:
def fixNetworkPerformanceIssue (row):
   if (row['Instance_Type'] == 'AWS_A1') :
      return 5
   if (row['Instance_Type'] == 'AWS_T3a') :
      return 3
   if (row['Instance_Type'] == 'AWS_T3') :
      return 3
   if (row['Instance_Type'] == 'AWS_T4g') :
      return 3   
   if (row['Instance_Type'] == 'AWS_M4') :
      if (row['vCPU'] >= 0 and row['vCPU'] < 4) :
        return 4  
      if (row['vCPU'] >= 4 and row['vCPU'] < 40) :
        return 6
      if (row['vCPU'] >= 40 and row['vCPU'] < 64) :
        return 7    
      if (row['vCPU'] >= 64) :
        return 10    
   if (row['Instance_Type'] == 'AWS_M5a') :
      if (row['vCPU'] >= 0 and row['vCPU'] < 48) :
        return 5
      if (row['vCPU'] >= 48 and row['vCPU'] < 64) :
        return 7
      if (row['vCPU'] >= 64 and row['vCPU'] < 96) :
        return 8    
      if (row['vCPU'] >= 96) :
        return 9 
   if (row['Instance_Type'] == 'AWS_M6g') :
      if (row['vCPU'] >= 0 and row['vCPU'] < 32) :
        return 5 
      if (row['vCPU'] >= 32 and row['vCPU'] < 48) :
        return 8
      if (row['vCPU'] >= 48 and row['vCPU'] < 64) :
        return 9    
      if (row['vCPU'] >= 64) :
        return 10
   if (row['Instance_Type'] == 'AWS_M5') :
      if (row['vCPU'] >= 0 and row['vCPU'] < 32) :
        return 5 
      if (row['vCPU'] >= 32 and row['vCPU'] < 48) :
        return 7 
      if (row['vCPU'] >= 48 and row['vCPU'] < 64) :
        return 8    
      if (row['vCPU'] >= 64 and row['vCPU'] < 96) :
        return 9     
      if (row['vCPU'] >= 96) :
        return 10      
   if (row['Instance_Type'] == 'AWS_T2') :
      if (row['vCPU'] >= 0 and row['vCPU'] < 1) :
        return 1  
      if (row['vCPU'] >= 1 and row['vCPU'] < 4) :
        return 2 
      if (row['vCPU'] >= 4) :
        return 4
   else :
      return row['Network_Performance']       
    

def groupedMemory (row):
   if (row['Memory'] > 0 and row['Memory'] < 1) :
      return 1
   if (row['Memory'] >= 1 and row['Memory'] < 2) :
      return 2  
   if (row['Memory'] >= 2 and row['Memory'] < 4) :
      return 3
   if (row['Memory'] >= 4 and row['Memory'] < 8) :
      return 4
   if (row['Memory'] >= 8 and row['Memory'] < 16) :
      return 5
   if (row['Memory'] >= 16 and row['Memory'] < 32) :
      return 6
   if (row['Memory'] >= 32 and row['Memory'] < 64) :
      return 7
   if (row['Memory'] >= 64 and row['Memory'] < 96) :
      return 8
   if (row['Memory'] >= 96 and row['Memory'] < 128) :
      return 9
   if (row['Memory'] >= 128 and row['Memory'] < 160) :
      return 10 
   if (row['Memory'] >= 160 and row['Memory'] < 256) :
      return 11
   if (row['Memory'] >= 256 and row['Memory'] < 384) :
      return 12
   if (row['Memory'] >= 384) :
      return 13
   else :
      return row['Memory'] 


def groupedvCPU (row):
   if (row['vCPU'] > 0 and row['vCPU'] < 2) :
      return 1
   if (row['vCPU'] >= 2 and row['vCPU'] < 4) :
      return 2
   if (row['vCPU'] >= 4 and row['vCPU'] < 8) :
      return 3
   if (row['vCPU'] >= 8 and row['vCPU'] < 12) :
      return 4
   if (row['vCPU'] >= 12 and row['vCPU'] < 16) :
      return 5
   if (row['vCPU'] >= 16 and row['vCPU'] < 24) :
      return 6 
   if (row['vCPU'] >= 24 and row['vCPU'] < 32) :
      return 7
   if (row['vCPU'] >= 32 and row['vCPU'] < 48) :
      return 8 
   if (row['vCPU'] >= 48 and row['vCPU'] < 64) :
      return 9
   if (row['vCPU'] >= 64 and row['vCPU'] < 72) :
      return 10  
   if (row['vCPU'] >= 72 and row['vCPU'] < 96) :
      return 11
   if (row['vCPU'] >= 96) :
      return 12
   else :
      return row['vCPU'] 

def setMemory (row):
   if(pd.isnull(row['Memory'])) :
       if (row['vCPU'] == 1) :
          return 2
       if (row['vCPU'] == 2) :
          return 3
       if (row['vCPU'] == 3) :
          return 6
       if (row['vCPU'] == 4) :
          return 7
       if (row['vCPU'] == 5) :
          return 7
       if (row['vCPU'] == 6) :
          return 8  
       if (row['vCPU'] == 7) :
          return 8
       if (row['vCPU'] == 8) :
          return 10 
       if (row['vCPU'] == 9) :
          return 11
       if (row['vCPU'] == 10) :
          return 11
       if (row['vCPU'] == 11) :
          return 12
       if (row['vCPU'] == 12) :
          return 13          
   else :
       return row['Memory'] 


def setCPU (row):
   if(pd.isnull(row['vCPU'])) : 
       if (row['Memory'] == '0-1') :
          return '1-2'
       if (row['Memory'] == '1-2') :
          return '1-2'  
       if (row['Memory'] == '2-4') :
          return '1-2'
       if (row['Memory'] == '4-8') :
          return '2-4'
       if (row['Memory'] == '8-16') :
          return '2-4'
       if (row['Memory'] == '16-32') :
          return '4-8'
       if (row['Memory'] == '32-64') :
          return '12-16'
       if (row['Memory'] == '64-96') :
          return '24-32'
       if (row['Memory'] == '96-128') :
          return '48-64'
       if (row['Memory'] == '128-160') :
          return '32-48' 
       if (row['Memory'] == '160-256') :
          return '48-64'
       if (row['Memory'] == '256-384') :
          return '72-96'
       if (row['Memory'] == '384') :
          return '96' 
   else :
       return row['vCPU'] 

def setStorage (row):
       if (row['Instance_Type'] == 'AWS_A1' or row['Instance_Type'] == 'AWS_T4g' or row['Instance_Type'] == 'AWS_T3' or row['Instance_Type'] == 'AWS_T3a' or row['Instance_Type'] == 'AWS_T2' or row['Instance_Type'] == 'AWS_M4') :
          return 'EBS'
       if (row['Storage'] == '1 x 410' or row['Storage'] == '1 x 160' or row['Storage'] == '1 x 420' or row['Storage'] == '2 x 420') :   
          return 'SSD'
       else :
          return row['Storage']  

def setStorageSize (row):
       if (row['Storage'] == 'EBS') :
          return 1
       else :
          return 2              

def setProject_Size (row):
       if (row['Project_Size'] == 'small') :
          return 1
       if (row['Project_Size'] == 'medium') :
          return 2  
       if (row['Project_Size'] == 'large') :
          return 3     

def setProcessor_Archi (row):
       if (row['Instance_Type'] == 'AWS_A1' or row['Instance_Type'] == 'AWS_T4g' or row['Instance_Type'] == 'AWS_M6g') :
          return 2
       else :
          return 1                

def setProjectType (row):
   if (row['Instance_Type'] == 'AWS_A1' and row['vCPU'] == 1) :
      return 1
   if row['Instance_Type'] == 'AWS_A1' and row['vCPU'] == 2 :
      return 4
   if row['Instance_Type'] == 'AWS_A1' and row['vCPU'] == 3 :
      return 3
   if row['Instance_Type'] == 'AWS_A1' and (row['vCPU'] == 4 or row['vCPU'] == 5) :
      return 4
   if row['Instance_Type'] == 'AWS_A1' and row['vCPU'] > 5 :
      return 7
   if row['Instance_Type'] == 'AWS_A1' :
      return 7
   if row['Instance_Type'] == 'AWS_T4g' and row['vCPU'] == 1 and row['Memory'] <= 2 :
      return 1
   if row['Instance_Type'] == 'AWS_T4g' and (row['vCPU'] <= 2) and row['Memory'] <= 2 :
      return 2
   if row['Instance_Type'] == 'AWS_T4g' and (row['vCPU'] <= 2) and row['Memory'] <= 3 :
      return 5
   if row['Instance_Type'] == 'AWS_T4g' and (row['vCPU'] <= 2) and row['Memory'] <= 4 :
      return 7
   if row['Instance_Type'] == 'AWS_T4g' and (row['vCPU'] <= 2) and row['Memory'] <= 5 :
      return 8
   if row['Instance_Type'] == 'AWS_T4g' and row['vCPU'] <= 3 and row['Memory'] <= 6 :
      return 9
   if row['Instance_Type'] == 'AWS_T4g' and (row['vCPU'] > 3) and row['Memory'] > 6 :
      return 10
   if row['Instance_Type'] == 'AWS_T4g' :
      return 10
   if row['Instance_Type'] == 'AWS_T3' and row['vCPU'] <= 2 and row['Memory'] == 1 :
      return 1
   if row['Instance_Type'] == 'AWS_T3' and row['vCPU'] <= 2 and row['Memory'] <= 2 :
      return 2
   if row['Instance_Type'] == 'AWS_T3' and row['vCPU'] <= 2 and row['Memory'] <= 3 :
      return 5
   if row['Instance_Type'] == 'AWS_T3' and row['vCPU'] <= 2 and row['Memory'] <= 4 :
      return 7
   if row['Instance_Type'] == 'AWS_T3' and row['vCPU'] <= 2 and row['Memory'] <= 5 :
      return 8
   if row['Instance_Type'] == 'AWS_T3' and row['vCPU'] <= 3 and row['Memory'] <= 6 :
      return 9
   if row['Instance_Type'] == 'AWS_T3' and row['vCPU'] > 3 and row['Memory'] > 6 :
      return 10
   if row['Instance_Type'] == 'AWS_T3' :
      return 10
   if row['Instance_Type'] == 'AWS_T3a' and row['vCPU'] <= 2 and row['Memory'] == 1 :
      return 1
   if row['Instance_Type'] == 'AWS_T3a' and row['vCPU'] <= 2 and row['Memory'] <= 2 :
      return 2
   if row['Instance_Type'] == 'AWS_T3a' and row['vCPU'] <= 2 and row['Memory'] <= 3 :
      return 5
   if row['Instance_Type'] == 'AWS_T3a' and row['vCPU'] <= 2 and row['Memory'] <= 4 :
      return 7
   if row['Instance_Type'] == 'AWS_T3a' and row['vCPU'] <= 3 and row['Memory'] <= 5 :
      return 8
   if row['Instance_Type'] == 'AWS_T3a' and row['vCPU'] <= 3 and row['Memory'] <= 6 :
      return 9
   if row['Instance_Type'] == 'AWS_T3a' and row['vCPU'] > 3 and row['Memory'] > 6 :
      return 10
   if row['Instance_Type'] == 'AWS_T3a' :
      return 10
   if row['Instance_Type'] == 'AWS_T2' and row['vCPU'] == 1 and row['Memory'] == 1 :
      return 1
   if row['Instance_Type'] == 'AWS_T2' and row['vCPU'] == 1 and row['Memory'] <= 2 :
      return 2
   if row['Instance_Type'] == 'AWS_T2' and row['vCPU'] == 1 or row['vCPU'] <= 4 :
      return 7
   if row['Instance_Type'] == 'AWS_T2' and row['vCPU'] <= 2 and row['Memory'] <= 5 :
      return 11
   if row['Instance_Type'] == 'AWS_T2' and row['vCPU'] <= 3 and row['Memory'] <= 6 :
      return 4
   if row['Instance_Type'] == 'AWS_T2' and row['vCPU'] > 3 and row['Memory'] > 6 :
      return 10
   if row['Instance_Type'] == 'AWS_T2' :
      return 10
   if row['Instance_Type'] == 'AWS_M6g' and row['vCPU'] <= 2 and row['Memory'] <= 5 :
      return 6
   if row['Instance_Type'] == 'AWS_M6g' and row['vCPU'] <= 5 and row['Memory'] <= 7 :
      return 3
   if row['Instance_Type'] == 'AWS_M6g' and row['vCPU'] <= 7 and row['Memory'] <= 9 :
      return 7
   if row['Instance_Type'] == 'AWS_M6g' and row['vCPU'] <= 9 and row['Memory'] <= 11 :
      return 12
   if row['Instance_Type'] == 'AWS_M6g' and row['vCPU'] > 9 and row['Memory'] > 11:
      return 13
   if row['Instance_Type'] == 'AWS_M6g':
      return 13  
   if row['Instance_Type'] == 'AWS_M5' and row['vCPU'] <= 3 and row['Memory'] <= 6 :
      return 6
   if row['Instance_Type'] == 'AWS_M5' and row['vCPU'] <= 8  and row['Memory'] <= 7 :
      return 5
   if row['Instance_Type'] == 'AWS_M5' and row['vCPU'] <= 9 and row['Memory'] <= 11 :
      return 12
   if row['Instance_Type'] == 'AWS_M5' and row['vCPU'] > 9 and row['Memory'] > 11 :
      return 14
   if row['Instance_Type'] == 'AWS_M5' :
      return 14
   if row['Instance_Type'] == 'AWS_M5a' and row['vCPU'] <= 2 and row['Memory'] <= 5 :
      return 6
   if row['Instance_Type'] == 'AWS_M5a' and row['vCPU'] <= 7 and row['Memory'] <= 7:
      return 5
   if row['Instance_Type'] == 'AWS_M5a' and row['vCPU'] <= 9 and row['Memory'] <= 11 :
      return 12
   if row['Instance_Type'] == 'AWS_M5a' and row['vCPU'] > 9 and row['Memory'] > 11 :
      return 14
   if row['Instance_Type'] == 'AWS_M5a' :
      return 14
   if row['Instance_Type'] == 'AWS_M4' and row['vCPU'] <= 5 and row['Memory'] <= 7 :
      return 6
   if row['Instance_Type'] == 'AWS_M4' and row['vCPU'] <= 7 and row['Memory'] <= 10 :
      return 5
   if row['Instance_Type'] == 'AWS_M4' and row['vCPU'] <= 9 and row['Memory'] <= 11 :
      return 12
   if row['Instance_Type'] == 'AWS_M4' and row['vCPU'] > 9 and row['Memory'] > 11 :
      return 14
   if row['Instance_Type'] == 'AWS_M4' :
      return 14

def groupedInstance_Type (row):
   if (row['Instance_Type'] == 'AWS_A1') :
      return 1
   if (row['Instance_Type'] == 'AWS_T4g') :
      return 2
   if (row['Instance_Type'] == 'AWS_T3') :
      return 3   
   if (row['Instance_Type'] == 'AWS_T3a') :
      return 4
   if (row['Instance_Type'] == 'AWS_T2') :
      return 5
   if (row['Instance_Type'] == 'AWS_M4') :   
      return 6
   if (row['Instance_Type'] == 'AWS_M6g') :
      return 7
   if (row['Instance_Type'] == 'AWS_M5') :
      return 8
   if (row['Instance_Type'] == 'AWS_M5a') :
      return 9  
   else :
      return 10

instant_types = {
                 'AWS_A1':1,
                 'AWS_T4g':2,
                 'AWS_T3':3,
                 'AWS_T3a':4,
                 'AWS_T2':5,
                 'AWS_M4':6,
                 'AWS_M6g':7,
                 'AWS_M5':8,
                 'AWS_M5a':9
                 }      

     
df1['Network_Performance'] = df1.apply (lambda row: fixNetworkPerformanceIssue(row), axis=1) 
df1['Memory'] = df1.apply (lambda row: groupedMemory(row), axis=1)  
df1['vCPU'] = df1.apply (lambda row: groupedvCPU(row), axis=1)  

df1['Memory'] = df1.apply (lambda row: setMemory(row), axis=1)   
df1['Memory'] = df1['Memory'].astype(int)  

df1['Project_Size'] = df1.apply (lambda row: setProject_Size(row), axis=1)   
df1['Project_Type'] = df1.apply (lambda row: setProjectType(row), axis=1)   
df1['Instance_Type'] = df1.Instance_Type.map (instant_types)

df_dummy['Memory'] = df_dummy.apply (lambda row: groupedMemory(row), axis=1)  
df_dummy['vCPU'] = df_dummy.apply (lambda row: groupedvCPU(row), axis=1) 
df_dummy['Project_Size'] = df_dummy.apply (lambda row: setProject_Size(row), axis=1)   
df_dummy['Instance_Type'] = df_dummy.Instance_Type.map (instant_types)

df1.head(5) 

Unnamed: 0,Instance_Type,Project_Type,Project_Size,vCPU,Memory,Network_Performance
0,6,7,2,3,6,6
1,6,7,2,2,5,4
4,6,7,2,3,5,6
5,9,7,2,4,7,5
8,1,7,2,6,8,5


In [38]:
df2 = df1.loc[df1.Instance_Type.isnull()]
df2.head(50000)

df2 = df1.loc[df1['Instance_Type'] == 8].head(30000)
df4 = df1.loc[df1['Instance_Type'] == 6].head(10000)
df5 = df1.loc[df1['Instance_Type'] == 3].head(30000)
df6 = df1.loc[df1['Instance_Type'] == 5].head(10000)
df8 = df1.loc[df1['Instance_Type'] == 7].head(10000)
df9 = df1.loc[df1['Instance_Type'] == 1].head(10000)
df_general = df2.append([df4 , df5 , df6 , df8 , df9, df_dummy])

In [39]:
df_general = shuffle(df_general)
df_general.reset_index(inplace=True, drop=True)

In [41]:
y = np.asarray(df_general['Instance_Type'])
print(y)

X = df_general.iloc[:,1:].values
X

[3 3 6 ... 6 8 6]


array([[1, 1, 2, 1, 3],
       [1, 1, 2, 1, 3],
       [7, 2, 4, 6, 6],
       ...,
       [7, 2, 2, 5, 4],
       [7, 2, 4, 7, 5],
       [7, 2, 4, 6, 6]])

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [43]:
rbf = svm.SVC(kernel='rbf', gamma='auto', C=1, decision_function_shape='ovo').fit(X_train, y_train)
rbf_pred = rbf.predict(X_test)
print(classification_report(y_test, rbf_pred))
print("Accuracy: " , metrics.accuracy_score(y_test, rbf_pred))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00       422
           2       1.00      0.56      0.72       100
           3       0.99      1.00      0.99      3241
           4       1.00      1.00      1.00       102
           5       1.00      1.00      1.00      2030
           6       1.00      1.00      1.00      1929
           7       1.00      1.00      1.00       648
           8       1.00      1.00      1.00      6013
           9       1.00      1.00      1.00        91

    accuracy                           1.00     14576
   macro avg       1.00      0.95      0.97     14576
weighted avg       1.00      1.00      1.00     14576

Accuracy:  0.9969813391877058


In [44]:
poly = svm.SVC(kernel='poly', degree=5, C=1, decision_function_shape='ovo').fit(X_train, y_train)
poly_pred = poly.predict(X_test)
print(classification_report(y_test, poly_pred))
print("Accuracy: " , metrics.accuracy_score(y_test, poly_pred))

              precision    recall  f1-score   support

           1       1.00      1.00      1.00       422
           2       1.00      0.56      0.72       100
           3       0.98      1.00      0.99      3241
           4       1.00      0.86      0.93       102
           5       1.00      1.00      1.00      2030
           6       1.00      1.00      1.00      1929
           7       1.00      1.00      1.00       648
           8       1.00      1.00      1.00      6013
           9       1.00      1.00      1.00        91

    accuracy                           1.00     14576
   macro avg       1.00      0.94      0.96     14576
weighted avg       1.00      1.00      1.00     14576

Accuracy:  0.9960208562019759


In [45]:
linear = svm.SVC(kernel='linear', C=1, decision_function_shape='ovo').fit(X_train, y_train)
linear_pred = linear.predict(X_test)
print(classification_report(y_test, linear_pred))
print("Accuracy: " , metrics.accuracy_score(y_test, linear_pred))

              precision    recall  f1-score   support

           1       0.99      1.00      0.99       422
           2       1.00      0.38      0.55       100
           3       0.98      1.00      0.99      3241
           4       1.00      1.00      1.00       102
           5       1.00      1.00      1.00      2030
           6       0.64      0.80      0.71      1929
           7       0.69      0.52      0.59       648
           8       0.96      0.90      0.93      6013
           9       0.75      0.86      0.80        91

    accuracy                           0.91     14576
   macro avg       0.89      0.83      0.84     14576
weighted avg       0.92      0.91      0.91     14576

Accuracy:  0.9070389681668496


In [46]:
sig = svm.SVC(kernel='sigmoid', C=1, decision_function_shape='ovo').fit(X_train, y_train)
sig_pred = sig.predict(X_test)
print(classification_report(y_test, sig_pred))
print("Accuracy: " , metrics.accuracy_score(y_test, sig_pred))

              precision    recall  f1-score   support

           1       0.54      0.76      0.63       422
           2       0.00      0.00      0.00       100
           3       0.25      0.26      0.26      3241
           4       0.00      0.00      0.00       102
           5       1.00      0.43      0.60      2030
           6       0.55      0.23      0.32      1929
           7       0.00      0.00      0.00       648
           8       0.52      0.58      0.54      6013
           9       0.00      0.00      0.00        91

    accuracy                           0.41     14576
   macro avg       0.32      0.25      0.26     14576
weighted avg       0.50      0.41      0.43     14576

Accuracy:  0.40772502744237105


  _warn_prf(average, modifier, msg_start, len(result))


According to result, we can get best result for RBF kernal of SVM algorithm. 

In [47]:
general_predict_model = rbf

# Compute Optimized Family Type

In [51]:
compute_predict_model = rbf

# Memory Optimized Family Type

In [52]:
memory_predict_model = rbf

# Accelerated Computing Family Type

In [53]:
accelerated_predict_model = rbf

# Storage Optimized Family Type

In [54]:
storage_predict_model = rbf

According to result, we calculate all Instance family prediction models,
Next we calculate what is intance type related result

# Instance Type Selection Model

In [55]:
x2 = [[14, 3, 12, 13,10]]
if(family_type == 1):
  cus_pred2 = compute_predict_model.predict(x2)
if(family_type == 2):
  cus_pred2 = general_predict_model.predict(x2)
if(family_type == 3):
  cus_pred2 = memory_predict_model.predict(x2)
if(family_type == 4):
  cus_pred2 = accelerated_predict_model.predict(x2)
if(family_type == 5):
  cus_pred2 = storage_predict_model.predict(x2)        

instance_Type = cus_pred2[0]
instance_Type

8

Next, we want to find how much price want to spend for relevent Intance type according to user requirements. So, I divid 5 pricing models to predict this for each Instance family.

# General Purpose Pricing Model

In [57]:
df = df_general_table.copy()

NameError: ignored

In [None]:
df["Memory"] = df["Memory"].astype("float64")
df["EffectiveDates"] = df["EffectiveDates"].astype("datetime64")
df["Project_Type"] = df["Project_Type"].astype(str)
df["Processor_Archi"] = df["Storage"].astype(str)
df["Clock_Speed"] = df["Clock_Speed"].astype(str)
df["PricePerUnit"] = df["PricePerUnit"].astype(float)
columns = ['PricePerUnit', 'UpfrontCost', 'Instance_Type', 'Operating_System', 'Location', 'LeaseContractLength', 'PurchaseOption', 'OfferingClass']
df1 = df.reindex(columns=columns)
df1 = df1[columns]
df1.head()

In [None]:
def setOnDemandPrice (row):
    if (row['Instance_Type'] == 'a1.medium') :
      return 0.0255
    if (row['Instance_Type'] == 'a1.large') :
      return 0.051
    if (row['Instance_Type'] == 'a1.xlarge') :
      return 0.102
    if (row['Instance_Type'] == 'a1.2xlarge') :
      return 0.204  
    if (row['Instance_Type'] == 'a1.4xlarge') :
      return 0.408
    if (row['Instance_Type'] == 'a1.metal') :
      return 0.408  
    if (row['Instance_Type'] == 't4g.nano') :
      return 0.0042  
    if (row['Instance_Type'] == 't4g.micro') :
      return 0.0084         
    if (row['Instance_Type'] == 't4g.small') :
      return 0.0168
    if (row['Instance_Type'] == 't4g.medium') :
      return 0.0336        
    if (row['Instance_Type'] == 't4g.large') :
      return 0.0672 
    if (row['Instance_Type'] == 't4g.xlarge') :
      return 0.1344       
    if (row['Instance_Type'] == 't4g.2xlarge') :
      return 0.2688  

    if (row['Instance_Type'] == 't3.nano') :
      return 0.0052  
    if (row['Instance_Type'] == 't3.micro') :
      return 0.0104         
    if (row['Instance_Type'] == 't3.small') :
      return 0.0208
    if (row['Instance_Type'] == 't3.medium') :
      return 0.0416        
    if (row['Instance_Type'] == 't3.large') :
      return 0.0832 
    if (row['Instance_Type'] == 't3.xlarge') :
      return 0.1664       
    if (row['Instance_Type'] == 't3.2xlarge') :
      return 0.3328 

    if (row['Instance_Type'] == 't3a.nano') :
      return 0.0047  
    if (row['Instance_Type'] == 't3a.micro') :
      return 0.0094         
    if (row['Instance_Type'] == 't3a.small') :
      return 0.0188
    if (row['Instance_Type'] == 't3a.medium') :
      return 0.0376        
    if (row['Instance_Type'] == 't3a.large') :
      return 0.0752 
    if (row['Instance_Type'] == 't3a.xlarge') :
      return 0.1504       
    if (row['Instance_Type'] == 't3a.2xlarge') :
      return 0.3008  

    if (row['Instance_Type'] == 'm6g.medium') :
      return 0.0385        
    if (row['Instance_Type'] == 'm6g.large') :
      return 0.077 
    if (row['Instance_Type'] == 'm6g.xlarge') :
      return 0.154       
    if (row['Instance_Type'] == 'm6g.2xlarge') :
      return 0.308
    if (row['Instance_Type'] == 'm6g.4xlarge') :
      return 0.616
    if (row['Instance_Type'] == 'm6g.8xlarge') :
      return 1.232
    if (row['Instance_Type'] == 'm6g.12xlarge') :
      return 1.848        
    if (row['Instance_Type'] == 'm6g.16xlarge') :
      return 2.464    
    if (row['Instance_Type'] == 'm6g.metal') :
      return 2.464    

    if (row['Instance_Type'] == 'm6gd.medium') :
      return 0.0452       
    if (row['Instance_Type'] == 'm6gd.large') :
      return 0.0904
    if (row['Instance_Type'] == 'm6gd.xlarge') :
      return 0.1808     
    if (row['Instance_Type'] == 'm6gd.2xlarge') :
      return 0.3616
    if (row['Instance_Type'] == 'm6gd.4xlarge') :
      return 0.7232
    if (row['Instance_Type'] == 'm6gd.8xlarge') :
      return 1.4464
    if (row['Instance_Type'] == 'm6gd.12xlarge') :
      return 2.1696       
    if (row['Instance_Type'] == 'm6gd.16xlarge') :
      return 2.8928   
    if (row['Instance_Type'] == 'm6gd.metal') :
      return 2.8928  

    if (row['Instance_Type'] == 'm5.large') :
      return 0.0906
    if (row['Instance_Type'] == 'm5.xlarge') :
      return 0.192    
    if (row['Instance_Type'] == 'm5.2xlarge') :
      return 0.384
    if (row['Instance_Type'] == 'm5.4xlarge') :
      return 0.768
    if (row['Instance_Type'] == 'm5.8xlarge') :
      return 1.536
    if (row['Instance_Type'] == 'm5.12xlarge') :
      return 2.304       
    if (row['Instance_Type'] == 'm5.16xlarge') :
      return 3.072  
    if (row['Instance_Type'] == 'm5.24xlarge') :
      return 4.608    
    if (row['Instance_Type'] == 'm5.metal') :
      return 4.608    

    if (row['Instance_Type'] == 'm5d.large') :
      return 0.113
    if (row['Instance_Type'] == 'm5d.xlarge') :
      return 0.226    
    if (row['Instance_Type'] == 'm5d.2xlarge') :
      return 0.452
    if (row['Instance_Type'] == 'm5d.4xlarge') :
      return 0.904
    if (row['Instance_Type'] == 'm5d.8xlarge') :
      return 1.808
    if (row['Instance_Type'] == 'm5d.12xlarge') :
      return 2.712       
    if (row['Instance_Type'] == 'm5d.16xlarge') :
      return 3.616 
    if (row['Instance_Type'] == 'm5d.24xlarge') :
      return 5.424    
    if (row['Instance_Type'] == 'm5d.metal') :
      return 5.424  

    if (row['Instance_Type'] == 'm5a.large') :
      return 0.086
    if (row['Instance_Type'] == 'm5a.xlarge') :
      return 0.172    
    if (row['Instance_Type'] == 'm5a.2xlarge') :
      return 0.688
    if (row['Instance_Type'] == 'm5a.4xlarge') :
      return 0.688
    if (row['Instance_Type'] == 'm5a.8xlarge') :
      return 1.376

    if (row['Instance_Type'] == 'm5ad.large') :
      return 0.103
    if (row['Instance_Type'] == 'm5ad.xlarge') :
      return 0.206    
    if (row['Instance_Type'] == 'm5ad.2xlarge') :
      return 0.412
    if (row['Instance_Type'] == 'm5ad.4xlarge') :
      return 0.824
    if (row['Instance_Type'] == 'm5ad.12xlarge') :
      return 2.472      
    if (row['Instance_Type'] == 'm5ad.24xlarge') :
      return 4.944    

    if (row['Instance_Type'] == 'm5n.large') :
      return 0.119
    if (row['Instance_Type'] == 'm5n.xlarge') :
      return 0.238    
    if (row['Instance_Type'] == 'm5n.2xlarge') :
      return 0.476
    if (row['Instance_Type'] == 'm5n.4xlarge') :
      return 0.952
    if (row['Instance_Type'] == 'm5n.8xlarge') :
      return 1.904
    if (row['Instance_Type'] == 'm5n.12xlarge') :
      return 2.856       
    if (row['Instance_Type'] == 'm5n.16xlarge') :
      return 3.808
    if (row['Instance_Type'] == 'm5n.24xlarge') :
      return 5.712

    if (row['Instance_Type'] == 'm4.large') :
      return 0.1
    if (row['Instance_Type'] == 'm4.xlarge') :
      return 0.2   
    if (row['Instance_Type'] == 'm4.2xlarge') :
      return 0.4
    if (row['Instance_Type'] == 'm4.4xlarge') :
      return 0.8
    if (row['Instance_Type'] == 'm4.10xlarge') :
      return 2   
    if (row['Instance_Type'] == 'm4.16xlarge') :
      return 3.2 
    else :
      return 0.3008    
                          

              

def setInstanceType (row):
    if (row['Instance_Type'] == 'a1.medium') :
        return 10
    if (row['Instance_Type'] == 'a1.large') :
        return 15
    if (row['Instance_Type'] == 'a1.xlarge') :
        return 24
    if (row['Instance_Type'] == 'a1.2xlarge') :
        return 37  
    if (row['Instance_Type'] == 'a1.4xlarge') :
        return 46
    if (row['Instance_Type'] == 'a1.metal') :
        return 47  
    if (row['Instance_Type'] == 't4g.nano') :
        return 1 
    if (row['Instance_Type'] == 't4g.micro') :
        return 4        
    if (row['Instance_Type'] == 't4g.small') :
        return 7
    if (row['Instance_Type'] == 't4g.medium') :
        return 11       
    if (row['Instance_Type'] == 't4g.large') :
        return 16
    if (row['Instance_Type'] == 't4g.xlarge') :
        return 31     
    if (row['Instance_Type'] == 't4g.2xlarge') :
        return 42

    if (row['Instance_Type'] == 't3.nano') :
        return 3 
    if (row['Instance_Type'] == 't3.micro') :
        return 6         
    if (row['Instance_Type'] == 't3.small') :
        return 9
    if (row['Instance_Type'] == 't3.medium') :
        return 13        
    if (row['Instance_Type'] == 't3.large') :
      return 19
    if (row['Instance_Type'] == 't3.xlarge') :
      return 30    
    if (row['Instance_Type'] == 't3.2xlarge') :
      return 44 

    if (row['Instance_Type'] == 't3a.nano') :
      return 36 
    if (row['Instance_Type'] == 't3a.micro') :
      return 5         
    if (row['Instance_Type'] == 't3a.small') :
      return 8
    if (row['Instance_Type'] == 't3a.medium') :
      return 12       
    if (row['Instance_Type'] == 't3a.large') :
      return 17
    if (row['Instance_Type'] == 't3a.xlarge') :
      return 32     
    if (row['Instance_Type'] == 't3a.2xlarge') :
      return 43  

    if (row['Instance_Type'] == 'm6g.medium') :
      return 14        
    if (row['Instance_Type'] == 'm6g.large') :
      return 18
    if (row['Instance_Type'] == 'm6g.xlarge') :
      return 29     
    if (row['Instance_Type'] == 'm6g.2xlarge') :
      return 46
    if (row['Instance_Type'] == 'm6g.4xlarge') :
      return 50
    if (row['Instance_Type'] == 'm6g.8xlarge') :
      return 60
    if (row['Instance_Type'] == 'm6g.12xlarge') :
      return 63       
    if (row['Instance_Type'] == 'm6g.16xlarge') :
      return 69   
    if (row['Instance_Type'] == 'm6g.metal') :
      return 70 

    if (row['Instance_Type'] == 'm6gd.medium') :
      return 15     
    if (row['Instance_Type'] == 'm6gd.large') :
      return 22
    if (row['Instance_Type'] == 'm6gd.xlarge') :
      return 33   
    if (row['Instance_Type'] == 'm6gd.2xlarge') :
      return 49
    if (row['Instance_Type'] == 'm6gd.4xlarge') :
      return 55
    if (row['Instance_Type'] == 'm6gd.8xlarge') :
      return 62
    if (row['Instance_Type'] == 'm6gd.12xlarge') :
      return 67     
    if (row['Instance_Type'] == 'm6gd.16xlarge') :
      return 73   
    if (row['Instance_Type'] == 'm6gd.metal') :
      return 74

    if (row['Instance_Type'] == 'm5.large') :
      return 21
    if (row['Instance_Type'] == 'm5.xlarge') :
      return 28
    if (row['Instance_Type'] == 'm5.2xlarge') :
      return 47
    if (row['Instance_Type'] == 'm5.4xlarge') :
      return 54
    if (row['Instance_Type'] == 'm5.8xlarge') :
      return 64
    if (row['Instance_Type'] == 'm5.12xlarge') :
      return 68      
    if (row['Instance_Type'] == 'm5.16xlarge') :
      return 76  
    if (row['Instance_Type'] == 'm5.24xlarge') :
      return 80   
    if (row['Instance_Type'] == 'm5.metal') :
      return 81   

    if (row['Instance_Type'] == 'm5d.large') :
      return 25
    if (row['Instance_Type'] == 'm5d.xlarge') :
      return 38    
    if (row['Instance_Type'] == 'm5d.2xlarge') :
      return 48
    if (row['Instance_Type'] == 'm5d.4xlarge') :
      return 56
    if (row['Instance_Type'] == 'm5d.8xlarge') :
      return 64
    if (row['Instance_Type'] == 'm5d.12xlarge') :
      return 72      
    if (row['Instance_Type'] == 'm5d.16xlarge') :
      return 78 
    if (row['Instance_Type'] == 'm5d.24xlarge') :
      return 83    
    if (row['Instance_Type'] == 'm5d.metal') :
      return 84 

    if (row['Instance_Type'] == 'm5a.large') :
      return 22
    if (row['Instance_Type'] == 'm5a.xlarge') :
      return 34  
    if (row['Instance_Type'] == 'm5a.2xlarge') :
      return 52
    if (row['Instance_Type'] == 'm5a.4xlarge') :
      return 53
    if (row['Instance_Type'] == 'm5a.8xlarge') :
      return 61

    if (row['Instance_Type'] == 'm5ad.large') :
      return 26
    if (row['Instance_Type'] == 'm5ad.xlarge') :
      return 39    
    if (row['Instance_Type'] == 'm5ad.2xlarge') :
      return 47
    if (row['Instance_Type'] == 'm5ad.4xlarge') :
      return 58
    if (row['Instance_Type'] == 'm5ad.12xlarge') :
      return 71     
    if (row['Instance_Type'] == 'm5ad.24xlarge') :
      return 82   

    if (row['Instance_Type'] == 'm5n.large') :
      return 27
    if (row['Instance_Type'] == 'm5n.xlarge') :
      return 41   
    if (row['Instance_Type'] == 'm5n.2xlarge') :
      return 51
    if (row['Instance_Type'] == 'm5n.4xlarge') :
      return 59
    if (row['Instance_Type'] == 'm5n.8xlarge') :
      return 65
    if (row['Instance_Type'] == 'm5n.12xlarge') :
      return 75      
    if (row['Instance_Type'] == 'm5n.16xlarge') :
      return 79
    if (row['Instance_Type'] == 'm5n.24xlarge') :
      return 85

    if (row['Instance_Type'] == 'm4.large') :
      return 23
    if (row['Instance_Type'] == 'm4.xlarge') :
      return 35  
    if (row['Instance_Type'] == 'm4.2xlarge') :
      return 45
    if (row['Instance_Type'] == 'm4.4xlarge') :
      return 55
    if (row['Instance_Type'] == 'm4.10xlarge') :
      return 66   
    if (row['Instance_Type'] == 'm4.16xlarge') :
      return 77 
    else :
      return 43

os_types = {
                 'Windows':2,
                 'Linux':1,
                 'SUSE':3,
                 'RHEL':4
                 }  
                       

location_type = {
      'Asia Pacific (Mumbai)':1,    
      'US East (Ohio)':2,     
      'US East (N. Virginia)':3, 
      'US West (Oregon)':4,
      'EU (Ireland)':5,
      'EU (Frankfurt)':6,
      'Canada (Central)':7,
      'EU (London)':9,
      'EU (Paris)':8,
      'EU (Stockholm)':10,
      'Asia Pacific (Tokyo)':11,   
      'AWS GovCloud (US-East)':12,
      'AWS GovCloud (US)':13,
      'US West (N. California)':14,
      'Asia Pacific (Seoul)':15,  
      'Asia Pacific (Singapore)':16,
      'Asia Pacific (Sydney)':17,
      'EU (Milan)':18,
      'Middle East (Bahrain)':20,
      'Asia Pacific (Cape Town)':19,
      'Asia Pacific (Osaka-Local)':21,
      'Asia Pacific (Hong Kong)':22,
      'South America (Sao Paulo)':23          
}     
  
purchase_type = {
    'NoUpfront':3,            
    'PartialUpfront':2,        
    'AllUpfront':1            
}

offeringClass = {
    'standard':1,
    'convertible':2
}

leaseContractLength = {
    1:1,
    3:2
}

clockSpeed = {
    'Up to 3.3 GHz':1,
    'Up to 3.0 GHz':2,      
    '2.3 GHz':3,  
    '2.4 GHz': 4,               
    '2.5 GHz':5         
}

year = {
    2016:1,
    2017:2,      
    2018:3,  
    2019:4,               
    2020:5         
}


def setOS (row):
       if (row['Operating_System'] == None) :
          return 'Windows'
       else :
          return row['Operating_System']       

def setLocation (row):
       if (row['Location'] == None) :
          return 'US West (Oregon)'
       else :
          return row['Location']                        


def setPurchaseOption (row):
       if (row['PurchaseOption'] == None) :
          return 'AllUpfront'
       if (row['PurchaseOption'] == 'No Upfront') :
          return 'NoUpfront'
       if (row['PurchaseOption'] == 'Partial Upfront') :
          return 'PartialUpfront'
       if (row['PurchaseOption'] == 'All Upfront') :
          return 'AllUpfront'      
       else :
          return row['PurchaseOption']  

def setOfferingClass (row):
       if (row['OfferingClass'] == None) :
          return 'standard'
       else :
          return row['OfferingClass']       

def setLeaseContractLength (row):
       if (row['LeaseContractLength'] == None) :
          return 3
       else :
          return row['LeaseContractLength']    


df1['Operating_System'] = df1.apply (lambda row: setOS(row), axis=1)
df1['Operating_System'] = df1.Operating_System.map (os_types)

df1['Location'] = df1.apply (lambda row: setLocation(row), axis=1)
df1['Location'] = df1.Location.map (location_type)
df1['Location'] = df1['Location'].fillna(4).astype(np.int64)

df1['Instance_Type'] = df1.apply (lambda row: setInstanceType(row), axis=1)

df1['PurchaseOption'] = df1.apply (lambda row: setPurchaseOption(row), axis=1)
df1['PurchaseOption'] = df1.PurchaseOption.map (purchase_type)

df1['OfferingClass'] = df1.apply (lambda row: setOfferingClass(row), axis=1)
df1['OfferingClass'] = df1.OfferingClass.map (offeringClass)

df1['LeaseContractLength'] = df1.apply (lambda row: setLeaseContractLength(row), axis=1)
df1['LeaseContractLength'] = df1.LeaseContractLength.map (leaseContractLength)

# df1['Year'] = df1.Year.map (year)
df1['Location'] = df1['Location'].fillna(4).astype(np.int64)

df1.head(5) 

In [None]:
def setOSPrice(x):
       if (x == 1) :
          return 100
       if (x == 2) :
          return 114.64 
       if (x == 3) :
          return 193
       if (x == 4) :
          return 166.67        
       else :
          return 100 

def setOfferingClassPrice(OfferingClass, price) :
  if(OfferingClass == 2):
    return price*114.77/100
  else:
    return price

def setStoragePriceWithLocation(x):
  if(x == 1 or x == 15):
    return 114
  if(x == 2):
    return 100
  if(x == 3 or x == 4):
    return 80
  if(x == 5):
    return 88  
  if(x == 6 or x == 17):
    return 119
  if(x == 7):
    return 110
  if(x == 8):
    return 107
  if(x == 9):
    return 92.8
  if(x == 10):
    return 104.5
  if(x == 11 or x == 14 or x == 16):
    return 120 
  if(x == 12 or x == 13 or x == 21):
    return 96
  if(x == 18):
    return 92.4
  if(x == 19):
    return 126.4
  if(x == 20):
    return 130.9
  if(x == 22):
    return 132
  if(x == 23):
    return 152

def setLocationPrice(x):
  if(x == 1):
    return 66.67
  if(x == 2 or x == 3 or x == 4):
    return 100
  if(x == 5):
    return 109.52
  if(x == 6):
    return 114.29  
  if(x == 7 or x == 8):
    return 124.26
  if(x == 9):
    return 126.41
  if(x == 10 or x == 11):
    return 128.57
  if(x == 12 or x == 13):
    return 130.73
  if(x == 14):
    return 132.89
  if(x == 15):
    return 139.29 
  if(x == 16):
    return 140.47
  if(x == 17):
    return 141.44
  if(x == 18 or x == 19):
    return 142.63
  if(x == 20):
    return 149.26
  if(x == 21):
    return 161.90
  if(x == 22):
    return 173.81
  if(x == 23):
    return 179.99
  else:  
    return 100  

def getHourlyCostWithPurchaseType(purchase_type,leaseContractLength, onDemandPrice):
  NumHours = 365*24*leaseContractLength   #365 days * 24 hours * 1 year = 8760.0000 hours
  if(leaseContractLength == 1):
    y = onDemandPrice*62.73/100
  if(leaseContractLength == 2):
    y = onDemandPrice*43.15/100
  TotalCommitment = y * NumHours
  x = 0
  if(purchase_type == 2):
   x = 0.5
  if(purchase_type == 1):
   x = 1 
  UpfrontCost = x*TotalCommitment
  HourlyCost = (TotalCommitment - UpfrontCost)/NumHours
  return HourlyCost

def getUpfrontCostWithPurchaseType(purchase_type,leaseContractLength, onDemandPrice):
  NumHours = 365*24*leaseContractLength  
  if(leaseContractLength == 1):
    y = onDemandPrice*62.73/100
  if(leaseContractLength == 2):
    y = onDemandPrice*43.15/100
  TotalCommitment = y * NumHours
  x = 0
  if(purchase_type == 2):
   x = 0.5
  if(purchase_type == 1):
   x = 1 
  UpfrontCost = x*TotalCommitment
  return UpfrontCost

def setPricingEC2(row):
  # if (row['PricePerUnit'] <= 5 or row['PricePerUnit'] > 38000):
    p = setOnDemandPrice(row)
    p = getHourlyCostWithPurchaseType(row['PurchaseOption'], row['LeaseContractLength'], p)
    p = setOfferingClassPrice(row['OfferingClass'], p)
    p = p*setLocationPrice(row['Location'])/100
    p = p*setOSPrice(row['Operating_System'])/100
    return p*730 + 10*setStoragePriceWithLocation(row['Location'])/100
  # else :
  #   return row['PricePerUnit']

def setPricingUpfrontEC2(row):
    p = setOnDemandPrice(row)
    p = getUpfrontCostWithPurchaseType(row['PurchaseOption'], row['LeaseContractLength'], p)
    p = setOfferingClassPrice(row['OfferingClass'], p)
    p = p*setLocationPrice(row['Location'])/100
    p = p*setOSPrice(row['Operating_System'])/100
    return p

df1['PricePerUnit'] = df1.apply (lambda row: setPricingEC2(row), axis=1) 
df1['UpfrontCost'] = df1.apply (lambda row: setPricingUpfrontEC2(row), axis=1) 

df1['PricePerUnit'] = df1['PricePerUnit'].apply(lambda x: round(x, 2))
df1['UpfrontCost'] = df1['UpfrontCost'].apply(lambda x: round(x, 2))

df1.head(5) 

In [None]:
df2 = df1.copy()
df2['PricePerUnit'] = df2['PricePerUnit']/100
df2['UpfrontCost'] = df2['UpfrontCost']/1000
df2

In [None]:
df3 = df2.loc[df2['Instance_Type'] == 43].head(15000)
df4 = df2.loc[df2['Instance_Type'] != 43].head(200000)
df5 = df3.append([df4])
X = df5.drop(columns=['PricePerUnit', 'UpfrontCost'])
y = df5[['PricePerUnit']]
X

In [None]:
regr = linear_model.LinearRegression()
regr.fit(X, y)

In [None]:
general_purpose_price_model = regr

# Compute Optimized price model

In [None]:
compute_optimized_price_model = regr

# Memory Optimized price model

In [None]:
memory_optimized_price_model = regr

# Accelerated Computing price model

In [None]:
acelerated_computing_price_model = regr

# Storage Optimized price model

In [None]:
storage_optimized_price_model = regr

In [None]:
# prediction with sklearn
New_Instance_Type = instance_Type
New_Operating_System = 1
New_Location = 2
New_OfferingClass = 2
New_LeaseContractLength = 2
New_PurchaseOption = 2

print ('Predicted Stock Index Price: \n', general_purpose_price_model.predict([[New_Instance_Type ,New_Operating_System, New_Location, New_LeaseContractLength, New_PurchaseOption, New_OfferingClass]]))

# with statsmodels
X = sm.add_constant(X) # adding a constant
 
model = sm.OLS(y, X).fit()
predictions = model.predict(X) 
 
print_model = model.summary()
print(print_model)