# Importing Data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy
from sklearn.ensemble import RandomForestClassifier

In [None]:
df = pd.read_excel(r'/content/drive/MyDrive/prads_assignment/data_train.xlsx')

In [None]:
df_test = pd.read_excel(r'/content/drive/MyDrive/prads_assignment/data_testUNLABELED.xlsx')

In [None]:
df = df.dropna()

# Defining Error Rate Function

In [None]:
def balanced_error_rate(y_pred, y_actual):
  unique, counts = numpy.unique(y_sample_test, return_counts=True)
  totalActualValues = dict(zip(unique, counts))
  predictedValues = {
      '0':{'0':0, '1': 0, '2':0},
      '1':{'0':0, '1': 0, '2':0},
      '2':{'0':0, '1': 0, '2':0}
  }
  for i in range(len(y_pred)):
    predictedValues[str(y_actual[i])][str(y_pred[i])]+=1
  errorSumForRates = {
      '0':0,
      '1':0,
      '2':0
  }
  for key in predictedValues.keys():
    for keyLevel2 in predictedValues[key].keys():
      if (keyLevel2!=key):
        errorSumForRates[str(key)]+=predictedValues[key][keyLevel2]
  sumBalancedError = 0
  errorRateOnEachClass = {
      '0':0,
      '1':0,
      '2':0
  }
  for key in errorSumForRates.keys():
    sumBalancedError += (errorSumForRates[str(key)]/totalActualValues[int(key)])
    errorRateOnEachClass[str(key)] = (errorSumForRates[str(key)]/totalActualValues[int(key)])
  return sumBalancedError/len(errorSumForRates.keys()), errorRateOnEachClass, predictedValues

# Baseline Method
## Predicting values for the non-preprocessed data

### Splitting the data

In [None]:
X = df.drop('y', axis = 1)
y = df['y']

X_train, X_sample_test, y_train, y_sample_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Fitting training data to a basic Random Forest model

In [None]:
model = RandomForestClassifier(n_estimators= 10)
model.fit(X_train, y_train)

RandomForestClassifier(n_estimators=10)

### Predicting the result for the sample test data

In [None]:
y_pred_sample_test = model.predict(X_sample_test)

### Finding the balanced error rate

In [None]:
total_balanced_error_rate, errorRateOnEachClass, predictedValues = balanced_error_rate(y_pred_sample_test.tolist(),y_sample_test.values.tolist()) 

In [None]:
print("\n{:<15} {:<15} {:<15} {:<15} {:<15}\n".format('Actual', 'Predicted 0', 'Predicted 1', 'Predicted 2', 'Error Rate'))
 
for keyUp, valueUP in predictedValues.items():
  listOutput = []
  for key, value in predictedValues[keyUp].items():
    listOutput.append(value)
  pred0, pred1, pred2 = listOutput[0], listOutput[1],listOutput[2]
  print("{:<15} {:<15} {:<15} {:<15} {:<15}".format('Class '+str(keyUp), pred0, pred1, pred2,errorRateOnEachClass[str(keyUp)]))

print("\n\nBalanced Error Rate : " + str(total_balanced_error_rate))


Actual          Predicted 0     Predicted 1     Predicted 2     Error Rate     

Class 0         122             6               99              0.46255506607929514
Class 1         23              57              37              0.5128205128205128
Class 2         60              13              209             0.25886524822695034


Overall Error Rate : 0.4114136090422528


### Classification report for the rest of the metrics

In [None]:
from sklearn.metrics import classification_report
target_names = ['class 0', 'class 1', 'class 2']
print(classification_report(y_pred_sample_test.tolist(),y_sample_test.values.tolist(), target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.54      0.60      0.56       205
     class 1       0.49      0.75      0.59        76
     class 2       0.74      0.61      0.67       345

    accuracy                           0.62       626
   macro avg       0.59      0.65      0.61       626
weighted avg       0.64      0.62      0.62       626



# Random Forest Method
## Predicting values for the preprocessed data

###Normalizing columns to same range



In [None]:
df_min_max_scaled = df.copy()
for key in df.keys():
  if key!= 'y':
    column = key
    df_min_max_scaled[column] = (df_min_max_scaled[column] - df_min_max_scaled[column].min()) / (df_min_max_scaled[column].max() - df_min_max_scaled[column].min())    

df_min_max_scaled = df_min_max_scaled.fillna(0)

### Splitting the data

In [None]:
X = df_min_max_scaled.drop('y', axis = 1)
y = df_min_max_scaled['y']

In [None]:
X_train, X_sample_test, y_train, y_sample_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Comparing the ratios of labels in the split dataset

In [None]:
unique, counts = numpy.unique(y_train, return_counts=True)
totalActualValues = dict(zip(unique, counts))
countTrain = {
      '0':0,
      '1':0,
      '2':0
  }
for i in range(len(y_train.tolist())):
  countTrain[str(y_train.tolist()[i])]+=1

print(countTrain)

for key in countTrain.keys():
  print(key,"\t",countTrain[key]/len(y_train))

unique, counts = numpy.unique(y_sample_test, return_counts=True)
totalActualValues = dict(zip(unique, counts))

countTest = {
      '0':0,
      '1':0,
      '2':0
  }
for i in range(len(y_sample_test.tolist())):
  countTest[str(y_sample_test.tolist()[i])]+=1

print(countTest)


for key in countTest.keys():
  print(key,"\t",countTest[key]/len(y_sample_test))

{'0': 829, '1': 402, '2': 1272}
0 	 0.331202556931682
1 	 0.16060727127447064
2 	 0.5081901717938474
{'0': 227, '1': 117, '2': 282}
0 	 0.36261980830670926
1 	 0.1869009584664537
2 	 0.4504792332268371


### Fine-tuning the Random Forest algorithm with the help of Random Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

n_estimators = [int(x) for x in np.linspace(start = 50, stop = 2000, num = 10)]

max_features = ['auto', 'sqrt']

max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

min_samples_split = [2, 5, 10]

min_samples_leaf = [1, 2, 4]

bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [50, 266, 483, 700, 916, 1133, 1350, 1566, 1783, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state = 42)
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                              n_iter = 100, scoring='neg_mean_absolute_error', 
                              cv = 3, verbose=2, random_state=42, n_jobs=-1,
                              return_train_score=True)
rf_random.fit(X_train, y_train)

"""
Fitting 3 folds for each of 100 candidates, totalling 300 fits
RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(random_state=42),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [50, 266, 483, 700, 916,
                                                         1133, 1350, 1566, 1783,
                                                         2000]},
                   random_state=42, return_train_score=True,
                   scoring='neg_mean_absolute_error', verbose=2)
"""

In [None]:
# rf_random.best_params_
"""
{'n_estimators': 1783,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 30,
 'bootstrap': False}
"""

### Fitting training data to the fine-tuned Random Forest model

In [None]:
model = RandomForestClassifier(n_estimators= 1783,
 min_samples_split= 10,
 min_samples_leaf= 1,
 max_features= 'sqrt',
 max_depth= 30,
 bootstrap= False)


model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=False, max_depth=30, max_features='sqrt',
                       min_samples_split=10, n_estimators=1783)

### Predicting the result for the sample test data

In [None]:
y_pred_sample_test = model.predict(X_sample_test)

### Finding the balanced error rate

In [None]:
total_balanced_error_rate, errorRateOnEachClass, predictedValues = balanced_error_rate(y_pred_sample_test.tolist(),y_sample_test.values.tolist()) 

In [None]:
print("\n{:<15} {:<15} {:<15} {:<15} {:<15}\n".format('Actual', 'Predicted 0', 'Predicted 1', 'Predicted 2', 'Error Rate'))
 
for keyUp, valueUP in predictedValues.items():
  listOutput = []
  for key, value in predictedValues[keyUp].items():
    listOutput.append(value)
  pred0, pred1, pred2 = listOutput[0], listOutput[1],listOutput[2]
  print("{:<15} {:<15} {:<15} {:<15} {:<15}".format('Class '+str(keyUp), pred0, pred1, pred2,errorRateOnEachClass[str(keyUp)]))

print("\n\nOverall Error Rate : " + str(total_balanced_error_rate))


Actual          Predicted 0     Predicted 1     Predicted 2     Error Rate     

Class 0         126             6               95              0.44493392070484583
Class 1         15              60              42              0.48717948717948717
Class 2         58              8               216             0.23404255319148937


Overall Error Rate : 0.38871865369194075


### Classification report for the rest of the metrics

In [None]:
from sklearn.metrics import classification_report
target_names = ['class 0', 'class 1', 'class 2']
print(classification_report(y_pred_sample_test.tolist(),y_sample_test.values.tolist(), target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.56      0.63      0.59       199
     class 1       0.51      0.81      0.63        74
     class 2       0.77      0.61      0.68       353

    accuracy                           0.64       626
   macro avg       0.61      0.69      0.63       626
weighted avg       0.67      0.64      0.65       626



## Predicting labels for the actual test data

In [None]:
model = RandomForestClassifier(n_estimators= 1783,
 min_samples_split= 10,
 min_samples_leaf= 1,
 max_features= 'sqrt',
 max_depth= 30,
 bootstrap= False)


model.fit(X, y)

RandomForestClassifier(bootstrap=False, max_depth=30, max_features='sqrt',
                       min_samples_split=10, n_estimators=1783)

In [None]:

df_test2 = df_test.drop(['index','y'], axis = 1)



df_min_max_scaled_test = df_test2.copy()
for key in df_test2.keys():
  if key!= 'y':
    column = key
    df_min_max_scaled_test[column] = (df_min_max_scaled_test[column] - df_min_max_scaled_test[column].min()) / (df_min_max_scaled_test[column].max() - df_min_max_scaled_test[column].min())    


df_min_max_scaled_test = df_min_max_scaled_test.fillna(0)



X_test = df_min_max_scaled_test

In [None]:
y_pred = model.predict(X_test)

In [None]:
df_min_max_scaled_test['y'] = y_pred

df_min_max_scaled_test.to_csv("output_random_Forest.csv", columns = ['y'])

# Neural Network

### Splitting the data

In [None]:
X_train2, X_val, y_train2, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping

### Preprocessing the input to be given to the neural network model

In [None]:

X_train = np.array(X_train)
encoder = LabelEncoder()
encoder.fit(y_train2)
encoded_Y = encoder.transform(y_train2)
dummy_y = np_utils.to_categorical(encoded_Y)



X_val = np.array(X_val)
encoder_val= LabelEncoder()
encoder_val.fit(y_val)
encoded_Y_val = encoder_val.transform(y_val)
dummy_y_val = np_utils.to_categorical(encoded_Y_val)




X_sample_test = np.array(X_sample_test)
encoder_sample_test = LabelEncoder()
encoder_sample_test.fit(y_sample_test)
encoded_Y_sample_test = encoder_sample_test.transform(y_sample_test)
dummy_y_sample_test = np_utils.to_categorical(encoded_Y_sample_test)

### A simple neural network module

In [None]:
import tensorflow
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization


model = Sequential()
model.add(Dense(100, input_shape=(X_train.shape[1],), activation='relu')) 
model.add(Dense(50, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train2, dummy_y, epochs=15, batch_size=10, validation_data=(X_val, dummy_y_val))


prediction = model.predict(X_sample_test)

from sklearn.metrics import classification_report
target_names = ['class 0', 'class 1', 'class 2']
print(classification_report(dummy_y_sample_test.argmax(axis=1), prediction.argmax(axis=1)))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
              precision    recall  f1-score   support

           0       0.59      0.44      0.50       227
           1       0.66      0.49      0.56       117
           2       0.56      0.73      0.63       282

    accuracy                           0.58       626
   macro avg       0.60      0.55      0.56       626
weighted avg       0.59      0.58      0.57       626



### Predicting the result for the sample test data

In [None]:
total_balanced_error_rate, errorRateOnEachClass, predictedValues = balanced_error_rate(prediction.argmax(axis=1).tolist(),dummy_y_sample_test.argmax(axis=1).tolist()) 

### Finding the balanced error rate

In [None]:
print("\n{:<15} {:<15} {:<15} {:<15} {:<15}\n".format('Actual', 'Predicted 0', 'Predicted 1', 'Predicted 2', 'Error Rate'))
 
# print each data item.
for keyUp, valueUP in predictedValues.items():
  # print(predictedValues.items())
  listOutput = []
  for key, value in predictedValues[keyUp].items():
    listOutput.append(value)
  pred0, pred1, pred2 = listOutput[0], listOutput[1],listOutput[2]
  print("{:<15} {:<15} {:<15} {:<15} {:<15}".format('Class '+str(keyUp), pred0, pred1, pred2,errorRateOnEachClass[str(keyUp)]))

print("\n\nBalanced Error Rate : " + str(total_balanced_error_rate))


Actual          Predicted 0     Predicted 1     Predicted 2     Error Rate     

Class 0         99              10              118             0.5638766519823789
Class 1         14              57              46              0.5128205128205128
Class 2         56              20              206             0.2695035460992908


Overall Error Rate : 0.4487335703007274


### Classification report for the rest of the metrics

In [None]:
from sklearn.metrics import classification_report
target_names = ['class 0', 'class 1', 'class 2']
print(classification_report(prediction.argmax(axis=1).tolist(),dummy_y_sample_test.argmax(axis=1).tolist(), target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.44      0.59      0.50       169
     class 1       0.49      0.66      0.56        87
     class 2       0.73      0.56      0.63       370

    accuracy                           0.58       626
   macro avg       0.55      0.60      0.56       626
weighted avg       0.62      0.58      0.59       626



# SVM

In [None]:
y=y.to_numpy()

In [None]:

import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Binarizer

import matplotlib.pyplot as plt
class DummyScaler:
    def fit(self, data):
        pass
    def transform(self, data):
        return data
def create_scaler_dummy():
    return DummyScaler()
    
def create_scaler_standard():
    return StandardScaler()
def create_scaler_minmax():
    return MinMaxScaler()
def crete_scaler_binarizer():
    return Binarizer()
    
create_scaler = create_scaler_minmax

def create_model_naive_bayes():
    model = GaussianNB()
    return model
def create_model_mlpclassifier():
    model = MLPClassifier(hidden_layer_sizes=(10,), random_state=seed)
    return model
def create_model_svc():
    model = SVC(random_state=seed, probability=True)
    return model
    
create_model = create_model_svc
seed = 520
np.set_printoptions(precision=3)


In [None]:

print('Train the model and predict')
scaler = create_scaler()
scaler.fit(X)
X = scaler.transform(X)
model = create_model()
model.fit(X, y)
y_hat = model.predict(X)
print('Model evaluation (train)')
print('Accuracy:')
print(metrics.accuracy_score(y, y_hat))
print('Classification report:')
print(metrics.classification_report(y, y_hat))
print('Confusion matrix (train)')
print (metrics.confusion_matrix(y, y_hat))
print('Cross-validation')
np.random.seed(seed)
y_prob = np.zeros(y.shape)
y_hat = np.zeros(y.shape)
kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=seed)

for train, test in kfold.split(X, y):
    scaler.fit(X[train])
    X_train = scaler.transform(X[train])
    X_test = scaler.transform(X[test])
    
    model = create_model()
    model.fit(X_train, y[train])
    y_prob[test] = model.predict_proba(X_test)[:, 1]
    y_hat[test] = model.predict(X_test)


Train the model and predict
Model evaluation (train)
Accuracy:
0.6554809843400448
Classification report:
              precision    recall  f1-score   support

           0       0.57      0.60      0.59      1056
           1       0.92      0.42      0.57       519
           2       0.67      0.77      0.72      1554

    accuracy                           0.66      3129
   macro avg       0.72      0.60      0.63      3129
weighted avg       0.68      0.66      0.65      3129

Confusion matrix (train)
[[ 634    5  417]
 [ 132  216  171]
 [ 338   15 1201]]
Cross-validation


In [None]:
print('Model evaluation (CV)')
print('Accuracy:')
print(metrics.accuracy_score(y, y_hat))
print('Classification report:')
print(metrics.classification_report(y, y_hat))
print('Confusion Matrix (CV)')
print(metrics.confusion_matrix(y, y_hat))

print('Grid Search for Hyperparameters')
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, 
test_size=0.2, random_state=520)
scaler = create_scaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Here we should use specific classifier, because of the parameters
model = model_selection.GridSearchCV(SVC(kernel='rbf', random_state=520, 
probability=True),
                         cv=5,
                         n_jobs=-1,
                         param_grid={
                             'C': [10**x for x in range(-3, 4)], 
                             'gamma': [10**x for x in range(-3, 4)]
                         })
model.fit(X_train, y_train)
print('Optimal parameters:', model.best_params_)
y_test_hat = model.predict(X_test)
y_test_prob = model.predict_proba(X_test)[:, 1]
print('Model evaluation (Optimal Hyperparameters)')
print('Accuracy:')
print(metrics.accuracy_score(y_test, y_test_hat))
print('Classification report:')
print(metrics.classification_report(y_test, y_test_hat))
print('Confusion matrix (Optimal Hyperparameters)')
print(metrics.confusion_matrix(y, y_hat))

Model evaluation (CV)
Accuracy:
0.62320230105465
Classification report:
              precision    recall  f1-score   support

           0       0.53      0.57      0.55      1056
           1       0.89      0.40      0.55       519
           2       0.65      0.73      0.69      1554

    accuracy                           0.62      3129
   macro avg       0.69      0.57      0.60      3129
weighted avg       0.65      0.62      0.62      3129

Confusion Matrix (CV)
[[ 607    5  444]
 [ 132  209  178]
 [ 399   21 1134]]
Grid Search for Hyperparameters
Optimal parameters: {'C': 1000, 'gamma': 0.01}
Model evaluation (Optimal Hyperparameters)
Accuracy:
0.646964856230032
Classification report:
              precision    recall  f1-score   support

           0       0.59      0.63      0.61       213
           1       0.91      0.39      0.54       103
           2       0.65      0.75      0.70       310

    accuracy                           0.65       626
   macro avg       0.72  

### Predicting the result for the sample test data

In [None]:
total_balanced_error_rate, errorRateOnEachClass, predictedValues = balanced_error_rate(list(y_test_hat),list(y_test)) 

### Finding the balanced error rate

In [None]:
print("\n{:<15} {:<15} {:<15} {:<15} {:<15}\n".format('Actual', 'Predicted 0', 'Predicted 1', 'Predicted 2', 'Error Rate'))
 
# print each data item.
for keyUp, valueUP in predictedValues.items():
  # print(predictedValues.items())
  listOutput = []
  for key, value in predictedValues[keyUp].items():
    listOutput.append(value)
  pred0, pred1, pred2 = listOutput[0], listOutput[1],listOutput[2]
  print("{:<15} {:<15} {:<15} {:<15} {:<15}".format('Class '+str(keyUp), pred0, pred1, pred2,errorRateOnEachClass[str(keyUp)]))

print("\n\nBalanced Error Rate : " + str(total_balanced_error_rate))


Actual          Predicted 0     Predicted 1     Predicted 2     Error Rate     

Class 0         134             1               78              0.34801762114537443
Class 1         18              40              45              0.5384615384615384
Class 2         76              3               231             0.2801418439716312


Overall Error Rate : 0.38887366785951466


### Classification report for the rest of the metrics

In [None]:
from sklearn.metrics import classification_report
target_names = ['class 0', 'class 1', 'class 2']
print(classification_report(list(y_test_hat),list(y_test), target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.63      0.59      0.61       228
     class 1       0.39      0.91      0.54        44
     class 2       0.75      0.65      0.70       354

    accuracy                           0.65       626
   macro avg       0.59      0.72      0.62       626
weighted avg       0.68      0.65      0.65       626

