In [26]:
from __future__ import print_function
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif
import pandas as pd
from pprint import pprint
import operator
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split as split_data
import math
import numpy as np


class FeatureSelection:

    def __init__(self, csv, num_feature_select):

        self.information_gain = {}  # Information gain for all features numbered 0 - (n - 1)
        self.num_feature_select = num_feature_select  # Number of top features to select
        self.top_n_features = []  # Top n features

         
        # reading the data and setting the header
        self.csv_data = pd.read_csv(csv, header=1)

        # separation of input and output - X, Y
        self.X = self.csv_data.iloc[:, :-1]
        self.X = scaler.fit_transform(self.X) #  Standardized using Normal Distribution 

        self.Y = self.csv_data.iloc[:, -1]


    #expected information gain
    def exp_IG(self):
        x = self.X #input
        y = self.Y.values[:] #output

        #calculation of entropy pre-defined formula
        def _entropy(values): #initial input is y which is output
            uniqw, inverse = np.unique(values, return_inverse=True) 
            counts = np.bincount(inverse)
            probs = counts[np.flatnonzero(counts)] / float(len(values))
            # print(1 - probs, probs)
            return np.sum(probs * np.exp(1 - probs))

        # calc updated entropy for given input x, output y
        def ig(feature, y):
            feature_set_indices=[]
            feature_not_set_indices=[]
            for i in range(len(feature)):
                if feature[i] >=0 :
                    feature_set_indices.append(i)
                else :
                    feature_not_set_indices.append(i)
            # print(feature_set_indices)
            # print(feature_not_set_indices)
            # feature_set_indices = np.nonzero(feature) #find index of non zero elements index as tuple with x, y
            # print(feature_set_indices)
            # feature_not_set_indices = [i for i in feature_range if i not in feature_set_indices[0]] # find the zero element x position index
            # print(feature_not_set_indices)
            entropy_x_set = _entropy(y[feature_set_indices]) #calc entropy for non zero elements
            entropy_x_not_set = _entropy(y[feature_not_set_indices]) #calc entropy for zero elements

            #return the updated entropy
            return entropy_before - (((len(feature_set_indices) / float(feature_size)) * entropy_x_set)
                                     + ((len(feature_not_set_indices) / float(feature_size)) * entropy_x_not_set))

        feature_size = x.shape[0] #number of features
        feature_range = range(0, feature_size) #possible number of features
        entropy_before = _entropy(y)
        # print(entropy_before)
        information_gain_scores = [] # assuming that the elements score are in the indexed order of the input features
        # print(x.T.shape)
        for feature in x.T: #for each column in input
            # print(feature)
            information_gain_scores.append(ig(feature, y)) #find the entropy of each feature x , store it
        # print(information_gain_scores)
        
        info_gain = {} 

        for i in range(self.X.shape[1]): #total number of features
            # feature index = its entropy/information-gain-score
            info_gain[str(i)] = information_gain_scores[i]

        #sorted infromation about features
        # print("==============================")
        # print("before sort")
        # print(info_gain)
        # print("==============================")
        info_gain = sorted(info_gain.items(), key=operator.itemgetter(1), reverse=True)
        # print("==============================")
        # print("after sort")
        # print(info_gain)
        # print("==============================")


        #for each feature column
        for i in range(self.X.shape[1]):
            if i < self.num_feature_select: #if i is less than the required number of features that we need
                self.top_n_features.append(int(info_gain[i][0])) # append feature information to list we require about top feautres
            self.information_gain[info_gain[i][0]] = info_gain[i][1]

        # return information_gain_scores, []

    def mutual_info_calculator(self):
        information_gain = []
        information_gain.append(mutual_info_regression(self.X, self.Y, discrete_features=self.discrete_features))

        info_gain = {}

        for i in range(self.X.shape[1]):
            info_gain[str(i)] = information_gain[0][i]

        info_gain = sorted(info_gain.items(), key=operator.itemgetter(1), reverse=True)

        #for each feature column
        for i in range(self.X.shape[1]):
            if i < self.num_feature_select:
                self.top_n_features.append(int(info_gain[i][0]))
            self.information_gain[info_gain[i][0]] = info_gain[i][1]


p = FeatureSelection('cocomo.csv', 10)

# import pandas as pd
# print(dtype(dataset))

p.exp_IG()
print("==============================")
print("==============================")
print("information gain")
print(p.information_gain)
print("==============================")
print("==============================")
print("top 10 features")
print(p.top_n_features)


information gain
{'1': 0.045370512395678464, '5': 0.045182293746151014, '11': 0.04463990360528669, '2': 0.044287041693371165, '15': 0.04410990137510762, '6': 0.043747231474580595, '4': 0.043558161414487984, '12': 0.043360924542535706, '16': 0.042929158026844494, '7': 0.042046334889543235, '10': 0.041642856003507145, '9': 0.041553004391484016, '0': 0.04138243845446432, '3': 0.04138243845446432, '8': 0.04067306818926486, '13': 0.04059059592643699, '14': 0.040416234772919335}
top 10 features
[1, 5, 11, 2, 15, 6, 4, 12, 16, 7]


In [27]:
# Linear Regression 
x_data=p.X[:, p.top_n_features]
y_data=p.Y 
x_train, x_test, y_train, y_test = split_data( x_data, y_data, test_size = 1/4, random_state = 80, shuffle=True) 
regressor = LinearRegression()
regressor = regressor.fit(x_train, y_train)
y_pred = regressor.predict(x_test)
# Root Mean Sqaured Error
#MSE = np.square(np.subtract(y_test,y_pred)).mean() 

MSE = np.square(np.subtract(y_test,y_pred)/np.maximum(np.absolute(y_test),np.absolute(y_pred))).mean() 
# print(MSE)
RMSE = math.sqrt(MSE)
print("Root Mean Squared Error :",RMSE)

Root Mean Squared Error : 0.8787794667110665


In [28]:
# Mean Absolute Error
from sklearn.metrics import mean_absolute_error as MAE
#error = MAE(y_test, y_pred)
error = (np.subtract(y_test,y_pred)/np.maximum(np.absolute(y_test),np.absolute(y_pred))).mean()
print(f'MAE error is {error}')

MAE error is 0.0798405588783106


In [29]:
# SVM
from sklearn.model_selection import train_test_split as split_data
X_train, X_test, Y_train, Y_test = split_data(x_data, y_data , test_size = 0.2 , random_state = 43, shuffle=True)

from sklearn.svm import SVR
from sklearn.metrics import accuracy_score
svr=SVR() 
svr.fit(X_train,Y_train)
Y_pred=svr.predict(X_test)

#Root Mean Squared Error
MSE = np.square(np.subtract(Y_test,Y_pred)/np.maximum(np.absolute(Y_test),np.absolute(Y_pred))).mean() 
RMSE = math.sqrt(MSE)
print("Root Mean Square Error:\n")
print(RMSE)

# Mean Absolute Error
from sklearn.metrics import mean_absolute_error as MAE
error = (np.subtract(Y_test,Y_pred)/np.maximum(np.absolute(Y_test),np.absolute(Y_pred))).mean()
print(f'MAE error is {error}')

Root Mean Square Error:

0.8112363883626443
MAE error is -0.14391242208394053


In [30]:
# Random Forest
from sklearn.model_selection import train_test_split as split_data
x_train, x_test, y_train, y_test = split_data(x_data, y_data, test_size=0.20, shuffle=True)

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
treeModel = DecisionTreeRegressor(max_depth=5, random_state=None)
treeModel.fit(x_train, y_train)
model = RandomForestRegressor(max_depth=5, random_state=None,max_features='auto',max_leaf_nodes=5,n_estimators=50, criterion="absolute_error")
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

# Mean Absolute Error
from sklearn.metrics import mean_absolute_error as MAE
error = (np.subtract(y_test,y_pred)/np.maximum(np.absolute(y_test),np.absolute(y_pred))).mean()
print(f'MAE error is {error}')

# Root Mean Squared Error
MSE = np.square(np.subtract(y_test,y_pred)/np.maximum(np.absolute(y_test),np.absolute(y_pred))).mean() 
RMSE = math.sqrt(MSE)
print("Root Mean Square Error:")
print(RMSE)

MAE error is -0.4672886257788695
Root Mean Square Error:
0.6730665237848918
