In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from math import sqrt

# Import necessary modules
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.naive_bayes import GaussianNB

from keras.models import Sequential
from keras.layers import Dense

from IPython.display import display

In [None]:
class Preprocess:
    """Load a CSV file and turn it into scaled train/validation arrays.

    The pipeline is: read CSV -> drop empty rows -> drop mostly-NaN columns ->
    drop weakly correlated columns -> fill gaps -> scale -> split.
    """

    # Most recently loaded (and cleaned) DataFrame; populated by preprocess().
    data = None

    def __init__(self,data_dir):
        """Store the CSV path; nothing is read until preprocess() is called."""
        self.data_dir = data_dir

    @staticmethod
    def _weakly_correlated_columns(data, threshold=0.1):
        """Return the columns that are weakly correlated with most other columns.

        A column is reported when its absolute Pearson correlation is below
        ``threshold`` with more than 50% of the columns of the correlation
        matrix. Non-numeric columns are ignored (they never appear in the
        correlation matrix), and NaN correlations do not count as "weak".
        """
        # Restrict to numeric/bool data explicitly: recent pandas versions
        # raise in DataFrame.corr() when a non-numeric column is present.
        corr = data.select_dtypes(include=['number', 'bool']).corr()
        # |r| < threshold is the same test as -threshold < r < threshold;
        # comparisons against NaN are False, so NaN entries are not counted.
        weak_counts = (corr.abs() < threshold).sum(axis=0)
        return list(weak_counts[weak_counts > 0.5 * len(corr.columns)].index)

    def preprocess(self,test_size):
        """Clean the CSV at ``self.data_dir`` and return scaled splits.

        Parameters
        ----------
        test_size : float or int
            Forwarded to ``train_test_split`` as the validation share.

        Returns
        -------
        list
            ``[X_train, y_train, X_val, y_val]`` as numpy arrays.
        """
        # Loading dataset
        self.data = pd.read_csv(self.data_dir)

        # Removing records in which every field is empty
        self.data = self.data.dropna(axis=0, how='all')

        # Removing columns in which more than half of the values are NaN
        nan_columns = [
            col for col in self.data.columns
            if self.data[col].isna().sum() > 0.5 * len(self.data)
        ]
        self.data = self.data.drop(nan_columns, axis=1)

        # Correlation (feature selection): |r| < 0.1 counts as "not correlated".
        # Every column is evaluated (previously only the last column was,
        # because the check was placed after the loop instead of inside it).
        less_important_columns = self._weakly_correlated_columns(self.data, threshold=0.1)
        self.data = self.data.drop(less_important_columns, axis=1)

        # Columns with very few distinct values relative to their size are
        # treated as categorical, everything else as continuous.
        categorical_columns = [
            col for col in self.data.columns
            if 1. * self.data[col].nunique() / self.data[col].count() < 0.05
        ]
        continuous_columns = [
            col for col in self.data.columns if col not in categorical_columns
        ]

        # Filling values: forward-fill categories, interpolate continuous data
        for col in categorical_columns:
            self.data[col] = self.data[col].ffill()
        for col in continuous_columns:
            self.data[col] = self.data[col].interpolate()

        # Drop the rows labelled 0 and 1. Neither ffill nor interpolate can
        # fill gaps that precede the first valid value, which is presumably
        # why the leading rows are discarded -- TODO confirm.
        # errors='ignore' keeps this from failing when dropna() already
        # removed one of those rows.
        self.data = self.data.drop(index=[0, 1], errors='ignore')

        # Splitting x and y
        # NOTE(review): assumes the first 11 remaining columns are features and
        # the 12th is the target; the column drops above can shift positions.
        x = np.array(self.data.iloc[:, 0:11])
        y = np.array(self.data.iloc[:, 11])

        # Feature scaling
        # NOTE(review): the scaler is fitted on all rows before the split, so
        # validation rows influence the scaling statistics.
        x = StandardScaler().fit(x).transform(x)

        # Train/validation split
        X_train, X_val, y_train, y_val = train_test_split(
            x, y, test_size=test_size, random_state=42
        )

        return [X_train,y_train,X_val,y_val]

In [None]:
class Regression:
    """Train several regressors on one training set and score them on two test sets.

    Each model method fits on ``X_train``/``y_train``, predicts ``X_test1`` and
    ``X_test2`` and stores the RMSE and MSE in the ``RMSE_TEST*``/``MSE_TEST*``
    dictionaries under the model's display name.
    """

    # Data splits; set per instance in __init__.
    X_train,X_val,X_test1,X_test2 = None,None,None,None
    y_train,y_val,y_test1,y_test2 = None,None,None,None

    # Display names used as keys of every metric dictionary.
    MODEL_NAMES = ('LINEAR REGRESSION','LINEAR REGRESSION SKLEARN','DECISION TREE','RANDOM FOREST','NEURAL NETWORK')

    # Class-level defaults (0 means "model not run yet"). __init__ gives every
    # instance its own dictionaries so results never leak between instances.
    RMSE_TEST1 = dict.fromkeys(MODEL_NAMES, 0)
    RMSE_TEST2 = dict.fromkeys(MODEL_NAMES, 0)

    MSE_TEST1 = dict.fromkeys(MODEL_NAMES, 0)
    MSE_TEST2 = dict.fromkeys(MODEL_NAMES, 0)

    def __init__(self,X_train,y_train,X_val,y_val,X_test1,y_test1,X_test2,y_test2):
        """Store the data splits and create fresh, per-instance metric tables."""
        self.X_train = X_train
        self.X_val = X_val
        self.X_test1 = X_test1
        self.X_test2 = X_test2
        self.y_train = y_train
        self.y_val = y_val
        self.y_test1 = y_test1
        self.y_test2 = y_test2

        self.RMSE_TEST1 = dict.fromkeys(self.MODEL_NAMES, 0)
        self.RMSE_TEST2 = dict.fromkeys(self.MODEL_NAMES, 0)
        self.MSE_TEST1 = dict.fromkeys(self.MODEL_NAMES, 0)
        self.MSE_TEST2 = dict.fromkeys(self.MODEL_NAMES, 0)

    def mse(self,predictions,target):
        """Mean squared error between ``predictions`` and ``target``.

        Both inputs are flattened first: a column vector of shape (n, 1), as
        returned by Keras, would otherwise broadcast against an (n,) target
        into an (n, n) matrix and give a wrong error.
        """
        predictions = np.asarray(predictions, dtype=float).ravel()
        target = np.asarray(target, dtype=float).ravel()
        return float(np.mean((predictions - target) ** 2))

    def rmse(self,predictions,target):
        """Root mean squared error between ``predictions`` and ``target``."""
        return sqrt(self.mse(predictions,target))

    def _record_scores(self, name, pred_test1, pred_test2):
        """Store RMSE/MSE of both test-set predictions under display name ``name``."""
        self.RMSE_TEST1[name] = self.rmse(pred_test1,self.y_test1)
        self.RMSE_TEST2[name] = self.rmse(pred_test2,self.y_test2)

        self.MSE_TEST1[name] = self.mse(pred_test1,self.y_test1)
        self.MSE_TEST2[name] = self.mse(pred_test2,self.y_test2)

    def _fit_and_score(self, name, estimator):
        """Fit an sklearn-style ``estimator`` on the training split and record its scores."""
        fitted = estimator.fit(self.X_train,self.y_train)
        self._record_scores(name, fitted.predict(self.X_test1), fitted.predict(self.X_test2))

    def _results_frame(self, options=('all',)):
        """Build the metrics table for ``options``.

        Returns a DataFrame with one row per metric/test-set combination and
        one column per selected model, or ``None`` (after printing a message)
        when ``options`` contains an unknown model name.
        """
        tables = {
            'RMSE TEST1': self.RMSE_TEST1,
            'RMSE TEST2': self.RMSE_TEST2,
            'MSE TEST1': self.MSE_TEST1,
            'MSE TEST2': self.MSE_TEST2,
        }
        options = list(options)
        if options and options[0] == 'all':
            selected = list(self.RMSE_TEST1)
        else:
            if any(option not in self.RMSE_TEST1 for option in options):
                print("Invalid Options Present")
                return None
            selected = options

        rows = [{name: table[name] for name in selected} for table in tables.values()]
        return pd.DataFrame(rows, index=list(tables))

    def tabulate(self,options=('all',)):
        """Display the RMSE/MSE table.

        ``options`` is either ``['all']`` or a list of display names taken from
        ``MODEL_NAMES`` (e.g. ``['LINEAR REGRESSION', 'DECISION TREE']``).
        """
        E_TEST_df = self._results_frame(options)
        if E_TEST_df is None:
            return
        display(E_TEST_df)

    def model(self,model_type = 'decision_tree_sklearn'):
        """Run the requested model(s) and display their scores.

        ``model_type`` is one of ``'all'``, ``'linear_regression'``,
        ``'linear_regression_sklearn'``, ``'decision_tree_sklearn'``,
        ``'random_forest_sklearn'`` or ``'neural_network'``. ``'all'`` runs
        every model except the neural network, which therefore keeps its
        initial value in the table.
        """
        runners = {
            'linear_regression': (self.linear_regression, 'LINEAR REGRESSION'),
            'linear_regression_sklearn': (self.linear_regression_sklearn, 'LINEAR REGRESSION SKLEARN'),
            'decision_tree_sklearn': (self.decision_tree_sklearn, 'DECISION TREE'),
            'random_forest_sklearn': (self.random_forest_sklearn, 'RANDOM FOREST'),
            'neural_network': (self.neural_network, 'NEURAL NETWORK'),
        }

        if model_type == 'all':
            self.linear_regression()
            self.linear_regression_sklearn()
            self.decision_tree_sklearn()
            self.random_forest_sklearn()
            self.tabulate(['all'])
        elif model_type in runners:
            run, display_name = runners[model_type]
            run()
            self.tabulate([display_name])
        else:
            print("Unknown model_type: %r" % (model_type,))

    @staticmethod
    def _with_bias(X):
        """Return a copy of ``X`` with a leading column of ones (intercept term)."""
        X = np.asarray(X, dtype=float)
        return np.concatenate((np.ones((X.shape[0],1)),X),axis=1)

    def linear_regression(self, alpha=0.1, no_of_iterations=100):
        """Linear regression fitted with batch gradient descent.

        Parameters
        ----------
        alpha : float
            Learning rate.
        no_of_iterations : int
            Number of full-batch gradient steps.

        The bias column is added to local copies only, so the stored
        ``X_train``/``X_test*`` arrays stay unchanged for the other models and
        for repeated calls. The model is fitted on the complete training set.
        """
        X = self._with_bias(self.X_train)
        y = np.asarray(self.y_train, dtype=float).ravel()
        m = X.shape[0]
        if m == 0:
            raise ValueError("linear_regression needs at least one training row")

        # Gradient of the cost (1/(2m)) * sum((X.params - y)^2) is
        # X^T (X.params - y) / m; starting point is the zero vector.
        params = np.zeros(X.shape[1])
        for _ in range(no_of_iterations):
            residuals = X @ params - y
            params = params - alpha * (X.T @ residuals) / m

        y_predtest1 = self._with_bias(self.X_test1) @ params
        y_predtest2 = self._with_bias(self.X_test2) @ params
        self._record_scores('LINEAR REGRESSION', y_predtest1, y_predtest2)

    def linear_regression_sklearn(self):
        """Ordinary least squares via scikit-learn's LinearRegression."""
        self._fit_and_score('LINEAR REGRESSION SKLEARN', LinearRegression())

    def decision_tree_sklearn(self):
        """Decision-tree regressor (min_samples_leaf=5, fixed seed)."""
        self._fit_and_score('DECISION TREE', DecisionTreeRegressor(min_samples_leaf=5,random_state = 0))

    def random_forest_sklearn(self):
        """Random-forest regressor with 1000 trees and a fixed seed."""
        self._fit_and_score('RANDOM FOREST', RandomForestRegressor(n_estimators =1000,random_state = 42))

    def neural_network(self):
        """Small dense network (64-32-1) trained with MSE loss and Adam."""
        # Model
        model = Sequential()
        model.add(Dense(64, activation='relu'))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(1))
        model.compile(loss='mse', optimizer='adam')

        model.fit(self.X_train, self.y_train,validation_data=(self.X_val,self.y_val), epochs=100, batch_size=12)
        # predict() returns (n, 1) arrays; rmse()/mse() flatten them.
        y_predtest1=model.predict(self.X_test1)
        y_predtest2=model.predict(self.X_test2)

        self._record_scores('NEURAL NETWORK', y_predtest1, y_predtest2)


In [None]:
class Classification:
    """Train several classifiers on one training set and score them on two test sets.

    Each model method fits on ``X_train``/``y_train``, predicts ``X_test1`` and
    ``X_test2`` and stores the accuracies in ``ACC_1`` / ``ACC_2`` under the
    model's display name.
    """

    # Data splits; set per instance in __init__.
    X_train,X_val,X_test1,X_test2 = None,None,None,None
    y_train,y_val,y_test1,y_test2 = None,None,None,None

    # Display names used as keys of the accuracy dictionaries.
    MODEL_NAMES = ('DECISION TREE','LOGISTIC REGRESSION','NAIVE BAYES','NEURAL NETWORK')

    # Class-level defaults (None means "model not run yet"). __init__ gives
    # every instance its own dictionaries so results never leak between instances.
    ACC_1 = dict.fromkeys(MODEL_NAMES)
    ACC_2 = dict.fromkeys(MODEL_NAMES)

    def __init__(self,X_train,y_train,X_val,y_val,X_test1,y_test1,X_test2,y_test2):
        """Store the data splits and create fresh, per-instance accuracy tables."""
        self.X_train = X_train
        self.X_val = X_val
        self.X_test1 = X_test1
        self.X_test2 = X_test2
        self.y_train = y_train
        self.y_val = y_val
        self.y_test1 = y_test1
        self.y_test2 = y_test2

        self.ACC_1 = dict.fromkeys(self.MODEL_NAMES)
        self.ACC_2 = dict.fromkeys(self.MODEL_NAMES)

    def _record_scores(self, name, pred_test1, pred_test2):
        """Store the accuracy of both test-set predictions under display name ``name``."""
        self.ACC_1[name] = accuracy_score(self.y_test1,pred_test1)
        self.ACC_2[name] = accuracy_score(self.y_test2,pred_test2)

    def _fit_and_score(self, name, estimator):
        """Fit an sklearn-style ``estimator`` on the training split and record its accuracies."""
        fitted = estimator.fit(self.X_train,self.y_train)
        self._record_scores(name, fitted.predict(self.X_test1), fitted.predict(self.X_test2))

    def _results_frame(self, options=('all',)):
        """Build the accuracy table for ``options``.

        Returns a DataFrame with rows ``TEST1``/``TEST2`` and one column per
        selected model, or ``None`` (after printing a message) when ``options``
        contains an unknown model name.
        """
        options = list(options)
        if options and options[0] == 'all':
            selected = list(self.ACC_1)
        else:
            if any(option not in self.ACC_1 for option in options):
                print("Invalid Options Present")
                return None
            selected = options

        rows = [{name: table[name] for name in selected} for table in (self.ACC_1, self.ACC_2)]
        return pd.DataFrame(rows,index=['TEST1','TEST2'])

    def tabulate(self,options=('all',)):
        """Display the accuracy table.

        ``options`` is either ``['all']`` or a list of display names taken from
        ``MODEL_NAMES`` (e.g. ``['DECISION TREE', 'NAIVE BAYES']``).
        """
        E_TEST_df = self._results_frame(options)
        if E_TEST_df is None:
            return
        display(E_TEST_df)

    def model(self,model_type = 'decision_tree'):
        """Run the requested model(s) and display their accuracies.

        ``model_type`` is one of ``'all'``, ``'logistic_regression'``,
        ``'decision_tree'``, ``'naive_bayes'`` or ``'neural_network'``.
        ``'all'`` runs every model except the neural network, which therefore
        stays ``None`` in the table.
        """
        runners = {
            'logistic_regression': (self.logistic_regression_sklearn, 'LOGISTIC REGRESSION'),
            'decision_tree': (self.decision_tree_sklearn, 'DECISION TREE'),
            'naive_bayes': (self.naive_bayes_sklearn, 'NAIVE BAYES'),
            'neural_network': (self.neural_network, 'NEURAL NETWORK'),
        }

        if model_type == 'all':
            self.logistic_regression_sklearn()
            self.decision_tree_sklearn()
            self.naive_bayes_sklearn()
            self.tabulate(['all'])
        elif model_type in runners:
            run, display_name = runners[model_type]
            run()
            self.tabulate([display_name])
        else:
            print("Unknown model_type: %r" % (model_type,))

    def logistic_regression_sklearn(self):
        """Logistic regression with a fixed seed."""
        self._fit_and_score('LOGISTIC REGRESSION', LogisticRegression(random_state=0))

    def generate_confusion_matrix(self,predictions,true_values):
        """Print the confusion matrix (rows: true labels, columns: predictions)."""
        # sklearn's signature is confusion_matrix(y_true, y_pred); passing the
        # arguments the other way round would transpose the matrix.
        print(confusion_matrix(true_values,predictions))

    def decision_tree_sklearn(self):
        """Decision-tree classifier with a fixed seed."""
        self._fit_and_score('DECISION TREE', DecisionTreeClassifier(random_state=0))

    def naive_bayes_sklearn(self):
        """Gaussian naive Bayes classifier."""
        self._fit_and_score('NAIVE BAYES', GaussianNB())

    # Backward-compatible alias: the method was originally named naive_bayes,
    # while model() looks it up as naive_bayes_sklearn.
    naive_bayes = naive_bayes_sklearn

    def neural_network(self):
        """Small dense softmax network (64-32-n_classes) trained with Adam."""
        # Map the labels to 0..k-1 so they can index the softmax outputs.
        classes, y_train_idx = np.unique(np.asarray(self.y_train).ravel(), return_inverse=True)
        # NOTE(review): assumes every validation label also occurs in y_train.
        y_val_idx = np.searchsorted(classes, np.asarray(self.y_val).ravel())

        # No of Outputs
        outputs = len(classes)

        # Model Architecture
        model = Sequential()
        model.add(Dense(64, activation='relu'))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(outputs,activation='softmax'))

        # The targets are integer class indices, not one-hot vectors, so the
        # sparse variant of the cross-entropy loss is required.
        model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
        model.fit(self.X_train, y_train_idx,validation_data=(self.X_val,y_val_idx), epochs=100, batch_size=12)

        # predict() returns class probabilities; take the most likely class and
        # translate it back to the original label before scoring.
        y_predtest1 = classes[np.argmax(model.predict(self.X_test1), axis=1)]
        y_predtest2 = classes[np.argmax(model.predict(self.X_test2), axis=1)]

        self._record_scores('NEURAL NETWORK', y_predtest1, y_predtest2)


NameError: name 'X_train' is not defined

In [None]:
# One preprocessor per CSV file: the first dataset supplies the training and
# validation splits, the second one supplies the two test sets.
pre_1, pre_2 = (
    Preprocess(csv_path)
    for csv_path in ('Test_Dataset_ to_ INTERN.csv', 'To Intern_New Dataset for testing.csv')
)

# preprocess() returns [X_train, y_train, X_val, y_val]; for the second
# dataset those two splits are used as test set 1 and test set 2.
(X_train, y_train, X_val, y_val) = pre_1.preprocess(test_size=0.3)
(X_test1, y_test1, X_test2, y_test2) = pre_2.preprocess(test_size=0.3)

In [None]:
# Task to run: 'regression' selects the Regression suite, anything else the
# Classification suite.
model_type = 'regression'

# Both suites receive the same train/validation/test splits.
splits = (X_train, y_train, X_val, y_val, X_test1, y_test1, X_test2, y_test2)
classification = Classification(*splits)
regression = Regression(*splits)
del splits  # helper tuple only; keep the notebook namespace unchanged

# model(model_type='all') runs the suite's standard models and shows the table.
# Single models can be requested instead, e.g. 'linear_regression' or
# 'decision_tree_sklearn' for Regression, 'logistic_regression' or
# 'decision_tree' for Classification.
(regression if model_type == 'regression' else classification).model(model_type='all')

NameError: name 'Classification' is not defined