IMPORTANT LIBRARIES

In [22]:
import numpy as np
import pandas as pd
import os
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix
from nltk.tokenize import WhitespaceTokenizer
import string
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import LabelBinarizer
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB
import sys
from elftools.elf.elffile import ELFFile
from elftools.elf.descriptions import (describe_symbol_type, describe_symbol_bind)
from elftools.elf.sections import SymbolTableSection
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import nltk

DATA EXTRACTION

In [23]:
# Already extracted strings from each file using shell script extract_strings.sh 
# Now extracting ELF header, SEGMENT header, SECTION header from each file 
# Then using extracted strings ,headers and some custom created features to create a dataframe

In [25]:
#PATH---directory of applications/files
#PATH1---directory of strings files
#label---malware(0),benignware(1)
#get_headers_and_strings() returning list of dictionaries ,where each dictionary has headers, custom features and strings of particular file with label
#count - total number of error files and also printing file names

def get_headers_and_strings(PATH,PATH1,label):
  lst = []
  count = 0
  for f in os.listdir(PATH):
        try: elffile = ELFFile(open(PATH+f,"rb"))
        except Exception as e : 
          print(PATH+f+"-----------error---------------"+str(e))
          count += 1
          continue
        #tmp_k - keys names
        #tmp_v - keys values
        tmp_k = list()
        tmp_v = list()
        
        #e_phoff,e_shoff not considered from ELF header
        for key in elffile.header.keys() :
            if(key == "e_phoff" or key == "e_shoff") : continue
            if(key == 'e_ident'):
                for key1 in elffile.header[key].keys() : 
                    tmp_k.append(key1)
                    if(key1 == "EI_MAG") : tmp_v.append(str(elffile.header[key][key1]))
                    else : tmp_v.append(elffile.header[key][key1])
            else : 
                tmp_k.append(key)
                tmp_v.append(elffile.header[key])

        #iterate over each segments header 
        #consider 'p_type' as feature
        #custom feature (n_segments) = number of segments
        tmp_k.append("n_segments")
        tmp_v.append(elffile.num_segments())
        for seg in elffile.iter_segments():
            tmp_k.append(seg.header['p_type'])
            tmp_v.append(1)

        #get symbol table from '.dynsym' 
        #custom labels :
        #nsym - number of symbols
        #glo - number of global symbols
        #local - number of local symbols
        #weak - number of weak symbols
        #glo1 - number of FUNC global symbols
        #local1 - number of FUNC local symbols
        #weak1 - number of FUNC weak symbols
        #glo2 - number of OBJECT global symbols
        #local2 - number of OBJECT local symbols
        #weak2 - number of OBJECT weak symbols
        
        nsym = 0

        glo = 0
        local = 0
        weak = 0

        glo1 = 0
        local1 = 0
        weak1 = 0

        glo2 = 0
        local2 = 0
        weak2 = 0
                
        section = elffile.get_section_by_name('.dynsym')

        if(isinstance(section, SymbolTableSection) == True) : 
            if section['sh_entsize'] != 0 : 
                
                nsym = section.num_symbols()
                
                for symbol in section.iter_symbols() : 
                    symtype = describe_symbol_type(symbol['st_info']['type'])
                    symbind = describe_symbol_bind(symbol['st_info']['bind'])
                    
                    if(symbind == "GLOBAL") : glo += 1
                    elif(symbind == "LOCAL") : local += 1
                    elif(symbind == "WEAK") : weak += 1
                            
                    if(symtype == "FUNC") :
                        if(symbind == "GLOBAL") : glo1 += 1
                        elif(symbind == "LOCAL") : local1 += 1
                        elif(symbind == "WEAK") : weak1 += 1
                    elif(symtype == "OBJECT") :
                        if(symbind == "GLOBAL") : glo2 += 1
                        elif(symbind == "LOCAL") : local2 += 1
                        elif(symbind == "WEAK") : weak2 += 1
                    
        tmp_k.extend(["n_symbols","n_GLOBAL","n_LOCAL","n_WEAK","n_FUNC_GLOBAL","n_FUNC_LOCAL","n_FUNC_WEAK","n_OBJECT_GLOBAL","n_OBJECT_LOCAL","n_OBJECT_WEAK"])
        tmp_v.extend([nsym,glo,local,weak,glo1,local1,weak1,glo2,local2,weak2])

        #Per Section header considered
        #sh_addr,sh_offset,sh_name not considered
        #custom feature(n_sections) = number of sections
        tmp_k.append("n_sections")
        tmp_v.append(elffile.num_sections())
        for section in elffile.iter_sections():
            for key in section.header.keys():
                if(key == 'sh_addr' or key == 'sh_offset' or key == 'sh_name') : continue
                else :
                    tmp_k.append(section.name + "_" + key)
                    tmp_v.append(section.header[key])
        
        try :
            #strings extracted are added here as feature
            with open(PATH1+f.replace('.','')+".txt") as f1:
                      tmp_str = ''
                      for line in f1.read().splitlines() : tmp_str = tmp_str + line + ' '
            tmp_k.append("Strings")
            tmp_v.append(tmp_str.strip())

            tmp_k.append("label")
            tmp_v.append(label)

            lst.append(dict(zip(tmp_k,tmp_v)))
        except : pass
  print("Number of Files showing Error and not included : %d\n" % count)
  return lst

In [26]:
#getting headers and strings
headers_b = get_headers_and_strings("C:\\Users\\KIIT\\Downloads\\IIT Palakkad\\Academics\\semester 2\\ai for cyber security\\assignment\\malware detection ELF linux\\ELF_data\\ELF_data\\ELF_Dataset\\Benignware\\",
                                    "C:\\Users\\KIIT\\Downloads\\IIT Palakkad\\Academics\\semester 2\\ai for cyber security\\assignment\\malware detection ELF linux\\elf_benignware_strings-20220402T173246Z-001\\elf_benignware_strings\\",1)

headers_m = get_headers_and_strings("C:\\Users\\KIIT\\Downloads\\IIT Palakkad\\Academics\\semester 2\\ai for cyber security\\assignment\\malware detection ELF linux\\ELF_data\\ELF_data\\ELF_Dataset\\Malware\\",
                                    "C:\\Users\\KIIT\\Downloads\\IIT Palakkad\\Academics\\semester 2\\ai for cyber security\\assignment\\malware detection ELF linux\\elf_malware_strings-20220402T173247Z-001\\elf_malware_strings\\",0)

C:\Users\KIIT\Downloads\IIT Palakkad\Academics\semester 2\ai for cyber security\assignment\malware detection ELF linux\ELF_data\ELF_data\ELF_Dataset\Benignware\python3-----------error---------------Magic number does not match

C:\Users\KIIT\Downloads\IIT Palakkad\Academics\semester 2\ai for cyber security\assignment\malware detection ELF linux\ELF_data\ELF_data\ELF_Dataset\Benignware\ranlib-----------error---------------Magic number does not match

C:\Users\KIIT\Downloads\IIT Palakkad\Academics\semester 2\ai for cyber security\assignment\malware detection ELF linux\ELF_data\ELF_data\ELF_Dataset\Benignware\rbash-----------error---------------Magic number does not match

C:\Users\KIIT\Downloads\IIT Palakkad\Academics\semester 2\ai for cyber security\assignment\malware detection ELF linux\ELF_data\ELF_data\ELF_Dataset\Benignware\rcp-----------error---------------Magic number does not match

C:\Users\KIIT\Downloads\IIT Palakkad\Academics\semester 2\ai for cyber security\assignment\malware 

In [27]:
#creating dataframe
df = pd.DataFrame(headers_b)
df = df.append(headers_m)
df.reset_index(inplace=True,drop=True)
df.head(5)

Unnamed: 0,EI_MAG,EI_CLASS,EI_DATA,EI_VERSION,EI_OSABI,EI_ABIVERSION,e_type,e_machine,e_version,e_entry,...,.constdata_sh_info,.constdata_sh_addralign,.constdata_sh_entsize,.arm_vfe_header_sh_type,.arm_vfe_header_sh_flags,.arm_vfe_header_sh_size,.arm_vfe_header_sh_link,.arm_vfe_header_sh_info,.arm_vfe_header_sh_addralign,.arm_vfe_header_sh_entsize
0,"[127, 69, 76, 70]",ELFCLASS64,ELFDATA2LSB,EV_CURRENT,ELFOSABI_SYSV,0,ET_DYN,EM_X86_64,EV_CURRENT,19808,...,,,,,,,,,,
1,"[127, 69, 76, 70]",ELFCLASS64,ELFDATA2LSB,EV_CURRENT,ELFOSABI_SYSV,0,ET_DYN,EM_X86_64,EV_CURRENT,5296,...,,,,,,,,,,
2,"[127, 69, 76, 70]",ELFCLASS64,ELFDATA2LSB,EV_CURRENT,ELFOSABI_SYSV,0,ET_DYN,EM_X86_64,EV_CURRENT,16656,...,,,,,,,,,,
3,"[127, 69, 76, 70]",ELFCLASS64,ELFDATA2LSB,EV_CURRENT,ELFOSABI_SYSV,0,ET_DYN,EM_X86_64,EV_CURRENT,1024096,...,,,,,,,,,,
4,"[127, 69, 76, 70]",ELFCLASS64,ELFDATA2LSB,EV_CURRENT,ELFOSABI_SYSV,0,ET_DYN,EM_X86_64,EV_CURRENT,19632,...,,,,,,,,,,


In [28]:
df.shape

(921, 807)

In [29]:
#saving dataframe
df.to_csv("df_elf.csv",index=False)

In [38]:
#fetching dataframe
df = pd.read_csv("df_elf.csv")
df.head(5)

Unnamed: 0,EI_MAG,EI_CLASS,EI_DATA,EI_VERSION,EI_OSABI,EI_ABIVERSION,e_type,e_machine,e_version,e_entry,...,.constdata_sh_info,.constdata_sh_addralign,.constdata_sh_entsize,.arm_vfe_header_sh_type,.arm_vfe_header_sh_flags,.arm_vfe_header_sh_size,.arm_vfe_header_sh_link,.arm_vfe_header_sh_info,.arm_vfe_header_sh_addralign,.arm_vfe_header_sh_entsize
0,"[127, 69, 76, 70]",ELFCLASS64,ELFDATA2LSB,EV_CURRENT,ELFOSABI_SYSV,0,ET_DYN,EM_X86_64,EV_CURRENT,19808,...,,,,,,,,,,
1,"[127, 69, 76, 70]",ELFCLASS64,ELFDATA2LSB,EV_CURRENT,ELFOSABI_SYSV,0,ET_DYN,EM_X86_64,EV_CURRENT,5296,...,,,,,,,,,,
2,"[127, 69, 76, 70]",ELFCLASS64,ELFDATA2LSB,EV_CURRENT,ELFOSABI_SYSV,0,ET_DYN,EM_X86_64,EV_CURRENT,16656,...,,,,,,,,,,
3,"[127, 69, 76, 70]",ELFCLASS64,ELFDATA2LSB,EV_CURRENT,ELFOSABI_SYSV,0,ET_DYN,EM_X86_64,EV_CURRENT,1024096,...,,,,,,,,,,
4,"[127, 69, 76, 70]",ELFCLASS64,ELFDATA2LSB,EV_CURRENT,ELFOSABI_SYSV,0,ET_DYN,EM_X86_64,EV_CURRENT,19632,...,,,,,,,,,,


GETTING DATAFRAME READY

In [39]:
#there are some columns which have null values
cnt = 0
for c in df.columns:
    if(df[c].isnull().values.any()):cnt += 1
print("Number of features that contain null value : ",cnt)

Number of features that contain null value :  747


In [40]:
#function will drop categorical feature
#And >>>> create separate feature for each unique value of feature (more then 2 unique values for features)
# >>>>> modify feature as 0 or 1 value (2 unique value)
 
def preprocess_cat_fea(df,feature):
    lb = LabelBinarizer()
    lb.fit(df[feature])
    trans = lb.transform(df[feature])
    if(len(pd.unique(df[feature])) > 2) : tmp_col = [feature+"("+str(v)+")" for v in pd.unique(df[feature])]
    else : tmp_col = [feature+"_modified"]
    trans_df = pd.DataFrame(trans,columns=tmp_col,index=df.index)
    return pd.concat([df,trans_df],axis=1).drop([feature], axis=1)

In [42]:
#don't consider 'Strings' feature
#if column has string values then it is categorical feature
#if number of unique values of categorical feature is greater than 1 , then we need to preprocess categorical feature
#else drop the categorical columns with single unique values
#before preprocess fill null values of categorical feature with "NAN"
#Also convert all the values of feature to string
#Also fill null values of other features with 0
categorical_features = list()
for col in df.columns :
  if(col == "Strings") : continue
  is_str = False
  for uni in pd.unique(df[col]) : 
    if(type(uni) is str) :
      if(len(pd.unique(df[col])) > 1) :
        is_str = True
        break
      else : df.drop([col],axis=1,inplace = True)
  if(is_str) : 
    df.fillna("NAN",inplace = True)
    df[col] = df[col].astype(str)
    df = preprocess_cat_fea(df,col)
    categorical_features.append(col)
  else : df.fillna(0,inplace = True) 

In [44]:
#list of categorical features
categorical_features[:5]

['EI_CLASS', 'EI_DATA', 'EI_OSABI', 'e_type', 'e_machine']

In [45]:
df.shape

(921, 829)

In [46]:
#there are some columns which have null values
cnt = 0
for c in df.columns:
    if(df[c].isnull().values.any()):cnt += 1
print("Number of features that contain null value : ",cnt)

Number of features that contain null value :  0


PREPROCESS STRINGS

In [47]:
#preprocess function
def preprocess(mod_data) : 
    #lowercase
    mod_data['Strings']= mod_data['Strings'].apply(lambda x: x.lower())
    #whitespace tokenizer
    tok = WhitespaceTokenizer()
    mod_data['Strings']= mod_data['Strings'].apply(lambda x: " ".join(tok.tokenize(x)))

    return mod_data

In [48]:
#preprocessing strings
df = preprocess(df)
df.shape

(921, 829)

COUNT VECTORIZER TO GET STRINGS VECTORS

In [49]:
#using COUNT VECTORIZER to get vectors for each string
#then dropping original strings column and replacing it with new Count Vectorizer vectors
#removing words which either appears in more then 90% of document or less then 5% of document 
cv = CountVectorizer(max_df = 0.9, min_df = 0.05)
trans_df = pd.DataFrame(cv.fit_transform(df["Strings"]).toarray(),index=df.index)
df = pd.concat([df,trans_df],axis=1).drop(["Strings"], axis=1)
df.head(5)

Unnamed: 0,EI_ABIVERSION,e_entry,e_flags,e_ehsize,e_phentsize,e_phnum,e_shentsize,e_shnum,e_shstrndx,n_segments,...,1845,1846,1847,1848,1849,1850,1851,1852,1853,1854
0,0,19808,0,64,56,13,64,30,29,13,...,0,0,0,0,0,0,0,0,0,1
1,0,5296,0,64,56,13,64,30,29,13,...,0,0,0,0,0,0,0,0,0,0
2,0,16656,0,64,56,13,64,31,30,13,...,0,0,0,0,1,0,0,0,1,0
3,0,1024096,0,64,56,13,64,29,28,13,...,0,0,0,0,0,1,0,0,0,0
4,0,19632,0,64,56,13,64,32,31,13,...,0,0,0,0,0,0,0,0,0,0


In [50]:
#splitting data into X(features) and Y(labels) for further operations
Y = df['label']
X = df.drop(['label'],axis=1)

FEATURE SCALING

In [51]:
#Feature scaling using MinMaxScaler, so that data will lie between (0,1)
mms = MinMaxScaler()
X = pd.DataFrame(mms.fit_transform(X),index=X.index)
X.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2672,2673,2674,2675,2676,2677,2678,2679,2680,2681
0,0.0,4e-06,0.0,1.0,1.0,0.909091,1.0,0.735294,0.787879,0.909091,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013699
1,0.0,1e-06,0.0,1.0,1.0,0.909091,1.0,0.735294,0.787879,0.909091,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,3e-06,0.0,1.0,1.0,0.909091,1.0,0.764706,0.818182,0.909091,...,0.0,0.0,0.0,0.0,0.018519,0.0,0.0,0.0,0.022222,0.0
3,0.0,0.000212,0.0,1.0,1.0,0.909091,1.0,0.705882,0.757576,0.909091,...,0.0,0.0,0.0,0.0,0.0,0.034483,0.0,0.0,0.0,0.0
4,0.0,4e-06,0.0,1.0,1.0,0.909091,1.0,0.794118,0.848485,0.909091,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


FEATURE SELECTIONS

In [52]:
#Removing features with 0 variance or same values
vt = VarianceThreshold(0.05)
X = pd.DataFrame(vt.fit_transform(X),index=X.index)
X.shape

(921, 724)

In [53]:
#selecting 1000 best feature after doing chi-squared on it
X = pd.DataFrame(SelectKBest(chi2, k=100).fit_transform(X, Y),index=X.index)
X.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,1.0,1.0,0.909091,1.0,0.909091,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.909091,1.0,0.909091,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
2,1.0,1.0,0.909091,1.0,0.909091,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0
3,1.0,1.0,0.909091,1.0,0.909091,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
4,1.0,1.0,0.909091,1.0,0.909091,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0


SPLITTING DATAFRAME IN TRAINING AND TESTING

In [54]:
#spliting Dataframe in training and testing set as 70% and 30%
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, shuffle=True, random_state=42)

In [55]:
#function for displaying accuracy,f1-score,confusion matrix,recall,precision
def show_report(test,pred,type):
  print(type+"=====================================================================================>>>\n")
  print("\nAccuracy : {:.4f}".format(accuracy_score(test, pred)))
  print("\nClassification report : \n", metrics.classification_report(test, pred))
  print("\nConfusion Matrix : \n", metrics.confusion_matrix(test, pred))

MULTINOMIAL NAIVE BAYES

In [56]:
#getting good accuracy on testing dataset
mnb = MultinomialNB().fit(X_train,Y_train)
pred_train = mnb.predict(X_train)
show_report(Y_train,pred_train,"TRAIN")
pred_test = mnb.predict(X_test)
show_report(Y_test,pred_test,"TEST")






Accuracy : 0.9674



Classification report : 

               precision    recall  f1-score   support



           0       1.00      0.94      0.97       347

           1       0.93      1.00      0.97       297



    accuracy                           0.97       644

   macro avg       0.97      0.97      0.97       644

weighted avg       0.97      0.97      0.97       644





Confusion Matrix : 

 [[326  21]

 [  0 297]]






Accuracy : 0.9856



Classification report : 

               precision    recall  f1-score   support



           0       1.00      0.97      0.99       136

           1       0.97      1.00      0.99       141



    accuracy                           0.99       277

   macro avg       0.99      0.99      0.99       277

weighted avg       0.99      0.99      0.99       277





Confusion Matrix : 

 [[132   4]

 [  0 141]]


GAUSSIAN NAIVE BAYES

In [57]:
#getting very good accuracy on training as well as testing dataset
gnb = GaussianNB().fit(X_train,Y_train)
pred_train = gnb.predict(X_train)
show_report(Y_train,pred_train,"TRAIN")
pred_test = gnb.predict(X_test)
show_report(Y_test,pred_test,"TEST")






Accuracy : 1.0000



Classification report : 

               precision    recall  f1-score   support



           0       1.00      1.00      1.00       347

           1       1.00      1.00      1.00       297



    accuracy                           1.00       644

   macro avg       1.00      1.00      1.00       644

weighted avg       1.00      1.00      1.00       644





Confusion Matrix : 

 [[347   0]

 [  0 297]]






Accuracy : 0.9928



Classification report : 

               precision    recall  f1-score   support



           0       0.99      1.00      0.99       136

           1       1.00      0.99      0.99       141



    accuracy                           0.99       277

   macro avg       0.99      0.99      0.99       277

weighted avg       0.99      0.99      0.99       277





Confusion Matrix : 

 [[136   0]

 [  2 139]]


BERNOULLI NAIVE BAYES

In [58]:
#getting very good accuracy testing dataset
bnb = BernoulliNB().fit(X_train,Y_train)
pred_train = bnb.predict(X_train)
show_report(Y_train,pred_train,"TRAIN")
pred_test = bnb.predict(X_test)
show_report(Y_test,pred_test,"TEST")






Accuracy : 0.9969



Classification report : 

               precision    recall  f1-score   support



           0       1.00      0.99      1.00       347

           1       0.99      1.00      1.00       297



    accuracy                           1.00       644

   macro avg       1.00      1.00      1.00       644

weighted avg       1.00      1.00      1.00       644





Confusion Matrix : 

 [[345   2]

 [  0 297]]






Accuracy : 0.9964



Classification report : 

               precision    recall  f1-score   support



           0       0.99      1.00      1.00       136

           1       1.00      0.99      1.00       141



    accuracy                           1.00       277

   macro avg       1.00      1.00      1.00       277

weighted avg       1.00      1.00      1.00       277





Confusion Matrix : 

 [[136   0]

 [  1 140]]


LOGISTIC REGRESSION

In [59]:
#getting very good accuracy on both training and testing dataset
lr = LogisticRegression(max_iter=1000,n_jobs=-1).fit(X_train,Y_train)
pred_train = lr.predict(X_train)
show_report(Y_train,pred_train,"TRAIN")
pred_test = lr.predict(X_test)
show_report(Y_test,pred_test,"TEST")






Accuracy : 1.0000



Classification report : 

               precision    recall  f1-score   support



           0       1.00      1.00      1.00       347

           1       1.00      1.00      1.00       297



    accuracy                           1.00       644

   macro avg       1.00      1.00      1.00       644

weighted avg       1.00      1.00      1.00       644





Confusion Matrix : 

 [[347   0]

 [  0 297]]






Accuracy : 0.9964



Classification report : 

               precision    recall  f1-score   support



           0       0.99      1.00      1.00       136

           1       1.00      0.99      1.00       141



    accuracy                           1.00       277

   macro avg       1.00      1.00      1.00       277

weighted avg       1.00      1.00      1.00       277





Confusion Matrix : 

 [[136   0]

 [  1 140]]


RANDOM FOREST

In [60]:
#getting very high accuracy for training and testing dataset
rf = RandomForestClassifier(n_estimators=200, max_depth=200, n_jobs=-1, random_state=0).fit(X_train,Y_train)
pred_train = rf.predict(X_train)
show_report(Y_train,pred_train,"TRAIN")
pred_test = rf.predict(X_test)
show_report(Y_test,pred_test,"TEST")






Accuracy : 1.0000



Classification report : 

               precision    recall  f1-score   support



           0       1.00      1.00      1.00       347

           1       1.00      1.00      1.00       297



    accuracy                           1.00       644

   macro avg       1.00      1.00      1.00       644

weighted avg       1.00      1.00      1.00       644





Confusion Matrix : 

 [[347   0]

 [  0 297]]






Accuracy : 0.9964



Classification report : 

               precision    recall  f1-score   support



           0       0.99      1.00      1.00       136

           1       1.00      0.99      1.00       141



    accuracy                           1.00       277

   macro avg       1.00      1.00      1.00       277

weighted avg       1.00      1.00      1.00       277





Confusion Matrix : 

 [[136   0]

 [  1 140]]


SUPPORT VECTOR MACHINE

In [61]:
#getting very good accuracy on both training and testing dataset
svc = SVC(C=0.4).fit(X_train,Y_train)
pred_train = svc.predict(X_train)
show_report(Y_train,pred_train,"TRAIN")
pred_test = svc.predict(X_test)
show_report(Y_test,pred_test,"TEST")






Accuracy : 0.9984



Classification report : 

               precision    recall  f1-score   support



           0       1.00      1.00      1.00       347

           1       1.00      1.00      1.00       297



    accuracy                           1.00       644

   macro avg       1.00      1.00      1.00       644

weighted avg       1.00      1.00      1.00       644





Confusion Matrix : 

 [[346   1]

 [  0 297]]






Accuracy : 0.9964



Classification report : 

               precision    recall  f1-score   support



           0       0.99      1.00      1.00       136

           1       1.00      0.99      1.00       141



    accuracy                           1.00       277

   macro avg       1.00      1.00      1.00       277

weighted avg       1.00      1.00      1.00       277





Confusion Matrix : 

 [[136   0]

 [  1 140]]


BEST MODEL (ACCURACY)

MultinomialNB(0.9856) < GaussianNB(0.9928) < BernoulliNB(0.9964) = SVM-RBF(0.9964) = Logistic Regression(0.9964) = Random Forest(0.9964)