Importing Numpy and Pandas Library

In [1]:
import numpy as np
import pandas as pd
%pylab inline

Populating the interactive namespace from numpy and matplotlib


Mounting Google Drive to Colab

In [2]:
from google.colab import drive
drive.mount('/content/gdrive',force_remount=True)

Mounted at /content/gdrive


Reading data from TSV file

In [3]:
columns = ['Index','Category','word_vector','Label']
train = pd.read_csv("/content/gdrive/MyDrive/DMA Data Set/Task_1/Training.tsv",sep='\t',names = columns, header=0)
train.head()

Unnamed: 0,Index,Category,word_vector,Label
0,0,24,"[[-0.507, -0.49518, 0.46885, 0.54524, -0.11552...",2.0
1,1,13,"[[0.19911, -0.46156, 0.19674, -1.3298, 0.51805...",1.0
2,2,24,"[[-0.7403, -0.78746, 0.47018, 0.43474, 0.05842...",1.0
3,3,21,"[[-0.44257, -0.54624, 0.25403, 0.80731, 1.026,...",1.0
4,4,13,"[[0.19911, -0.46156, 0.19674, -1.3298, 0.51805...",2.0


Printing percentage of Nan values in the Dataset

In [4]:
print(train.isna().sum())
# We can see that there are no Null values available in the Training Dataset

Index          0
Category       0
word_vector    0
Label          0
dtype: int64


Resetting the Index of the dataframe and dropping the Index attribute

In [5]:
train.reset_index(inplace=True)
train.drop(['index','Index'],axis='columns', inplace=True)
print(train.shape[0])
train.head()

123040


Unnamed: 0,Category,word_vector,Label
0,24,"[[-0.507, -0.49518, 0.46885, 0.54524, -0.11552...",2.0
1,13,"[[0.19911, -0.46156, 0.19674, -1.3298, 0.51805...",1.0
2,24,"[[-0.7403, -0.78746, 0.47018, 0.43474, 0.05842...",1.0
3,21,"[[-0.44257, -0.54624, 0.25403, 0.80731, 1.026,...",1.0
4,13,"[[0.19911, -0.46156, 0.19674, -1.3298, 0.51805...",2.0


Calculating the Row count of each word vector and calculating the mean of each word vector (vertically)

In [6]:
row_count = []
from tqdm import tqdm,trange
for i in tqdm(range(123040),desc="Numpy Array conversion Progess:"): #Total 123040 rows
  a = eval(train.word_vector[i])
  row_count.append(len(a))
  A = np.array(a,dtype = 'f')
  train.word_vector[i] = np.mean(A,axis=0)
train['row_count'] = row_count

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
Numpy Array conversion Progess:: 100%|██████████| 123040/123040 [04:05<00:00, 502.08it/s]


Creating Data Frame with Hot Encoded Category values

In [7]:
df2 = train.copy() # Training Dataset with Mean of word Vector
df3 = train.copy() #Catagory Hot-Encoded dataset with Mean of Word Vectors

def convert_to_binary(df, column_to_convert):
  categories = list(df[column_to_convert].drop_duplicates())

  for category in categories:
    cat_name = str(category)
    col_name = str(column_to_convert) + '_' + cat_name
    df[col_name] = 0
    df.loc[(df[column_to_convert] == category), col_name] = 1

  return df
df3 = convert_to_binary(df=df3, column_to_convert='Category')
df3.drop('Category', axis=1, inplace=True)

Creating 100 dimensions for 100 values in the word_vector-

In [8]:
dim_vector = {}
for i in range(1,101):
  x = dim_vector.setdefault(i,[])
for i in tqdm(range(123040),desc="Progress:"):#Total 123040 rows
  for j in range(100):
    dim_vector[j+1].append(df3.word_vector[i][j])

Progress:: 100%|██████████| 123040/123040 [02:10<00:00, 940.85it/s]


In [9]:
for i in range(1,101):
  key_name = 'dim_'+str(i)
  # df2[key_name] = dim_vector[i]
  df3[key_name] = dim_vector[i]

In [10]:
from sklearn.preprocessing import LabelEncoder

labels = df3['Label']

# Label encoding for Malware Label Column
le = LabelEncoder()
Y = le.fit_transform(labels)

X = df3.drop(['Label','word_vector'], axis=1, inplace=False)

Splitting the Data into 80:20 Train-Test split

In [11]:
#from sklearn import cross_validation
from sklearn.model_selection import train_test_split
## Spliting of training dataset into 80% training data and 20% testing data randomly
features_train, features_test, labels_train, labels_test = train_test_split(X, Y, test_size=0.2, random_state=1)

Using decision tree to obtain a model from Training data

In [12]:
# features_train.info()
## Decision Tree 
from sklearn import tree
clf = tree.DecisionTreeClassifier(min_samples_split=50, max_depth = 5, criterion = 'entropy')
## min_samples_split=50 max_depth = 5 criterion='entropy'
clf.fit(features_train, labels_train)
prediction = clf.predict(features_test)
from sklearn.metrics import accuracy_score
print(accuracy_score(prediction, labels_test)*100,'%')

74.25633940182055 %


Using Gaussian Naive Bayes

In [13]:
## Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(features_train, labels_train)
prediction = clf.predict(features_test)

## Computing accuracy
print (accuracy_score(prediction, labels_test)*100,'%')

63.4834200260078 %


Decision Tree with GridSearchCV

In [14]:
from sklearn import tree
from sklearn.model_selection import GridSearchCV
parameters = {'criterion':('gini', 'entropy'), 'max_depth':[1, 3, 5, 10]}
dt = tree.DecisionTreeClassifier()
clf = GridSearchCV(dt, parameters)

clf.fit(features_train, labels_train)
prediction = clf.predict(features_test)

## Computing accuracy
from sklearn.metrics import accuracy_score
print (accuracy_score(prediction, labels_test)*100,'%')

78.324122236671 %


XGBoost Classifier

In [15]:
from xgboost import XGBClassifier
clf = XGBClassifier()
clf.fit(np.array(features_train), np.array(labels_train))
prediction = clf.predict(np.array(features_test))
print (accuracy_score(prediction, labels_test)*100,'%')

78.60858257477244 %


Gradient Boositng Classifier

In [16]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0).fit(features_train, labels_train)
prediction = clf.predict(features_test)
from sklearn.metrics import accuracy_score
print (accuracy_score(prediction, labels_test)*100,'%')

79.03120936280884 %


Extra Trees Classifier

In [17]:
from sklearn.ensemble import ExtraTreesClassifier
clf = ExtraTreesClassifier(n_estimators=100,random_state=0)
clf.fit(features_train,labels_train)
prediction = clf.predict(features_test)
print (accuracy_score(prediction, labels_test)*100,'%')

79.68546814044213 %


LGBM Classifier

In [18]:
import lightgbm as lgb
clf = lgb.LGBMClassifier()
clf.fit(features_train, labels_train)
prediction = clf.predict(features_test)
print (accuracy_score(prediction, labels_test)*100,'%')

79.68140442132639 %


Bias Variance Curve

In [19]:
test_perc_list = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
!pip install eli5
from sklearn.model_selection import learning_curve
from sklearn.metrics import mean_squared_log_error
import eli5
from tqdm import tqdm_notebook
from sklearn.model_selection import KFold

Collecting eli5
  Downloading eli5-0.11.0-py2.py3-none-any.whl (106 kB)
[?25l[K     |███                             | 10 kB 28.0 MB/s eta 0:00:01[K     |██████▏                         | 20 kB 31.2 MB/s eta 0:00:01[K     |█████████▎                      | 30 kB 23.0 MB/s eta 0:00:01[K     |████████████▍                   | 40 kB 18.7 MB/s eta 0:00:01[K     |███████████████▌                | 51 kB 8.9 MB/s eta 0:00:01[K     |██████████████████▌             | 61 kB 9.4 MB/s eta 0:00:01[K     |█████████████████████▋          | 71 kB 8.0 MB/s eta 0:00:01[K     |████████████████████████▊       | 81 kB 8.9 MB/s eta 0:00:01[K     |███████████████████████████▉    | 92 kB 7.8 MB/s eta 0:00:01[K     |███████████████████████████████ | 102 kB 8.5 MB/s eta 0:00:01[K     |████████████████████████████████| 106 kB 8.5 MB/s 
Installing collected packages: eli5
Successfully installed eli5-0.11.0




In [20]:
estimator=ExtraTreesClassifier(n_estimators=100,random_state=0)
cv=5
n_jobs=-1
train_sizes, train_scores, test_scores = learning_curve(estimator,X, Y, cv=cv, n_jobs=n_jobs, train_sizes=test_perc_list)

In [21]:
def get_mean(arr):
    final= []
    for i in range(arr.shape[0]):
        final.append(arr[i].mean())
    return final

In [22]:
trainFinal,testFinal = get_mean(train_scores),get_mean(test_scores)
import plotly.graph_objects as go

In [23]:
fig=go.Figure()
x,y,z = test_perc_list,trainFinal,testFinal
fig.add_trace(go.Scatter(x=x , y=y , name = 'Train Score',line_shape = 'linear'))
fig.add_trace(go.Scatter(x=x , y=z , name = 'Test Score',line_shape = 'linear'))
fig.update_layout(title_text = 'Bais Variance TradeOff')
fig.update_xaxes(title_text='Train Size' , showgrid = False)
fig.update_yaxes(title_text='Accuracy' , showgrid = False)
fig.show()