In [1]:
# import required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import GridSearchCV
%matplotlib inline

In [2]:
# reading data from csv
data = pd.read_csv('musk_csv.csv')

In [3]:
data.head()

Unnamed: 0,ID,molecule_name,conformation_name,f1,f2,f3,f4,f5,f6,f7,...,f158,f159,f160,f161,f162,f163,f164,f165,f166,class
0,1,MUSK-211,211_1+1,46,-108,-60,-69,-117,49,38,...,-308,52,-7,39,126,156,-50,-112,96,1
1,2,MUSK-211,211_1+10,41,-188,-145,22,-117,-6,57,...,-59,-2,52,103,136,169,-61,-136,79,1
2,3,MUSK-211,211_1+11,46,-194,-145,28,-117,73,57,...,-134,-154,57,143,142,165,-67,-145,39,1
3,4,MUSK-211,211_1+12,41,-188,-145,22,-117,-7,57,...,-60,-4,52,104,136,168,-60,-135,80,1
4,5,MUSK-211,211_1+13,41,-188,-145,22,-117,-7,57,...,-60,-4,52,104,137,168,-60,-135,80,1


In [4]:
data.describe()

Unnamed: 0,ID,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f158,f159,f160,f161,f162,f163,f164,f165,f166,class
count,6598.0,6598.0,6598.0,6598.0,6598.0,6598.0,6598.0,6598.0,6598.0,6598.0,...,6598.0,6598.0,6598.0,6598.0,6598.0,6598.0,6598.0,6598.0,6598.0,6598.0
mean,3299.5,58.945135,-119.128524,-73.14656,-0.628372,-103.533495,18.359806,-14.108821,-1.85829,-86.003031,...,-184.798272,-75.795696,-26.073204,64.616702,112.037739,201.76023,-47.48833,-150.259927,41.770233,0.154138
std,1904.82287,53.249007,90.813375,67.956235,80.444617,64.387559,80.593655,115.315673,90.372537,108.326676,...,107.819514,127.861271,69.727964,100.861935,72.83504,59.526751,55.069365,76.019023,94.116085,0.361108
min,1.0,-31.0,-199.0,-167.0,-114.0,-118.0,-183.0,-171.0,-225.0,-245.0,...,-328.0,-219.0,-136.0,-120.0,-69.0,73.0,-289.0,-428.0,-471.0,0.0
25%,1650.25,37.0,-193.0,-137.0,-70.0,-117.0,-28.0,-159.0,-85.0,-217.0,...,-272.0,-205.0,-70.0,-18.0,71.0,166.0,-68.0,-179.0,-9.0,0.0
50%,3299.5,44.0,-149.0,-99.0,-25.0,-117.0,33.0,27.0,19.0,-40.0,...,-234.0,-131.0,-21.0,61.5,107.0,191.0,-60.0,-150.0,27.0,0.0
75%,4948.75,53.0,-95.0,-19.0,42.0,-116.0,74.0,57.0,61.0,-21.0,...,-80.0,52.0,9.0,149.0,129.0,215.0,-45.0,-120.0,119.0,0.0
max,6598.0,292.0,95.0,81.0,161.0,325.0,200.0,220.0,320.0,147.0,...,94.0,179.0,192.0,411.0,355.0,625.0,295.0,168.0,367.0,1.0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6598 entries, 0 to 6597
Columns: 170 entries, ID to class
dtypes: int64(168), object(2)
memory usage: 8.6+ MB


In [6]:
# dropping the columns and taking only 1017 instance of both classes as the data is very unbalanced 
#  (instances of non musk >> instances of musk)
columns_for_dropping = ['ID','molecule_name','conformation_name','class']

data = data[:2*1017]

X = data.drop(columns_for_dropping,axis=1)
y = data['class']

data['class'].value_counts()

1    1017
0    1017
Name: class, dtype: int64

In [7]:
# scaling the data
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,shuffle=True)

In [8]:
# declaring svm model
# we used the svm model as we know that svm works best when there are more features and less no of examples
svm = SVC(kernel='linear',C=0.1)
svm.fit(X_train,y_train)

pred = svm.predict(X_test)

In [9]:
print(confusion_matrix(y_test,pred))

[[202   0]
 [  3 202]]


In [10]:
print(classification_report(y_test,pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       202
           1       1.00      0.99      0.99       205

    accuracy                           0.99       407
   macro avg       0.99      0.99      0.99       407
weighted avg       0.99      0.99      0.99       407



In [11]:
print('accuracy is --> ',round(accuracy_score(y_test,pred)*100),'%')

accuracy is -->  99.0 %


In [13]:
# import joblib
# joblib.dump(svm,'svm_model.h5')

['svm_model.h5']

In [14]:
#live predictor 
#just paste in your values here (in place of data[1:2]) in the form of a list and it will predict if its class 0 or 1
value = data[1000:1001]             
values = value.drop(columns_for_dropping,axis=1)
values = scaler.transform(values)
prediction = svm.predict(values)
print('predicted --> ',prediction[0])
print('actual -> ',value['class'])

predicted -->  1
actual ->  1000    1
Name: class, dtype: int64
