In [36]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from collections import defaultdict
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
import sklearn
import pickle
from sklearn.metrics import classification_report, confusion_matrix 
import importlib.util

In [37]:
# importing template file 
spec = importlib.util.spec_from_file_location("Template", "/home/admin1/PycharmProjects/ML/Week10/Template/template.py")
foo = importlib.util.module_from_spec(spec)
spec.loader.exec_module(foo)
# creating object of Template class
temp = foo.Template()

In [38]:
# read file
df=pd.read_csv('Data/1625Data.txt',names=['octamers','flags'])
df.head()

Unnamed: 0,octamers,flags
0,SLNLRETN,1
1,AECFRIFD,1
2,HLVEALYL,1
3,TQIMFETF,1
4,AEELAEIF,1


In [39]:
df.shape

(1625, 2)

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1625 entries, 0 to 1624
Data columns (total 2 columns):
octamers    1625 non-null object
flags       1625 non-null int64
dtypes: int64(1), object(1)
memory usage: 25.5+ KB


In [41]:
df.describe()

Unnamed: 0,flags
count,1625.0
mean,-0.538462
std,0.842909
min,-1.0
25%,-1.0
50%,-1.0
75%,-1.0
max,1.0


In [42]:
# checks for null values
df.isnull().sum()

octamers    0
flags       0
dtype: int64

In [43]:
# checks for duplicate values
df.duplicated().sum()

0

In [44]:
df.replace({-1:0,1:1},inplace=True)
df.head()

Unnamed: 0,octamers,flags
0,SLNLRETN,1
1,AECFRIFD,1
2,HLVEALYL,1
3,TQIMFETF,1
4,AEELAEIF,1


In [45]:
df.head()

Unnamed: 0,octamers,flags
0,SLNLRETN,1
1,AECFRIFD,1
2,HLVEALYL,1
3,TQIMFETF,1
4,AEELAEIF,1


In [46]:
# Seperate all amino acids
octamers = np.array([[df["octamers"][i][j] for i in range(df.shape[0])] for j in range(8)])
print(octamers)


[['S' 'A' 'H' ... 'Q' 'D' 'S']
 ['L' 'E' 'L' ... 'A' 'S' 'K']
 ['N' 'C' 'V' ... 'I' 'A' 'D']
 ...
 ['E' 'I' 'L' ... 'A' 'E' 'A']
 ['T' 'F' 'Y' ... 'L' 'E' 'E']
 ['N' 'D' 'L' ... 'Q' 'D' 'I']]


In [47]:
# Store the seperated amino acids into a dataframe
df1=pd.DataFrame(octamers.T, columns=list('ABCDEFGH'))

In [48]:
df=pd.concat([df1,df],axis=1)
df.drop(columns=['octamers'],inplace=True)
df.head()

Unnamed: 0,A,B,C,D,E,F,G,H,flags
0,S,L,N,L,R,E,T,N,1
1,A,E,C,F,R,I,F,D,1
2,H,L,V,E,A,L,Y,L,1
3,T,Q,I,M,F,E,T,F,1
4,A,E,E,L,A,E,I,F,1


In [49]:
train,test=temp.split(df,0.2)

In [50]:
print("train",train.shape)
print("test",test.shape)

train (1300, 9)
test (325, 9)


In [51]:
# saving datasets into csv filesS
temp.save_csv(test,'test_data.csv')
temp.save_csv(train,'train_data.csv')


In [52]:
# loading training data csv file
train_df = temp.read_file('train_data.csv')
train_df.head()

Unnamed: 0,A,B,C,D,E,F,G,H,flags
0,Q,A,W,I,R,G,C,R,0
1,P,R,V,S,L,A,M,T,1
2,L,P,N,F,S,S,L,N,0
3,P,V,K,L,K,P,G,M,0
4,S,G,I,M,F,Q,S,A,1


In [53]:
# splitting training data into train and cross validation dataset 
train_data,cv_data=temp.split(train_df,0.2)

In [54]:
# saving cross validation data into csv file
temp.save_csv(cv_data,'cv_data.csv')

In [55]:
# separating features and labels of training dataset
x_train=train_data.iloc[:,:-1].values
y_train=train_data.iloc[:,8].values

In [56]:
x_train=pd.DataFrame(x_train)

In [57]:
x_train=temp.oneHotEncoding(x_train)

In [58]:
print(x_train.shape)

(1040, 160)


In [59]:
# feature scaling
sc=StandardScaler()
x_train=sc.fit_transform(x_train)
# test=sc.transform(test)

In [60]:
# fit model
classifier = KNeighborsClassifier(n_neighbors=5)  
classifier.fit(x_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [61]:
y_pred = classifier.predict(x_train)
df = pd.DataFrame({'Actual': y_train, 'Predicted': y_pred})  
df.head()

Unnamed: 0,Actual,Predicted
0,0,0
1,1,1
2,0,0
3,0,0
4,0,0


In [62]:
# making confusion matrix
cm= confusion_matrix(y_train,y_pred)
print(cm)

[[724  82]
 [ 14 220]]


In [63]:
# loading cross validation dataset file
cv_data = temp.read_file('cv_data.csv')
cv_data.head()

Unnamed: 0,A,B,C,D,E,F,G,H,flags
0,R,F,A,N,Y,I,D,K,0
1,T,S,A,A,S,S,S,N,0
2,S,G,A,F,M,T,R,G,1
3,E,E,S,R,I,S,L,P,0
4,Y,R,G,Y,S,L,G,N,0


In [64]:
# separating features and labels of training dataset
x_cv=cv_data.iloc[:,:-1].values
y_cv=cv_data.iloc[:,8].values
x_cv.shape

(260, 8)

In [65]:
x_cv=pd.DataFrame(x_cv)
x_cv.shape

(260, 8)

In [66]:
x_cv=temp.oneHotEncoding(x_cv)

In [67]:
x_cv.shape

(260, 160)

In [68]:
# feature scaling
x_cv=sc.fit_transform(x_cv)

In [69]:
class KNN:
    
    def get_predictions(self,x):        
        # getting prediction values
        y_pred = classifier.predict(x)
        return y_pred
    
    def create_confusion_matrix(self,y,y_pred):
        # making confusion matrix
        cm= confusion_matrix(y,y_pred)
        return cm
    
    def get_accuracy(self,y_train,y_pred):
        Accuracy = sklearn.metrics.balanced_accuracy_score(y_train,y_pred)*100
        return Accuracy
        
        
def main():
    # creates class object 
    obj = KNN()
    y_pred_train = obj.get_predictions(x_train)
    y_pred_test = obj.get_predictions(x_cv)
    
    cm_train=obj.create_confusion_matrix(y_train,y_pred_train)
    print("train matrix\n",cm_train)
    
    cm_cv=obj.create_confusion_matrix(y_cv,y_pred_test)
    print("cv matrix\n",cm_cv)
    
    acc_train = obj.get_accuracy(y_train,y_pred_train)
    print("Accuracy of train data =",acc_train)
    
    acc_test = obj.get_accuracy(y_cv,y_pred_test)
    print("Accuracy of test data =",acc_test)
    
    if acc_train >= 80 and acc_test >=  60:
        fileObject = open("train_data.pkl",'wb')
        pickle.dump(classifier,fileObject)   
        pickle.dump(sc,fileObject)
        # here we close the fileObject
        fileObject.close()

        
if __name__ == '__main__':
    main()

train matrix
 [[724  82]
 [ 14 220]]
cv matrix
 [[165  41]
 [  2  52]]
Accuracy of train data = 91.92169837331127
Accuracy of test data = 88.19669183746853
