In [49]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from collections import defaultdict
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
import sklearn
import importlib.util
import pickle

In [50]:
# Load the project-local helper module directly from its file path, so the
# notebook does not rely on it being installed or reachable through sys.path.
# NOTE(review): this is a machine-specific absolute path; point TEMPLATE_PATH
# at your own checkout before running the notebook elsewhere.
TEMPLATE_PATH = "/home/admin1/PycharmProjects/ML/Week10/Template/template.py"

spec = importlib.util.spec_from_file_location("Template", TEMPLATE_PATH)
# spec_from_file_location returns None when no loader can be determined;
# fail with a clear message instead of an AttributeError further down.
if spec is None or spec.loader is None:
    raise ImportError("Cannot load Template module from " + TEMPLATE_PATH)
foo = importlib.util.module_from_spec(spec)
spec.loader.exec_module(foo)
# Helper object whose methods (split, save_csv, read_file, ...) are used below
temp = foo.Template()

In [51]:
# Read the raw data set. Column names are supplied explicitly through `names`,
# so the first line of the file is read as data rather than as a header.
df=pd.read_csv('Data/1625Data.txt',names=['octamers','flags'])
df.head()

Unnamed: 0,octamers,flags
0,SLNLRETN,1
1,AECFRIFD,1
2,HLVEALYL,1
3,TQIMFETF,1
4,AEELAEIF,1


In [52]:
# (rows, columns) of the raw frame
df.shape

(1625, 2)

In [53]:
# Column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1625 entries, 0 to 1624
Data columns (total 2 columns):
octamers    1625 non-null object
flags       1625 non-null int64
dtypes: int64(1), object(1)
memory usage: 25.5+ KB


In [54]:
# Summary statistics of the numeric column(s)
df.describe()

Unnamed: 0,flags
count,1625.0
mean,-0.538462
std,0.842909
min,-1.0
25%,-1.0
50%,-1.0
75%,-1.0
max,1.0


In [55]:
# Count missing values per column
df.isnull().sum()

octamers    0
flags       0
dtype: int64

In [56]:
# Count rows that are exact duplicates of an earlier row
df.duplicated().sum()

0

In [57]:
# Recode the class label from {-1, 1} to {0, 1}.
# The replacement is restricted to the `flags` column so no other column can
# be touched by accident, and the result is assigned instead of using
# inplace=True.
df["flags"] = df["flags"].replace({-1: 0})
df.head()

Unnamed: 0,octamers,flags
0,SLNLRETN,1
1,AECFRIFD,1
2,HLVEALYL,1
3,TQIMFETF,1
4,AEELAEIF,1


In [58]:
# Preview the first rows of the frame again
df.head()

Unnamed: 0,octamers,flags
0,SLNLRETN,1
1,AECFRIFD,1
2,HLVEALYL,1
3,TQIMFETF,1
4,AEELAEIF,1


In [59]:
# Separate every octamer into its individual amino-acid letters.
OCTAMER_LENGTH = 8

# Every sequence must have exactly OCTAMER_LENGTH residues; otherwise the
# per-position columns built from this array would be misaligned.
assert (df["octamers"].str.len() == OCTAMER_LENGTH).all(), "unexpected octamer length"

# Iterate over the values themselves rather than indexing by label, so the
# result does not depend on the frame having a 0..n-1 index. The transpose
# keeps the (position, sample) layout that later cells expect.
octamers = np.array([list(sequence) for sequence in df["octamers"]]).T
print(octamers)


[['S' 'A' 'H' ... 'Q' 'D' 'S']
 ['L' 'E' 'L' ... 'A' 'S' 'K']
 ['N' 'C' 'V' ... 'I' 'A' 'D']
 ...
 ['E' 'I' 'L' ... 'A' 'E' 'A']
 ['T' 'F' 'Y' ... 'L' 'E' 'E']
 ['N' 'D' 'L' ... 'Q' 'D' 'I']]


In [60]:
# Store the separated amino acids in a DataFrame: one row per octamer and one
# column (A-H) per residue position.
df1=pd.DataFrame(octamers.T, columns=list('ABCDEFGH'))
# NOTE(review): this prints the whole frame; df1.head() would keep the output short.
print(df1)

      A  B  C  D  E  F  G  H
0     S  L  N  L  R  E  T  N
1     A  E  C  F  R  I  F  D
2     H  L  V  E  A  L  Y  L
3     T  Q  I  M  F  E  T  F
4     A  E  E  L  A  E  I  F
5     P  F  I  F  E  E  E  P
6     P  I  V  G  A  E  T  F
7     E  T  T  A  L  V  C  D
8     G  G  V  Y  A  T  R  S
9     D  A  I  N  T  E  F  K
10    D  D  L  F  F  E  A  D
11    S  F  I  G  M  E  S  A
12    P  T  L  L  T  E  A  P
13    D  Q  I  L  I  E  I  C
14    A  Q  T  F  Y  V  N  L
15    Q  I  T  L  W  Q  R  P
16    G  S  H  L  V  E  A  L
17    R  E  A  F  R  V  F  D
18    Y  E  E  F  V  Q  M  M
19    G  Q  V  N  Y  E  E  F
20    L  P  V  N  G  E  F  S
21    E  L  E  F  P  E  G  G
22    V  E  V  A  E  E  E  E
23    D  T  V  L  E  E  M  S
24    G  D  A  L  L  E  R  N
25    A  A  K  F  E  R  Q  H
26    M  D  S  S  T  S  A  A
27    S  S  N  Y  C  N  Q  M
28    T  P  G  S  R  N  L  C
29    G  S  S  K  Y  P  N  C
...  .. .. .. .. .. .. .. ..
1595  P  S  I  N  N  E  T  P
1596  P  D  I  V  I  Y  Q  Y
1597  W  Y  Q 

In [61]:
# Attach the label column to the per-position residue columns.
# Only `flags` is taken from the old frame, so the raw `octamers` column is
# left behind without an in-place drop. This also makes the cell safe to
# re-run: a second execution yields the same nine columns instead of
# duplicating A-H and failing on the missing `octamers` column.
df = pd.concat([df1, df[["flags"]]], axis=1)
df.head()

Unnamed: 0,A,B,C,D,E,F,G,H,flags
0,S,L,N,L,R,E,T,N,1
1,A,E,C,F,R,I,F,D,1
2,H,L,V,E,A,L,Y,L,1
3,T,Q,I,M,F,E,T,F,1
4,A,E,E,L,A,E,I,F,1


In [62]:
# Split the data into a training and a test partition with the Template helper.
# NOTE(review): 0.3 appears to be the test fraction (488 of 1625 rows land in
# `test`) — confirm against template.py, including whether the split is seeded.
train,test=temp.split(df,0.3)

In [63]:
# Report the size of both partitions
print("train",train.shape)
print("test",test.shape)

train (1137, 9)
test (488, 9)


In [64]:
# Save both partitions to CSV files
temp.save_csv(test,'test_data.csv')
temp.save_csv(train,'train_data.csv')


In [65]:
# Load the training partition back from the CSV file written above
train_df = temp.read_file('train_data.csv')
train_df.head()

Unnamed: 0,A,B,C,D,E,F,G,H,flags
0,T,A,S,R,P,S,S,S,0
1,L,K,K,K,K,S,V,T,0
2,T,D,G,S,T,D,Y,G,0
3,I,H,P,I,S,P,I,E,0
4,W,M,V,H,S,L,V,P,0


In [66]:
# Split the training partition again into a training and a cross-validation set.
# NOTE(review): 0.2 appears to be the cross-validation fraction (228 of 1137
# rows) — confirm against template.py.
train_data,cv_data=temp.split(train_df,0.2)

In [67]:
# Save the cross-validation set to a CSV file; it is read back from there
# before the model is evaluated
temp.save_csv(cv_data,'cv_data.csv')

In [68]:
# Separate features (every column but the last) from labels (the last column).
# Both slices are expressed relative to the end of the frame so they stay
# consistent with each other instead of mixing `:-1` with a hard-coded 8.
x_train = train_data.iloc[:, :-1].values
y_train = train_data.iloc[:, -1].values

In [69]:
# Wrap the feature array in a DataFrame (default integer column labels) so it
# can be processed column by column in the encoding step
x_train=pd.DataFrame(x_train)

In [70]:
# One LabelEncoder per column, created on first access by column name.
d = defaultdict(LabelEncoder)

# Integer-encode every column; the fitted encoders stay available in `d`.
# NOTE: the one-hot step below works on the raw letters in `x_train`, not on
# `fit`. The former inverse_transform/transform round-trip calls were removed
# because their results were discarded and they had no other effect.
fit = x_train.apply(lambda x: d[x.name].fit_transform(x))

# One-hot encode the residue letters. The fitted encoder is kept in
# `one_hot_encode` so other data can be transformed to the same column layout.
one_hot_encode = OneHotEncoder()
x_train = one_hot_encode.fit_transform(x_train).toarray()


In [71]:
# Shape of the one-hot encoded training matrix: (samples, encoded columns)
print(x_train.shape)

(909, 160)


In [72]:
# Feature scaling: fit the scaler on the training features and scale them
sc=StandardScaler()
x_train=sc.fit_transform(x_train)
# Held-out data should be scaled with sc.transform(...) so that it reuses the
# statistics learned here.

In [73]:
# Fit a random forest of 20 trees that uses entropy as the split criterion;
# random_state is fixed so repeated fits on the same data give the same model

classifier = RandomForestClassifier(n_estimators=20,criterion='entropy',random_state=0)
classifier.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [74]:
# Predict on the data the model was trained on and line the predictions up
# with the true labels. The table gets its own name so the data set held in
# `df` is not overwritten by this comparison.
y_pred = classifier.predict(x_train)
train_pred_df = pd.DataFrame({'Actual': y_train, 'Predicted': y_pred})
train_pred_df.head()

Unnamed: 0,Actual,Predicted
0,1,1
1,1,1
2,0,0
3,0,0
4,0,0


In [75]:
# Load the cross-validation set back from its CSV file
cv_data = temp.read_file('cv_data.csv')
cv_data.head()

Unnamed: 0,A,B,C,D,E,F,G,H,flags
0,P,E,N,P,Y,N,T,P,0
1,A,S,R,P,S,S,S,R,0
2,R,K,V,E,S,L,Q,E,0
3,A,E,N,R,E,I,L,K,0
4,W,W,T,E,Y,W,Q,A,0


In [76]:
# Confusion matrix of the training-set predictions (y_pred was computed from
# x_train in an earlier cell, so this does not measure held-out performance)
cm= confusion_matrix(y_train,y_pred)
print(cm)

[[694   0]
 [  0 215]]


In [77]:
# Separate features and labels of the cross-validation data set.
# The label is addressed as "last column" to match the feature slice rather
# than through a hard-coded position.
x_cv = cv_data.iloc[:, :-1].values
y_cv = cv_data.iloc[:, -1].values
x_cv.shape

(228, 8)

In [78]:
# Wrap the feature array in a DataFrame (default integer column labels), the
# same form the training features had before encoding
x_cv=pd.DataFrame(x_cv)
x_cv.shape

(228, 8)

In [79]:

# Encode the cross-validation features with the encoder that was fitted on
# the training data, so both matrices are guaranteed to share the same column
# layout. Encoding this set independently only lines up with the training
# columns if every residue happens to occur at every position in both sets.
x_cv = one_hot_encode.transform(x_cv).toarray()

In [80]:
# Shape of the encoded cross-validation matrix; the column count has to match
# the encoded training matrix
x_cv.shape

(228, 160)

In [81]:
# Feature scaling: reuse the statistics learned from the training data.
# Refitting the scaler here would scale the validation set differently from
# the data the model was trained on, and would also overwrite the fitted
# scaler that main() pickles next to the classifier.
x_cv = sc.transform(x_cv)

In [82]:
class RandomForest:
    """Convenience wrapper for predicting with and evaluating a fitted classifier.

    Parameters
    ----------
    model : object with a ``predict`` method, optional
        Estimator to use. When omitted, the notebook-level ``classifier`` is
        looked up at call time, so ``RandomForest()`` keeps working as before.
    """

    def __init__(self, model=None):
        self._model = model

    def _estimator(self):
        """Return the injected estimator, or the notebook-level ``classifier``."""
        # Resolved lazily so an instance created before the global model is
        # (re)fitted still picks up the current one.
        return classifier if self._model is None else self._model

    def get_predictions(self,x):
        """Return the estimator's predictions for the feature matrix ``x``."""
        return self._estimator().predict(x)

    def create_confusion_matrix(self,y,y_pred):
        """Return the confusion matrix of the true labels ``y`` against ``y_pred``."""
        return confusion_matrix(y,y_pred)

    def get_accuracy(self,y_train,y_pred):
        """Return the balanced accuracy of ``y_pred`` against ``y_train`` in percent.

        Note that this is ``balanced_accuracy_score``, not plain accuracy.
        """
        return sklearn.metrics.balanced_accuracy_score(y_train,y_pred)*100

    def visualize(self,y_pred,x,y):
        """Plot the decision regions over the first two columns of ``x``.

        ``x`` is indexed as a 2-D array and ``y`` holds the labels used to
        colour the scatter points; ``y_pred`` is accepted but not used.

        NOTE(review): ``plt`` and ``ListedColormap`` are not imported anywhere
        in this notebook, so calling this method raises NameError until they
        are. The estimator is also asked to predict on two-column input, which
        only works for a model fitted on exactly two features. The title and
        axis labels look carried over from a different data set — confirm.
        """
        step = 0.01
        first_axis = np.arange(start=x[:,0].min()-1,stop=x[:,0].max()+1,step=step)
        second_axis = np.arange(start=x[:,1].min()-1,stop=x[:,1].max()+1,step=step)
        x1,x2=np.meshgrid(first_axis,second_axis)

        # Classify every grid point and draw the result as filled regions
        grid_points = np.array([x1.ravel(),x2.ravel()]).T
        regions = self._estimator().predict(grid_points).reshape(x1.shape)
        plt.contourf(x1,x2,regions,alpha=0.75,cmap=ListedColormap(('red','green')))
        plt.xlim(x1.min(),x1.max())
        plt.ylim(x2.min(),x2.max())

        # Overlay the samples, one colour per class label
        for i,j in enumerate(np.unique(y)):
            plt.scatter(x[y==j,0],x[y==j,1],c=ListedColormap(('red','green'))(i),label=j)

        plt.title('predict user will click the ad or not(train dataset)')
        plt.xlabel('Age')
        plt.ylabel('estimated salary')
        plt.show()
        
def main():
    """Evaluate the fitted model on the training and cross-validation data.

    Prints both confusion matrices and both scores returned by
    ``RandomForest.get_accuracy``. When the training score is at least 80 and
    the cross-validation score at least 60, the classifier followed by the
    scaler is pickled into ``train_data.pkl``.
    """
    # creates class object
    obj = RandomForest()
    y_pred_train = obj.get_predictions(x_train)
    # "test" in the names and messages below refers to the cross-validation set
    y_pred_test = obj.get_predictions(x_cv)

    cm_train = obj.create_confusion_matrix(y_train, y_pred_train)
    print("train matrix\n",cm_train)

    cm_cv = obj.create_confusion_matrix(y_cv, y_pred_test)
    print("cv matrix\n",cm_cv)

    acc_train = obj.get_accuracy(y_train, y_pred_train)
    print("Accuracy of train data =",acc_train)

    acc_test = obj.get_accuracy(y_cv, y_pred_test)
    print("Accuracy of test data =",acc_test)

    if acc_train >= 80 and acc_test >= 60:
        # The context manager closes the file even if pickling raises.
        # Two objects are written back to back, so a reader has to call
        # pickle.load twice, in the same order.
        with open("train_data.pkl", 'wb') as fileObject:
            pickle.dump(classifier, fileObject)
            pickle.dump(sc, fileObject)

        
# Run the evaluation when this code is executed as the main program
if __name__ == '__main__':
    main()

train matrix
 [[694   0]
 [  0 215]]
cv matrix
 [[174   4]
 [ 10  40]]
Accuracy of train data = 100.0
Accuracy of test data = 88.87640449438202
