In [1]:
# This small sample data set contains signature data collected from
# five users.  For each user, there are 20 genuine signatures and
# 20 skilled forgeries.

# Each genuine/forgery signature is stored in a separate text file.
# The file names are in the format "USERx_y.txt", where x (1..5)
# indicates the user and y (1..40) indicates one signature instance
# of the corresponding user, with the first 20 (1..20) representing
# genuine signatures and the rest (21..40) representing skilled
# forgeries provided by the other users. 

# In each text file, the signature is simply represented as a
# sequence of points.  The first line stores a single integer which
# is the total number of points in the signature.  Each of the
# subsequent lines corresponds to one point characterized by seven
# features listed in the following order:

#   X-coordinate  - scaled cursor position along the x-axis
#   Y-coordinate  - scaled cursor position along the y-axis
#   Time stamp    - system time at which the event was posted
#   Button status - current button status (0 for pen-up and
#                   1 for pen-down)
#   Azimuth       - clockwise rotation of cursor about the z-axis
#   Altitude      - angle upward toward the positive z-axis
#   Pressure      - adjusted state of the normal pressure

In [2]:
import pandas as pd
import numpy as np

In [3]:
file_name = ''

In [4]:
list_df = []
list_size = []

## COMBINING ALL THE FILES INTO ONE LIST

In [5]:
%%time
count = 0
for i in range(1,41):
    for j in range(1, 41):
        file_name = 'U'+str(i)+'S'+str(j)+'.txt'
                
        # Reading into a dataframe and appending it to a list
        df = pd.read_csv(file_name, delimiter=' ', names=['X', 'Y', 'TS', 'T', 'AZ', 'AL', 'P'], header=None, skiprows=1)
        list_df.append(df)
        
        # Creating list_size of storing number of rows in each file
        rows, cols = df.shape
        list_size.append(rows)

Wall time: 3.38 s


## CONVERTING INTO A LARGE DATAFRAME

##### ARRAY FORMAT

In [6]:
df_array = np.vstack(list_df)

##### CREATING A DATAFRAME

In [7]:
final_dataset = pd.DataFrame(df_array)
final_dataset.columns = ['X', 'Y', 'TS', 'T', 'AZ', 'AL', 'P']
final_dataset.head()

Unnamed: 0,X,Y,TS,T,AZ,AL,P
0,2933,5678,31275775,0,1550,710,439
1,2933,5678,31275785,1,1480,770,420
2,3001,5851,31275795,1,1350,830,433
3,3114,6139,31275805,1,1350,800,422
4,3217,6198,31275815,1,1350,800,415


## ATTRIBUTES CALCULATION

In [8]:
forgery = []

velocity_array = []
x_velocity_array = []
y_velocity_array = []
avg_pressure = []
avg_x = []
avg_y = []
pen_up = []

In [9]:
distance = 0
index = 0
count = 0
x_distance = 0
y_distance = 0
pressure = 0

temp = 0

In [10]:
%%time

for x in list_size:
    for i in range(x):
        j = i-1
        if j >= 0:
            distance += ((final_dataset['X'][count+j] - final_dataset['X'][count+i])**2 + (final_dataset['Y'][count+j] - final_dataset['Y'][count+i])**2)**0.5
            
            x_distance += abs(final_dataset['X'][count+j] - final_dataset['X'][count+i])
            
            y_distance += abs(final_dataset['Y'][count+j] - final_dataset['Y'][count+i])
            
            time = final_dataset['TS'][count+x-1] - final_dataset['TS'][count]
            
            pressure += final_dataset['P'][count+i]
            
    # pen-up
    pen_up.append(final_dataset['T'][count:count+x].sum())
    
    count += x
    
    
    # velocity calculation
    velocity_array.append(float(distance*1000/time))
    
    # x_velocity calculation
    x_velocity_array.append(float(x_distance*1000/time))
    
    # y_velocity calculation
    y_velocity_array.append(float(y_distance*1000/time))
    
    # avg_pressure calculation
    avg_pressure.append(float(pressure/x))
    
    # avg_x calculation
    avg_x.append(x_distance/x)

    # avg_y calculation
    avg_y.append(y_distance/x)
    
    
    
    
    
    
    
    # initialization
    distance = 0
    x_distance = 0
    y_distance = 0
    time = 0
    pressure = 0

Wall time: 58 s


In [11]:
print('velocity     :', velocity_array[:5], len(velocity_array))

print('x_velocity   :', x_velocity_array[:5], len(x_velocity_array))

print('y_velocity   :', y_velocity_array[:5], len(y_velocity_array))

print('avg_pressure :', avg_pressure[:5], len(avg_pressure))

print('avg_x :', avg_x[:5], len(avg_x))

print('avg_y :', avg_y[:5], len(avg_y))

print('pen-up :', pen_up[:5], len(pen_up))

velocity     : [22203.433517655238, 22930.26429846866, 24527.287240196525, 23457.56298187479, 21381.860569602373] 1600
x_velocity   : [11205.445544554455, 11497.48322147651, 12736.133122028526, 12059.068219633944, 11249.260355029586] 1600
y_velocity   : [16169.14191419142, 16750.0, 17482.56735340729, 16648.086522462563, 15363.905325443788] 1600
avg_pressure : [623.2380952380952, 594.5411764705882, 583.3111111111111, 603.8414634146342, 537.0860215053764] 1600
avg_x : [161.67857142857142, 161.23529411764707, 178.5888888888889, 176.76829268292684, 163.53763440860214] 1600
avg_y : [233.29761904761904, 234.89411764705883, 245.14444444444445, 244.03658536585365, 223.3548387096774] 1600
pen-up : [78, 79, 84, 76, 85] 1600


In [12]:
i=1
while(i<1601):
    for j in range(1,21):
        forgery.append(1)
    for j in range(1,21):
        forgery.append(0)
    i+=40

## COMBINE IT INTO A FEATURE VECTOR

In [13]:
fv_dictionary = {'velocity':velocity_array, 'x_velocity':x_velocity_array, 'y_velocity':y_velocity_array, 'avg_pressure':avg_pressure,
                 'avg_x':avg_x, 'avg_y':avg_y, 'pen_up':pen_up, 'forgery':forgery}

In [14]:
feature_vector = pd.DataFrame(fv_dictionary)

In [15]:
feature_vector.head()

Unnamed: 0,velocity,x_velocity,y_velocity,avg_pressure,avg_x,avg_y,pen_up,forgery
0,22203.433518,11205.445545,16169.141914,623.238095,161.678571,233.297619,78,1
1,22930.264298,11497.483221,16750.0,594.541176,161.235294,234.894118,79,1
2,24527.28724,12736.133122,17482.567353,583.311111,178.588889,245.144444,84,1
3,23457.562982,12059.06822,16648.086522,603.841463,176.768293,244.036585,76,1
4,21381.86057,11249.260355,15363.905325,537.086022,163.537634,223.354839,85,1


In [16]:
feature_vector.to_csv('feature_vector_original.csv', index=False)

## NORMALIZE THE FEATURE VECTOR

In [17]:
feature_vector = ((feature_vector - feature_vector.min())/(feature_vector.max() - feature_vector.min()))*3

In [18]:
feature_vector.head()

Unnamed: 0,velocity,x_velocity,y_velocity,avg_pressure,avg_x,avg_y,pen_up,forgery
0,2.468348,2.207477,2.683657,1.831666,2.362836,2.839467,0.014925,3.0
1,2.560082,2.273419,2.79151,1.728427,2.355562,2.8611,0.018657,3.0
2,2.761645,2.553106,2.927532,1.688027,2.640318,3.0,0.037313,3.0
3,2.626633,2.400225,2.772587,1.761885,2.610444,2.984988,0.007463,3.0
4,2.364656,2.21737,2.534142,1.52173,2.393341,2.704735,0.041045,3.0


In [19]:
feature_vector.to_csv('feature_vector_normalized.csv', index=False)

## TRAIN DATA

In [20]:
train_data_count = int(0.75*1600)
train_df = feature_vector[:train_data_count]

## TEST DATA

In [21]:
test_data_count = int(0.25*1600)
test_df = feature_vector[:test_data_count]

In [22]:
x = train_df[['velocity', 'x_velocity', 'y_velocity', 'avg_pressure', 'avg_x', 'avg_y', 'pen_up']]
x.head()

Unnamed: 0,velocity,x_velocity,y_velocity,avg_pressure,avg_x,avg_y,pen_up
0,2.468348,2.207477,2.683657,1.831666,2.362836,2.839467,0.014925
1,2.560082,2.273419,2.79151,1.728427,2.355562,2.8611,0.018657
2,2.761645,2.553106,2.927532,1.688027,2.640318,3.0,0.037313
3,2.626633,2.400225,2.772587,1.761885,2.610444,2.984988,0.007463
4,2.364656,2.21737,2.534142,1.52173,2.393341,2.704735,0.041045


In [23]:
y = train_df['forgery']
y.head()

0    3.0
1    3.0
2    3.0
3    3.0
4    3.0
Name: forgery, dtype: float64

In [24]:
print(x.shape, y.shape)

(1200, 7) (1200,)


# TRAINING AND TESTING MODEL

# K NEIGHBOURS ALGORITHM

## KNN 1

In [25]:
from sklearn.neighbors import KNeighborsClassifier

In [26]:
knn = KNeighborsClassifier(n_neighbors=1)

In [27]:
knn.fit(x,y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=1, p=2,
           weights='uniform')

In [29]:
result = knn.predict((test_df[['velocity', 'x_velocity', 'y_velocity', 'avg_pressure', 'avg_x', 'avg_y', 'pen_up']]).to_numpy())

error_count = 0

for i in range(len(result)):
    if result[i]!=test_df['forgery'][j]:
        error_count += 1
        
accuracy_knn_1 = (test_data_count - error_count) / test_data_count

print('Accuracy of the model using knn 1 neighbour: ', accuracy_knn_1, error_count)

Accuracy of the model using knn 1 neighbour:  0.5 200


## KNN 5

In [31]:
knn5 = KNeighborsClassifier(n_neighbors=5)

In [32]:
knn5.fit(x,y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [33]:
result = knn5.predict((test_df[['velocity', 'x_velocity', 'y_velocity', 'avg_pressure', 'avg_x', 'avg_y', 'pen_up']]).to_numpy())

error_count = 0

for i in range(len(result)):
    if result[i]!=test_df['forgery'][j]:
        error_count += 1
        
accuracy_knn_5 = (test_data_count - error_count) / test_data_count

print('Accuracy of the model using knn 1 neighbour: ', accuracy_knn_5, error_count)

Accuracy of the model using knn 1 neighbour:  0.455 218


# LOGISTIC REGRESSION

In [34]:
from sklearn.linear_model import LogisticRegression

In [35]:
logreg = LogisticRegression()

In [36]:
logreg.fit(x,y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [37]:
result_logi = logreg.predict((test_df[['velocity', 'x_velocity', 'y_velocity', 'avg_pressure', 'avg_x', 'avg_y', 'pen_up']]).to_numpy())

In [38]:
error_count = 0

for i in range(len(result_logi)):
    if result_logi[i]!=test_df['forgery'][j]:
        error_count += 1
#         print(result[i], test_df['forgery'][i])
        
accuracy_logi = (test_data_count - error_count) / test_data_count

print('Accuracy of the model using logistic regresion: ', accuracy_logi, error_count)

Accuracy of the model using logistic regresion:  0.3875 245
