# BANK NOTES CLASSIFICATION

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# LOADING DATASET

In [2]:
df = pd.read_csv("data_banknote_authentication.txt")

In [3]:
df.head()

Unnamed: 0,3.6216,8.6661,-2.8073,-0.44699,0
0,4.5459,8.1674,-2.4586,-1.4621,0
1,3.866,-2.6383,1.9242,0.10645,0
2,3.4566,9.5228,-4.0112,-3.5944,0
3,0.32924,-4.4552,4.5718,-0.9888,0
4,4.3684,9.6718,-3.9606,-3.1625,0


# EDA

In [4]:
# here data is not in regular format so edit it and make it more meaningfull
df.columns=["variance","skewness","kurtosis","entropy","class"]

In [5]:
df.head()

Unnamed: 0,variance,skewness,kurtosis,entropy,class
0,4.5459,8.1674,-2.4586,-1.4621,0
1,3.866,-2.6383,1.9242,0.10645,0
2,3.4566,9.5228,-4.0112,-3.5944,0
3,0.32924,-4.4552,4.5718,-0.9888,0
4,4.3684,9.6718,-3.9606,-3.1625,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1371 entries, 0 to 1370
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   variance  1371 non-null   float64
 1   skewness  1371 non-null   float64
 2   kurtosis  1371 non-null   float64
 3   entropy   1371 non-null   float64
 4   class     1371 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 53.7 KB


In [7]:
df.describe()

Unnamed: 0,variance,skewness,kurtosis,entropy,class
count,1371.0,1371.0,1371.0,1371.0,1371.0
mean,0.43141,1.917434,1.400694,-1.1922,0.444931
std,2.842494,5.868359,4.310105,2.101683,0.497139
min,-7.0421,-13.7731,-5.2861,-8.5482,0.0
25%,-1.7747,-1.7113,-1.55335,-2.417,0.0
50%,0.49571,2.3134,0.61663,-0.58665,0.0
75%,2.81465,6.8131,3.1816,0.39481,1.0
max,6.8248,12.9516,17.9274,2.4495,1.0


In [8]:
df.corr()

Unnamed: 0,variance,skewness,kurtosis,entropy,class
variance,1.0,0.263333,-0.380358,0.276666,-0.724655
skewness,0.263333,1.0,-0.786729,-0.526896,-0.444281
kurtosis,-0.380358,-0.786729,1.0,0.319219,0.155346
entropy,0.276666,-0.526896,0.319219,1.0,-0.0232
class,-0.724655,-0.444281,0.155346,-0.0232,1.0


In [9]:
df.drop_duplicates(inplace=True)

In [10]:
df.isnull().sum()

variance    0
skewness    0
kurtosis    0
entropy     0
class       0
dtype: int64

In [11]:
# We can see that ther are no missing values. All features are of numerical values, so no need for data conversions.


In [12]:
df["class"].value_counts()

0    737
1    610
Name: class, dtype: int64

In [13]:
# We can see that there are 737 forged notes and 610 original ones

In [14]:
#split the data
x=df.iloc[:,:-1]
y=df.iloc[:,-1]

In [15]:
x

Unnamed: 0,variance,skewness,kurtosis,entropy
0,4.54590,8.16740,-2.4586,-1.46210
1,3.86600,-2.63830,1.9242,0.10645
2,3.45660,9.52280,-4.0112,-3.59440
3,0.32924,-4.45520,4.5718,-0.98880
4,4.36840,9.67180,-3.9606,-3.16250
...,...,...,...,...
1366,0.40614,1.34920,-1.4501,-0.55949
1367,-1.38870,-4.87730,6.4774,0.34179
1368,-3.75030,-13.45860,17.5932,-2.77710
1369,-3.56370,-8.38270,12.3930,-1.28230


In [16]:
y

0       0
1       0
2       0
3       0
4       0
       ..
1366    1
1367    1
1368    1
1369    1
1370    1
Name: class, Length: 1347, dtype: int64

In [17]:
x.shape

(1347, 4)

In [18]:
y.shape

(1347,)

# MODEL BUILDING

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

# MODEL BUILDING USING LGISTIC REGRESSION

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score
from sklearn.metrics import confusion_matrix

In [34]:
LR = LogisticRegression()

In [35]:
LR.fit(x_train, y_train)

LogisticRegression()

In [36]:
#prediction

In [37]:
y_pred_LR = LR.predict(x_test)

In [38]:
r2_score(y_test, y_pred_LR)

0.9761854285916189

In [40]:
confusion_matrix(y_test, y_pred_LR)

array([[176,   2],
       [  0, 159]], dtype=int64)