**Importing all the necessary Libraries**

In [4]:
import pandas as pd
import numpy as np

**Create a dataframe of the dataset**

In [5]:
df=pd.read_csv(r"C:\Users\omen\Documents\PythonCodes\Sem 6\Tests\Creditcard_data.csv")
X=df.drop(columns=['Class'])
Y=df['Class']
print(X.shape)
print(Y.shape)

(772, 30)
(772,)


**Checking Whether there are any null values or not**

In [6]:
df.isnull().values.any()

False

**Checking the balance of the dataset**

In [7]:
n1 = df[df['Class']==1].count().max()
n0 = df[df['Class']==0].count().max()
print(n1," objects belong to class 1")
print(n0," objects belong to class 0")

9  objects belong to class 1
763  objects belong to class 0


*Here, the dataset is imbalanced because there are only 9 data objects belonging to class 1 while the rest 763 belong to class 0*

**Splitting the dataset into two different sets of the individual classes**

In [8]:
fraud = df[df['Class']==1]
normal = df[df['Class']==0]

*To fix this imbalance, we use the techniques of undersampling and oversampling to create a balanced dataset*

In [9]:
# Undersampling

from imblearn.under_sampling import NearMiss

nm = NearMiss()
X1_res,Y1_res = nm.fit_resample(X,Y)

In [10]:
# Oversampling

from imblearn.over_sampling import RandomOverSampler

ROS = RandomOverSampler()
X2_res,Y2_res = ROS.fit_resample(X,Y)

In [11]:
# Error = 0.05 for simple random sampling
n = (1.96*1.96*0.5*0.5)//(0.05**2)

In [12]:
df2=X2_res
df2['Class']=Y2_res
df2

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1521,118,1.254914,0.350287,0.302488,0.693114,-0.371470,-1.070256,0.086781,-0.202836,0.035154,...,-0.287592,-0.832682,0.128083,0.339427,0.215944,0.094704,-0.023354,0.030892,2.69,1
1522,539,-1.738582,0.052740,1.187057,-0.656652,0.920623,-0.291788,0.269083,0.140631,0.023464,...,-0.179545,-0.192036,-0.261879,-0.237477,-0.335040,0.240323,-0.345129,-0.383563,1.00,1
1523,484,-0.928088,0.398194,1.741131,0.182673,0.966387,-0.901004,0.879016,-0.156590,-0.142117,...,0.066353,0.281378,-0.257966,0.385384,0.391117,-0.453853,-0.104448,-0.125765,1.00,1
1524,574,1.257719,0.364739,0.306923,0.690638,-0.357792,-1.067481,0.094272,-0.210300,0.014455,...,-0.286856,-0.820658,0.127663,0.343128,0.221120,0.094391,-0.022189,0.030944,1.29,1


**Simple Random Sampling**

In [13]:
s1 = df2.sample(n=int(n), random_state=42)
s1

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
1439,539,-1.738582,0.052740,1.187057,-0.656652,0.920623,-0.291788,0.269083,0.140631,0.023464,...,-0.179545,-0.192036,-0.261879,-0.237477,-0.335040,0.240323,-0.345129,-0.383563,1.00,1
76,49,-0.549626,0.418949,1.729833,0.203065,-0.187012,0.253878,0.500894,0.251256,-0.227985,...,0.115062,0.418529,-0.065133,0.264981,0.003958,0.395969,0.027182,0.043506,59.99,0
1010,164,0.073497,0.551033,0.451890,0.114964,0.822947,0.251480,0.296319,0.139497,-0.123050,...,-0.128758,-0.381932,0.151012,-1.363967,-1.389079,0.075412,0.231750,0.230171,0.99,1
660,499,1.255439,0.307729,0.292700,0.699873,-0.428876,-1.088456,0.043840,-0.167739,0.128854,...,-0.294795,-0.882126,0.136846,0.327949,0.194459,0.096516,-0.027271,0.029491,1.98,0
1132,574,1.257719,0.364739,0.306923,0.690638,-0.357792,-1.067481,0.094272,-0.210300,0.014455,...,-0.286856,-0.820658,0.127663,0.343128,0.221120,0.094391,-0.022189,0.030944,1.29,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1481,0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,1
756,564,-0.203837,0.532747,-0.339857,-0.730934,2.728163,3.535882,0.263680,0.919169,-0.194501,...,-0.082087,-0.271636,-0.157778,0.989458,0.228821,-0.545156,0.058120,0.035573,12.90,0
1074,484,-0.928088,0.398194,1.741131,0.182673,0.966387,-0.901004,0.879016,-0.156590,-0.142117,...,0.066353,0.281378,-0.257966,0.385384,0.391117,-0.453853,-0.104448,-0.125765,1.00,1
867,472,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,...,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764,529.00,1


**Bootstrap Sampling**

In [14]:
# Oversampling
s2 = df2.sample(n=int(n), replace=True, random_state=42)

s2


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
1126,118,1.254914,0.350287,0.302488,0.693114,-0.371470,-1.070256,0.086781,-0.202836,0.035154,...,-0.287592,-0.832682,0.128083,0.339427,0.215944,0.094704,-0.023354,0.030892,2.69,1
1459,529,-2.000567,-2.495484,2.467149,1.140053,2.462010,0.594262,-2.110183,0.788347,0.958809,...,0.422452,1.195394,0.297836,-0.857105,-0.219322,0.861019,-0.124622,-0.171060,1.50,1
860,0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,1
1294,539,-1.738582,0.052740,1.187057,-0.656652,0.920623,-0.291788,0.269083,0.140631,0.023464,...,-0.179545,-0.192036,-0.261879,-0.237477,-0.335040,0.240323,-0.345129,-0.383563,1.00,1
1130,406,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,...,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.177840,0.261145,-0.143276,0.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
565,423,1.174846,-0.097105,-0.361083,0.914490,0.594571,1.106664,-0.114835,0.307650,0.347846,...,-0.157769,-0.388798,-0.303408,-1.737288,0.805329,-0.221755,0.018558,-0.008570,43.34,0
569,426,-0.424126,0.943262,1.133354,-0.166338,0.387243,-0.030382,0.466045,0.275041,-0.527932,...,-0.211158,-0.582126,-0.021227,-0.379104,-0.241854,0.106796,0.251811,0.080462,4.95,0
1346,472,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,...,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764,529.00,1
685,517,-0.639474,-0.048355,2.452755,0.310804,-0.430963,-0.290032,0.166889,0.006196,0.651675,...,0.004189,0.110847,0.057008,0.389171,-0.449642,0.218186,-0.067664,-0.073760,59.90,0


**Stratified Sampling**

In [15]:
z=1.96
p=0.5
E=0.05
S=0.7
sample_size=round((z**2*p*(1-p))/((E/S)**2))
s3=df2.groupby('Class', group_keys=False).apply(lambda x: x.sample(sample_size))
s3['Class'].value_counts()
s3

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
60,41,0.986063,-0.202965,-0.492768,0.407691,0.305660,-0.230529,0.585028,-0.208225,-0.247503,...,-0.305874,-1.216555,-0.077602,-0.741341,0.286881,0.200347,-0.075203,0.027271,169.05,0
596,448,-0.242497,0.891170,1.164787,1.025392,0.729123,0.663501,0.760560,-0.004531,0.037370,...,-0.161343,0.047255,-0.302695,-0.931315,-0.008611,-0.187111,0.259246,-0.030635,11.99,0
362,266,-2.564961,2.470985,2.649417,-1.564256,1.794297,-0.614742,4.185906,-3.855359,5.436633,...,-1.672706,-0.463149,-0.532466,0.306494,0.226844,-0.365416,-0.936735,-2.733887,10.35,0
615,463,-0.634597,0.866354,1.123836,-0.304041,1.173910,0.257118,0.754325,0.171735,-0.825981,...,0.160463,0.422873,-0.353721,-0.847192,0.206896,-0.436112,0.110134,0.102261,1.00,0
500,368,-0.409900,1.183088,1.598967,0.353088,0.309710,-0.312400,0.707197,-0.043206,-0.892869,...,-0.163371,-0.396155,-0.069498,0.069735,-0.298407,0.199188,0.099692,0.118617,1.98,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1126,118,1.254914,0.350287,0.302488,0.693114,-0.371470,-1.070256,0.086781,-0.202836,0.035154,...,-0.287592,-0.832682,0.128083,0.339427,0.215944,0.094704,-0.023354,0.030892,2.69,1
1072,118,1.254914,0.350287,0.302488,0.693114,-0.371470,-1.070256,0.086781,-0.202836,0.035154,...,-0.287592,-0.832682,0.128083,0.339427,0.215944,0.094704,-0.023354,0.030892,2.69,1
1507,0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,1
1362,529,-2.000567,-2.495484,2.467149,1.140053,2.462010,0.594262,-2.110183,0.788347,0.958809,...,0.422452,1.195394,0.297836,-0.857105,-0.219322,0.861019,-0.124622,-0.171060,1.50,1


**Systematic Sampling**

In [16]:
n=len(df)
k=int(n**0.5)
s4=df2.iloc[::k]
s4

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
27,23,1.322707,-0.174041,0.434555,0.576038,-0.836758,-0.831083,-0.264905,-0.220982,-1.071425,...,-0.284376,-0.323357,-0.03771,0.347151,0.559639,-0.280158,0.042335,0.028822,16.0,0
54,37,1.295668,0.341483,0.081505,0.566746,-0.110459,-0.766325,0.073155,-0.168304,0.071837,...,-0.323607,-0.929781,0.063809,-0.193565,0.287574,0.127881,-0.023731,0.0252,0.99,0
81,52,1.147369,0.059035,0.263632,1.211023,-0.044096,0.301067,-0.13296,0.227885,0.252191,...,-0.087813,-0.110756,-0.097771,-0.323374,0.633279,-0.305328,0.027394,-0.00058,6.67,0
108,73,1.162281,1.248178,-1.581317,1.475024,1.138357,-1.020373,0.638387,-0.136762,-0.805505,...,-0.124012,-0.22715,-0.199185,-0.289757,0.776244,-0.28395,0.056747,0.084706,1.0,0
135,84,1.119272,-0.669639,0.803807,-0.651693,-1.395666,-0.800698,-0.601605,0.01439,2.019905,...,0.163687,0.546516,-0.176836,0.402556,0.563402,-0.534236,0.075047,0.042001,67.3,0
162,103,-0.940893,1.074155,1.759398,-0.601446,0.101693,-0.18852,0.455756,-3.460682,0.441525,...,2.270069,-0.143518,0.153908,0.700927,-0.413235,1.374031,-0.996161,-0.836301,9.99,0
189,124,-1.710935,-1.366799,2.217311,0.404714,-0.114375,-0.075942,-0.259943,0.320897,-0.175355,...,0.390634,0.481111,0.405839,0.066433,0.156732,1.286201,-0.093975,0.098826,230.0,0
216,142,1.288256,0.085828,-1.179482,0.064357,2.195225,3.383363,-0.448437,0.799347,-0.147006,...,0.017485,-0.051355,-0.14548,1.007613,0.833293,-0.265485,0.020539,0.015394,4.9,0
243,164,-0.433211,1.020835,2.01973,3.003261,0.031308,0.187063,0.850856,-0.143932,-0.918043,...,-0.177298,-0.18026,0.00776,0.382658,-0.187193,0.100067,0.204039,-0.01815,65.26,0


**Cluster Sampling**

In [17]:
z=1.96
p=0.5
E=0.05
C=1.7
sample_size=round((z**2*p*(1-p))/((E/C)**2))
clusters=2
df_new=df2
N = len(df2)
K = int(N/sample_size)
data = None
for k in range(K):
    sample_k = df_new.sample(sample_size)
    sample_k["cluster"] = np.repeat(k,len(sample_k))
    df_new = df_new.drop(index = sample_k.index)
    data = pd.concat([data,sample_k],axis = 0)

random_chosen_clusters = np.random.randint(0,K,size = clusters)
s5 = data[data.cluster.isin(random_chosen_clusters)]
s5.drop(['cluster'], axis=1, inplace=True)
s5

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
1463,118,1.254914,0.350287,0.302488,0.693114,-0.371470,-1.070256,0.086781,-0.202836,0.035154,...,-0.287592,-0.832682,0.128083,0.339427,0.215944,0.094704,-0.023354,0.030892,2.69,1
753,562,-0.312911,0.815136,1.405270,0.070527,0.058558,-1.030855,0.760191,-0.182238,0.037325,...,-0.326542,-0.821322,-0.000188,0.276734,-0.243046,0.049775,0.064619,-0.126739,10.99,0
597,449,-0.856525,0.583290,1.389014,-0.344699,0.267594,-0.951375,0.523117,-0.049229,-0.076944,...,-0.056911,-0.309940,0.167010,0.359246,-0.969651,-0.185793,-0.136897,0.233672,10.20,0
407,294,-1.097477,1.246236,0.464855,-0.178196,0.249499,-0.725058,1.197760,-0.358585,-0.153112,...,-0.050148,0.114628,-0.188974,-0.066727,-0.160964,0.275263,0.117647,0.272015,73.00,0
1161,118,1.254914,0.350287,0.302488,0.693114,-0.371470,-1.070256,0.086781,-0.202836,0.035154,...,-0.287592,-0.832682,0.128083,0.339427,0.215944,0.094704,-0.023354,0.030892,2.69,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1320,472,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,...,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764,529.00,1
107,71,1.331897,-0.579962,0.422606,-0.897752,-0.746254,-0.056273,-0.750317,0.128484,-0.964682,...,-0.039070,-0.255174,0.109333,-0.328448,0.162254,-0.447276,0.020071,0.006231,14.48,0
674,510,1.163271,0.141760,0.124579,0.958551,-0.159554,-0.461529,0.090759,-0.023257,-0.125187,...,0.066320,0.089322,-0.169921,-0.012282,0.660756,-0.325616,-0.002210,0.010419,35.93,0
735,552,1.238533,-0.011075,0.332073,0.086029,-0.449032,-0.653983,-0.114230,-0.056522,0.304679,...,-0.078097,-0.206754,0.069282,0.117772,0.196515,0.760195,-0.054258,0.003194,5.95,0


In [18]:
samples=[]
samples.append(s1)
samples.append(s2)
samples.append(s3)
samples.append(s4)
samples.append(s5)

In [19]:
sol=pd.DataFrame(columns=['Simple-Random','Bootstrap','Stratified','Systematic','Cluster'], index=['Logistic Regression','Decision Tree','Random Forest','Naive Bayes','SVM'])

**Logistic Regression**

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [21]:
from sklearn.linear_model import LogisticRegression

for i in range(5):
    x_s=samples[i].drop('Class',axis=1)
    y_s=samples[i]['Class']
    xtrain, xtest, y_train, y_test = train_test_split(x_s ,y_s , random_state=104,test_size=0.25, shuffle=True)

    model = LogisticRegression(random_state = 0,max_iter=2000)
    model.fit(xtrain, y_train)
    y_pred = model.predict(xtest)
    acc = accuracy_score(y_test, y_pred)
    sol.iloc[0,i]=acc*100


**Decision Trees**

In [22]:
from sklearn.tree import DecisionTreeClassifier

for i in range(5):
    x_s=samples[i].drop('Class',axis=1)
    y_s=samples[i]['Class']
    xtrain, xtest, y_train, y_test = train_test_split(x_s ,y_s , random_state=104,test_size=0.25, shuffle=True)
    
    model = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
    model.fit(xtrain, y_train)
    y_pred = model.predict(xtest)
    acc = accuracy_score(y_test, y_pred)
    sol.iloc[1,i]=acc*100

**Random Forest**

In [23]:
from sklearn.ensemble import RandomForestClassifier

for i in range(5):
    x_s=samples[i].drop('Class',axis=1)
    y_s=samples[i]['Class']
    xtrain, xtest, y_train, y_test = train_test_split(x_s ,y_s , random_state=104,test_size=0.25, shuffle=True)

    model = RandomForestClassifier(n_estimators = 100)
    model.fit(xtrain, y_train)
    y_pred = model.predict(xtest)
    acc = accuracy_score(y_test, y_pred)
    sol.iloc[2,i]=acc*100

**Naive Bayes**

In [24]:
from sklearn.naive_bayes import GaussianNB

for i in range(5):
    x_s=samples[i].drop('Class',axis=1)
    y_s=samples[i]['Class']
    xtrain, xtest, y_train, y_test = train_test_split(x_s ,y_s , random_state=104,test_size=0.25, shuffle=True)

    model = GaussianNB()
    model.fit(xtrain,y_train)
    y_pred= model.predict(xtest)
    acc = accuracy_score(y_test, y_pred)
    sol.iloc[3,i]=acc*100

**Support Vector Machines**

In [25]:
from sklearn.svm import SVC

for i in range(5):
    x_s=samples[i].drop('Class',axis=1)
    y_s=samples[i]['Class']
    xtrain, xtest, y_train, y_test = train_test_split(x_s ,y_s , random_state=104,test_size=0.25, shuffle=True)

    model = SVC(kernel='rbf')
    model.fit(xtrain, y_train)
    y_pred=model.predict(xtest)
    acc = accuracy_score(y_test, y_pred)
    sol.iloc[4,i]=acc*100

In [26]:
print(sol)

                    Simple-Random  Bootstrap Stratified Systematic    Cluster
Logistic Regression     86.458333  92.708333  93.617021  93.333333  92.086331
Decision Tree           94.791667  97.916667   97.87234  73.333333  98.920863
Random Forest               100.0      100.0   98.93617  86.666667      100.0
Naive Bayes             70.833333  72.916667  84.042553  46.666667  76.978417
SVM                        65.625     78.125  72.340426  53.333333  71.582734


In [27]:
def highlight_max(s):
    is_max = s == s.max()
    return ['background-color: blue' if v else '' for v in is_max]
sol.style.apply(highlight_max, axis=1)

Unnamed: 0,Simple-Random,Bootstrap,Stratified,Systematic,Cluster
Logistic Regression,86.458333,92.708333,93.617021,93.333333,92.086331
Decision Tree,94.791667,97.916667,97.87234,73.333333,98.920863
Random Forest,100.0,100.0,98.93617,86.666667,100.0
Naive Bayes,70.833333,72.916667,84.042553,46.666667,76.978417
SVM,65.625,78.125,72.340426,53.333333,71.582734


*Random Forest has the best overall performance*

**Shapiro-Wilk Test**

*This test is used to check the goodness of a sample*

In [28]:
from scipy.stats import shapiro
for i in range(5):
  print(shapiro(samples[i]))

ShapiroResult(statistic=0.17280685901641846, pvalue=0.0)
ShapiroResult(statistic=0.17152827978134155, pvalue=0.0)
ShapiroResult(statistic=0.16854548454284668, pvalue=0.0)
ShapiroResult(statistic=0.17053323984146118, pvalue=0.0)
ShapiroResult(statistic=0.16387814283370972, pvalue=0.0)


