In [None]:
import pandas as pd
import numpy as np
import random
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score


In [None]:

data_frame=pd.read_csv("Creditcard_data.csv")


In [None]:

#Oversampling
over_sampling = SMOTE(sampling_strategy='minority',random_state=42)
x=data_frame.iloc[:,:-1]
y=data_frame.iloc[:,-1]
x_sampled,y_sampled=over_sampling.fit_resample(x,y)
data_frame2=pd.concat([x_sampled,y_sampled], axis=1)


In [None]:

#Simple Random Sampling
simpleRandomSampling=(1.96**2)*0.5*(1-0.5)/(0.05**2)
data_simpleRandomSampling=(data_frame2.sample(int(simpleRandomSampling),random_state=42))

x_train, x_test, y_train, y_test=train_test_split(data_simpleRandomSampling.iloc[:,:-1],data_simpleRandomSampling.iloc[:,-1],test_size=0.2,random_state=42)
model_1 = LogisticRegression(max_iter= 2500,random_state=42)
model_2 = DecisionTreeClassifier(random_state=42)
model_3 = ExtraTreesClassifier()
model_4 = AdaBoostClassifier()
model_5 = RandomForestClassifier()

model_1.fit(x_train,y_train)
model_2.fit(x_train,y_train)
model_3.fit(x_train,y_train)
model_4.fit(x_train,y_train)
model_5.fit(x_train,y_train)

y_pred1 = model_1.predict(x_test)
y_pred2 = model_2.predict(x_test)
y_pred3 = model_3.predict(x_test)
y_pred4 = model_4.predict(x_test)
y_pred5 = model_5.predict(x_test)

final_res=[[accuracy_score(y_test, y_pred1),accuracy_score(y_test, y_pred2),accuracy_score(y_test,y_pred3),accuracy_score(y_test,y_pred4),accuracy_score(y_test,y_pred5)]]


In [15]:

#Systematic Random Sampling
data_frame_systematic=data_frame2.iloc[[i for i in range(5,1000,2)],:]

x_train, x_test, y_train, y_test=train_test_split(data_frame_systematic.iloc[:,:-1],data_frame_systematic.iloc[:,-1],test_size=0.2,random_state=42)
model_1 = LogisticRegression(max_iter= 2500,random_state=42)
model_2 = DecisionTreeClassifier(random_state=42)
model_3 = ExtraTreesClassifier()
model_4 = AdaBoostClassifier()
model_5 = RandomForestClassifier()

model_1.fit(x_train,y_train)
model_2.fit(x_train,y_train)
model_3.fit(x_train,y_train)
model_4.fit(x_train,y_train)
model_5.fit(x_train,y_train)

y_pred1 = model_1.predict(x_test)
y_pred2 = model_2.predict(x_test)
y_pred3 = model_3.predict(x_test)
y_pred4 = model_4.predict(x_test)
y_pred5 = model_5.predict(x_test)

res=[accuracy_score(y_test, y_pred1),accuracy_score(y_test, y_pred2),accuracy_score(y_test,y_pred3),accuracy_score(y_test,y_pred4),accuracy_score(y_test,y_pred5)]
final_res.append(res)


In [16]:

#stratified sampling
sample_StratifiedSampling=(1.96**2)*0.3*(1-0.3)/((0.05/2)**2)
data_frame_stratified=data_frame2.groupby('Class', group_keys=False).apply(lambda x: x.sample(int(sample_StratifiedSampling/2),random_state=42))
x_train, x_test, y_train, y_test=train_test_split(data_frame_stratified.iloc[:,:-1],data_frame_stratified.iloc[:,-1],test_size=0.2,random_state=42)
model_1 = LogisticRegression(max_iter= 2500,random_state=42)
model_2 = DecisionTreeClassifier(random_state=42)
model_3 = ExtraTreesClassifier()
model_4 = AdaBoostClassifier()
model_5 = RandomForestClassifier()

model_1.fit(x_train,y_train)
model_2.fit(x_train,y_train)
model_3.fit(x_train,y_train)
model_4.fit(x_train,y_train)
model_5.fit(x_train,y_train)

y_pred1 = model_1.predict(x_test)
y_pred2 = model_2.predict(x_test)
y_pred3 = model_3.predict(x_test)
y_pred4 = model_4.predict(x_test)
y_pred5 = model_5.predict(x_test)

res=[accuracy_score(y_test, y_pred1),accuracy_score(y_test, y_pred2),accuracy_score(y_test,y_pred3),accuracy_score(y_test,y_pred4),accuracy_score(y_test,y_pred5)]
final_res.append(res)


In [17]:

#cluster sampling
sample_ClusterSampling=(1.96**2)*0.1*(1-0.1)/((0.05/3)**2)
s=set(list(data_frame2['Time']))
s1=pd.Series(list(s))
data_frame_clustered=(data_frame2[data_frame2['Time'].isin([ i for i in s1.sample(int(sample_ClusterSampling/3),random_state=42)])])

x_train, x_test, y_train, y_test=train_test_split(data_frame_clustered.iloc[:,:-1],data_frame_clustered.iloc[:,-1],test_size=0.2,random_state=42)
model_1 = LogisticRegression(max_iter= 2500,random_state=42)
model_2 = DecisionTreeClassifier(random_state=42)
model_3 = ExtraTreesClassifier()
model_4 = AdaBoostClassifier()
model_5 = RandomForestClassifier()

model_1.fit(x_train,y_train)
model_2.fit(x_train,y_train)
model_3.fit(x_train,y_train)
model_4.fit(x_train,y_train)
model_5.fit(x_train,y_train)

y_pred1 = model_1.predict(x_test)
y_pred2 = model_2.predict(x_test)
y_pred3 = model_3.predict(x_test)
y_pred4 = model_4.predict(x_test)
y_pred5 = model_5.predict(x_test)

res=[accuracy_score(y_test, y_pred1),accuracy_score(y_test, y_pred2),accuracy_score(y_test,y_pred3),accuracy_score(y_test,y_pred4),accuracy_score(y_test,y_pred5)]
final_res.append(res)



In [18]:

#Quota Sampling
data_frame_only0=data_frame2[data_frame2['Class']==0].iloc[:500]
data_frame_only1=data_frame2[data_frame2['Class']==1].iloc[:500]
data_frame_quotasampling =pd.concat([data_frame_only0 ,data_frame_only1], axis=0)

x_train, x_test, y_train, y_test=train_test_split(data_frame_quotasampling.iloc[:,:-1],data_frame_quotasampling.iloc[:,-1],test_size=0.2,random_state=42)
model_1 = LogisticRegression(max_iter= 2500,random_state=42)
model_2 = DecisionTreeClassifier(random_state=42)
model_3 = ExtraTreesClassifier()
model_4 = AdaBoostClassifier()
model_5 = RandomForestClassifier()

model_1.fit(x_train,y_train)
model_2.fit(x_train,y_train)
model_3.fit(x_train,y_train)
model_4.fit(x_train,y_train)
model_5.fit(x_train,y_train)

y_pred1 = model_1.predict(x_test)
y_pred2 = model_2.predict(x_test)
y_pred3 = model_3.predict(x_test)
y_pred4 = model_4.predict(x_test)
y_pred5 = model_5.predict(x_test)

res=[accuracy_score(y_test, y_pred1),accuracy_score(y_test, y_pred2),accuracy_score(y_test,y_pred3),accuracy_score(y_test,y_pred4),accuracy_score(y_test,y_pred5)]
final_res.append(res)


In [21]:
res_table=pd.DataFrame(final_res)
res_table = res_table.rename(columns={0: 'LogReg', 1: 'Decision Tree', 2: 'ExtraTrees', 3:'AdaBoost',4: 'Random Forest'})
res_table = res_table.rename(index={0: 'Sample1', 1: 'Sample2', 2: 'Sample3',3: 'Sample4',4: 'Sample5'})
print(res_table)


           LogReg  Decision Tree  ExtraTrees  AdaBoost  Random Forest
Sample1  0.883117       0.961039    0.961039  0.935065       0.974026
Sample2  0.910000       0.950000    0.990000  1.000000       0.950000
Sample3  0.922481       0.965116    0.996124  0.996124       0.992248
Sample4  0.906122       0.975510    0.991837  0.987755       0.995918
Sample5  0.915000       0.970000    0.995000  0.970000       1.000000


In [22]:

max_accuracy=-1
sampling=""
model=""
for i in range(len(final_res)):
    for j in range(len(final_res[0])):
        if (final_res[i][j]>max_accuracy):          
            if i==0:
                sampling="Simple Random Sampling"
            elif i==1:
                sampling="Systematic Random Sampling"
            elif i==2:
                sampling="Stratified sampling"
            elif i==3:
                sampling="Cluster Sampling"
            elif i==4:
                sampling="Quota Sampling"
            if j==0:
                model="Logistic Regression"
            elif j==1:
                model="Decision Tree Classifier"
            elif j==2:
                model="ExtraTrees Classifier"
            elif j==3:
                model="AdaBoost Classifier"
            elif j==4:
                model="Random Forest Classifier"                    
            max_accuracy=final_res[i][j]

print(f'{model} method with {sampling} gives the highest accuracy {max_accuracy}') 

AdaBoost Classifier method with Systematic Random Sampling gives the highest accuracy 1.0
