In [None]:
import pandas as pd
from google.colab import files
import io
from sklearn.preprocessing import StandardScaler

In [None]:
uploaded = files.upload()

Saving traindata.csv to traindata.csv


In [None]:
data = io.BytesIO(uploaded["traindata.csv"])
df = pd.read_csv(data, sep = None, engine = 'python')
df

Unnamed: 0,id,x1,x2,x3,y
0,1,60,64,0,1
1,2,54,60,11,0
2,3,65,62,22,0
3,4,34,60,0,1
4,5,38,69,21,0
...,...,...,...,...,...
292,293,65,67,0,1
293,294,53,65,12,0
294,295,57,64,1,0
295,296,54,59,7,1


#Normalisasi
Normalisasi memastikan bahwa variabel-variabel dengan skala yang berbeda-beda memiliki pengaruh yang setara dalam model.

#Standarisasi
Standarisasi mengubah distribusi data sehingga rata-ratanya menjadi 0 dan standar deviasinya menjadi 1, membantu algoritma yang mengasumsikan distribusi normal dan mengurangi pengaruh perbedaan skala antar variabel.

In [None]:
#partisi data
X = df.drop(['y','id'], axis = 1)
y = df.y

#normalisasi
df = (X - X.min()) / (X.max() - X.min())

#standarisasi
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

#konkatenasi data
df = pd.concat([X,y], axis = 1)
df.head()

Unnamed: 0,x1,x2,x3,y
0,0.693554,0.350178,-0.566946,1
1,0.141067,-0.887951,0.946299,0
2,1.15396,-0.268887,2.459543,0
3,-1.700557,-0.887951,-0.566946,1
4,-1.332232,1.897839,2.321976,0


In [None]:
#menghitung tingkat ketepatan model yang dibangun
def akurasi(y_pred,y_true):
    n = len(y_pred)
    benar = 0
    for i in range(n):
       if y_pred[i] == y_true[i]:
        benar += 1
    return benar /n * 100

In [None]:
def separate_by_class(df):
  splitted_df = {}
  for kelas in df.y.unique():
    splitted_df[kelas] = df[df.y == kelas].reset_index(drop=True)
  return splitted_df

separate_by_class(df)[0].head()

Unnamed: 0,x1,x2,x3,y
0,0.141067,-0.887951,0.946299,0
1,1.15396,-0.268887,2.459543,0
2,-1.332232,1.897839,2.321976,0
3,0.693554,-1.197483,1.771705,0
4,0.417311,-0.578419,0.120893,0


In [None]:
def summarize_data(df):
  return df.describe().loc[['mean', 'std','count']]

summarize_data(df)

Unnamed: 0,x1,x2,x3,y
mean,9.569599000000001e-17,-1.046675e-15,-2.8409750000000004e-17,0.737374
std,1.001688,1.001688,1.001688,0.440804
count,297.0,297.0,297.0,297.0


In [None]:
def summarize_by_class(df):
  separated = separate_by_class(df)
  summaries = {}
  for kelas in separated:
      summaries[kelas] = summarize_data(separated[kelas])
  return summaries

summarize_by_class(df)

{1:                x1          x2          x3      y
 mean    -0.048141    0.012378   -0.173716    1.0
 std      1.020798    0.998426    0.818043    0.0
 count  219.000000  219.000000  219.000000  219.0,
 0:               x1         x2         x3     y
 mean    0.135164  -0.034753   0.487740   0.0
 std     0.939087   1.016475   1.278957   0.0
 count  78.000000  78.000000  78.000000  78.0}

#Gaussian Equation

In [None]:
import math

def gaussian_equation(x,mean,std):
  exponen = math.exp(-((x-mean)**2/(2*std**2)))
  return ( 1 / (math.sqrt(2*math.pi) * std)) * exponen

gaussian_equation(1,1,1)

0.3989422804014327

In [None]:
def calculation_probabilites_class(x, summaries):
    n = sum([summaries[kelas].loc['count'].y for kelas in summaries])
    probs = {}
    for kelas in summaries:
        probs[kelas] = summaries[kelas].loc['count'].y / n
        for col in summaries[kelas].columns[:-1]:
            mean, std, _ = summaries[kelas].loc[:, col]
            probs[kelas] *= gaussian_equation(x[col], mean, std) if std != 0 else 0
    # Normalisasi probabilitas
    total_prob = sum(probs.values())
    for kelas in probs:
      if total_prob != 0:
        probs[kelas] /= total_prob
      else:
        probs[kelas] = 0
    return probs

summaries = summarize_by_class(df)
print("Probabilitas kelas baris pertama: ")
calculation_probabilites_class(df.iloc[0],summaries)

Probabilitas kelas baris pertama: 


{1: 0.8456642281280323, 0: 0.15433577187196768}

#Model Generation

Generasi model ke dalam fungsi predict untuk melatih data

In [None]:
def predict(train, test):
    preds = []
    summaries = summarize_by_class(train)
    for _, row in test.iterrows():
        probs = calculation_probabilites_class(row, summaries)
        preds.append(max(probs, key=probs.get))
    return preds

#Data Folding
memastikan bahwa model dinilai pada beragam subset data, mengurangi kemungkinan overfitting atau evaluasi yang terlalu berfokus pada karakteristik tertentu dari dataset. Dengan cara ini, data folding memberikan estimasi kinerja model yang lebih reliabel dan meminimalkan risiko overoptimistik pada hasil evaluasi.

In [None]:
# Menentukan ukuran setiap bagian
size = len(df)
size_per_fold = size // 3

# Mengambil bagian pertama untuk testing dan dua bagian sisanya untuk training
fold_first = (df.iloc[:size_per_fold].reset_index(drop=True), df.iloc[size_per_fold:].reset_index(drop=True))

# Mengambil 50 persen data dengan menyilang masing-masing untuk training dan testing
fold_second = (pd.concat([df.iloc[:size_per_fold], df.iloc[2*size_per_fold:]]).reset_index(drop=True),
               pd.concat([df.iloc[size_per_fold:2*size_per_fold]]).reset_index(drop=True))

# Mengambil quarter akhir untuk testing dan dua quarter awal untuk training
fold_third = (df.iloc[size_per_fold:].reset_index(drop=True), df.iloc[:2*size_per_fold].reset_index(drop=True))

#Pengambilan Data Training terbaik untuk Testing Data Generation
mengambil data training dengan akurasi tertinggi untuk dijadikan validation training data

In [None]:
accuracies = []

for fold_idx, fold in enumerate([fold_first, fold_second, fold_third], start=1):
    train, test = fold

    y_test = test.y

    # Add more print statements if needed
    y_preds = predict(train, test)

    accuracy = akurasi(y_preds, y_test)
    accuracies.append(accuracy)

    print(f'Fold {fold_idx}: Akurasi = {accuracy:.2f}%')

max_accuracy_fold = accuracies.index(max(accuracies)) + 1
print(f'Akurasi tertinggi diperoleh pada Fold ke-{max_accuracy_fold} dengan akurasi {max(accuracies):.2f}%')


Fold 1: Akurasi = 72.73%
Fold 2: Akurasi = 77.78%
Fold 3: Akurasi = 79.29%
Akurasi tertinggi diperoleh pada Fold ke-3 dengan akurasi 79.29%


#Validasi

In [None]:
testdata = files.upload()

Saving testdata.csv to testdata.csv


In [48]:
data = io.BytesIO(testdata["testdata.csv"])
df = pd.read_csv(data, sep = None, engine = 'python')
df

Unnamed: 0,id,x1,x2,x3,y
0,297,43,59,2,?
1,298,67,66,0,?
2,299,58,60,3,?
3,300,49,63,3,?
4,301,45,60,0,?
5,302,54,58,1,?
6,303,56,66,3,?
7,304,42,69,1,?
8,305,50,59,2,?
9,306,59,60,0,?


In [None]:
df = df.drop("id", axis = 1)
df

Unnamed: 0,x1,x2,x3,y
0,43,59,2,?
1,67,66,0,?
2,58,60,3,?
3,49,63,3,?
4,45,60,0,?
5,54,58,1,?
6,56,66,3,?
7,42,69,1,?
8,50,59,2,?
9,59,60,0,?


In [None]:
def generate_data_from_model():
  preds = []
  train = fold_third[0]
  test = df
  summaries = summarize_by_class(train)
  for _, row in test.iterrows():
        # Cek apakah nilai y pada baris saat ini adalah '?'
        if row['y'] == '?':
            # Hitung probabilitas menggunakan fungsi calculation_probabilites_class
            probs = calculation_probabilites_class(row.drop('y'), summaries)
            # Ambil kelas dengan probabilitas tertinggi sebagai hasil prediksi
            predicted_class = max(probs, key=probs.get)
            # Ganti nilai '?' dengan hasil prediksi
            row['y'] = predicted_class
        preds.append(row['y'])
  return preds

generate_data_from_model()

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

##Save output to file

In [50]:
predicted_values = generate_data_from_model()
df['y'] = predicted_values
df


Unnamed: 0,id,x1,x2,x3,y
0,297,43,59,2,1
1,298,67,66,0,1
2,299,58,60,3,1
3,300,49,63,3,1
4,301,45,60,0,1
5,302,54,58,1,1
6,303,56,66,3,1
7,304,42,69,1,1
8,305,50,59,2,1
9,306,59,60,0,1


In [None]:
df.to_csv('predicted_data.csv', index=False)