In [1]:
import math
import os
import random
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

from utils import one_hot, resize_to_ori_calMRE, resize_to_ori


In [2]:
# Output locations for this experiment. The trailing slashes are kept on
# purpose: the export cell builds its file name by plain string concatenation
# onto `generated_path`.
folder = 'SMOTE_alt4/'
gen_results = 'generated_results/'

model = 'model_'+'.ckpt'

# save ckpt
saver_path = os.path.join(folder, model)

# read ckpt
restore_path = os.path.join(folder)

# save generated data
generated_path = os.path.join(folder, gen_results)

# Create `folder` and its `gen_results` sub-directory in one idempotent call;
# exist_ok=True makes re-running the cell safe without a separate isdir check.
os.makedirs(generated_path, exist_ok=True)

In [3]:
# `math.factorial` replaces `np.math.factorial`: `np.math` was only an
# accidental alias of the stdlib module and no longer exists in NumPy >= 2.0.
# 6! = 720 = img_width * img_height, the flattened length of one resized row.
img_size = math.factorial(6)
img_width = 30
img_height = 24
# 4! = 24, the number of orderings of four alternatives.
ori_size = math.factorial(4)

# Keep the first 1000 rows, drop the first CSV column (presumably a saved
# index -- verify against the file), and view every row as a
# (img_height, img_width) grid.
data_alt4 = (
    pd.read_csv('./data/netflix_data_4alt_resize.csv')
    .iloc[:1000, 1:]
    .values
    .reshape([-1, img_height, img_width])
)
print(data_alt4.shape)

# Map the resized grids back to the original representation; per the printed
# shape below this yields one row of `ori_size` values per sample.
# NOTE(review): exact semantics live in utils.resize_to_ori_calMRE -- confirm there.
data_alt4_ori = resize_to_ori_calMRE(data_alt4, img_size, img_width, img_height, ori_size)
print(data_alt4_ori.shape)

(1000, 24, 30)
(1000, 24)


In [4]:
class Smote:
    """
    SMOTE (Synthetic Minority Over-sampling TEchnique) over-sampler.

    Every synthetic sample is a random point on the line segment between an
    input sample and one of its nearest neighbours among the input samples.

    Parameters
    ----------
    sampling_rate : int
        Number of synthetic samples generated per input sample.
        The original author's note recommends ``sampling_rate < k``; this is
        not enforced.
    k : int
        Number of nearest neighbours (the sample itself excluded) from which
        the interpolation partner is drawn.

    Attributes
    ----------
    synthetic : ndarray of shape (n_samples * sampling_rate, n_features)
        Synthetic samples produced by the most recent ``fit`` call.
    newindex : int
        Write cursor into ``synthetic``: index of the next row to fill.
    """
    def __init__(self, sampling_rate=5, k=5):
        self.sampling_rate = sampling_rate
        self.k = k
        self.newindex = 0

    def fit(self, X, y=None):
        """
        Generate ``sampling_rate`` synthetic samples for every row to over-sample.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input samples.
        y : array-like of shape (n_samples,), optional
            Binary labels. When given, only rows with ``y == 1`` are
            over-sampled and rows with ``y == 0`` are passed through; rows
            carrying any other label are dropped.

        Returns
        -------
        ndarray
            When ``y`` is None: only the synthetic samples.
        tuple of (ndarray, ndarray)
            When ``y`` is given: ``(samples, labels)`` where ``samples`` is
            synthetic + original positives + negatives (in that order) and
            ``labels`` holds 1 for the first two groups followed by the
            negatives' labels.
        """
        X = np.asarray(X)
        if y is not None:
            y = np.asarray(y)
            negative_X = X[y == 0]
            X = X[y == 1]

        n_samples, n_features = X.shape
        # Buffer for the synthetic samples; the cursor is reset so that the
        # same instance can be fitted more than once.
        self.synthetic = np.zeros((n_samples * self.sampling_rate, n_features))
        self.newindex = 0

        if n_samples > 0:
            # A point queried against the set it belongs to is its own nearest
            # neighbour, so ask for one extra neighbour and drop the point
            # itself below. Clamp to n_samples, the most the index can return.
            n_query = min(self.k + 1, n_samples)
            knn = NearestNeighbors(n_neighbors=n_query).fit(X)
            # One batched query for all rows instead of one query per row.
            neighbor_rows = knn.kneighbors(X, return_distance=False)
            for i, neighbor_row in enumerate(neighbor_rows):
                self.synthetic_samples(X, i, self._other_neighbors(neighbor_row, i))

        if y is not None:
            return ( np.concatenate((self.synthetic, X, negative_X), axis=0),
                     np.concatenate(([1]*(len(self.synthetic)+len(X)), y[y==0]), axis=0) )

        return self.synthetic

    def _other_neighbors(self, neighbor_row, i):
        """Return at most ``k`` indices from ``neighbor_row`` with sample ``i`` removed.

        Falls back to ``[i]`` when no other sample is available (single-sample
        input), in which case the synthetic points are copies of that sample.
        """
        others = neighbor_row[neighbor_row != i][:self.k]
        if others.size == 0:
            return np.array([i])
        return others

    def synthetic_samples(self, X, i, k_neighbors):
        """
        Write ``sampling_rate`` synthetic samples derived from ``X[i]``.

        Each one is ``X[i] + t * (X[neighbor] - X[i])`` with ``neighbor``
        drawn uniformly from ``k_neighbors`` and ``t`` uniform in [0, 1).
        Rows are written into ``self.synthetic`` starting at ``self.newindex``,
        which is advanced by one per sample.
        """
        for _ in range(self.sampling_rate):
            # Pick the interpolation partner among the candidate neighbours.
            neighbor = np.random.choice(k_neighbors)
            # Direction from the sample towards the chosen neighbour.
            diff = X[neighbor] - X[i]
            # Random point on the segment between the two.
            self.synthetic[self.newindex] = X[i] + random.random() * diff
            self.newindex += 1
            

In [5]:
# Seed both RNGs that Smote draws from (np.random.choice picks the neighbour,
# random.random picks the interpolation factor) so that re-running the
# notebook regenerates the same synthetic data.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

TARGET_SAMPLES = 20000     # total number of synthetic rows to generate
SAMPLES_PER_ROUND = 1000   # rows expected from one pass (sampling_rate=1 over the 1000 loaded rows)

gen_dataset = []

# Each round runs a fresh SMOTE pass over the original data and keeps only the
# synthetic rows it returns.
for round_idx in range(TARGET_SAMPLES // SAMPLES_PER_ROUND):
    smote = Smote(sampling_rate = 1, k = 20)
    gen_alt4 = smote.fit(data_alt4_ori)
    gen_dataset.append(gen_alt4)

In [6]:
# Collapse the per-round results into a single 2-D table: one synthetic sample
# per row, with the same number of columns as `data_alt4_ori`.
n_columns = data_alt4_ori.shape[-1]
gen_dataset = np.asarray(gen_dataset).reshape(-1, n_columns)
print(gen_dataset.shape)

(20000, 24)


In [7]:
# Column labels: the 24 orderings of the four alternatives A-D.
# NOTE(review): presumably this matches the column order of the source CSV -- verify.
ranking_columns = [
    'ABCD', 'ACBD', 'BACD', 'BCAD', 'CABD', 'CBAD',
    'DABC', 'DACB', 'DBAC', 'DBCA', 'DCAB', 'DCBA',
    'ADBC', 'ADCB', 'BDAC', 'BDCA', 'CDAB', 'CDBA',
    'ABDC', 'ACDB', 'BADC', 'BCDA', 'CADB', 'CBDA',
]
gen_alt_pd = pd.DataFrame(gen_dataset, columns=ranking_columns)

# The DataFrame index is written as the first CSV column (to_csv default).
# NOTE(review): 'generated_atl4' looks like a typo for 'alt4'; the name is left
# unchanged because other code may already read this file.
output_csv = generated_path + 'generated_atl4' + '.csv'
gen_alt_pd.to_csv(output_csv)
                    