In [1]:
import numpy as np
import pandas as pd
from random import shuffle

In [2]:
store_codes = np.arange(1,10)
product_codes = np.arange(1,101)

In [3]:
date_range = pd.date_range(start = "2019-01-01", end = "2020-12-31", freq="D")

In [4]:
index = pd.MultiIndex.from_product(
   [date_range, store_codes, product_codes],
   names = ["Date", "StoreCode", "ProductCode"]
)
sales = pd.DataFrame(index = index)
sales.head(105)

Date,StoreCode,ProductCode
2019-01-01,1,1
2019-01-01,1,2
2019-01-01,1,3
2019-01-01,1,4
2019-01-01,1,5
2019-01-01,...,...
2019-01-01,2,1
2019-01-01,2,2
2019-01-01,2,3
2019-01-01,2,4


In [5]:
sales.reset_index(inplace=True)
sales.head()

Unnamed: 0,Date,StoreCode,ProductCode
0,2019-01-01,1,1
1,2019-01-01,1,2
2,2019-01-01,1,3
3,2019-01-01,1,4
4,2019-01-01,1,5


In [6]:
sales.tail()

Unnamed: 0,Date,StoreCode,ProductCode
657895,2020-12-31,9,96
657896,2020-12-31,9,97
657897,2020-12-31,9,98
657898,2020-12-31,9,99
657899,2020-12-31,9,100


In [42]:
store_groups = ["Small","Medium","Large"]*3
shuffle(store_groups)
stores = pd.DataFrame({
   "StoreCode": np.random.randint(1,10),
   "StoreGroup": store_groups
})
stores

Unnamed: 0,StoreCode,StoreGroup
0,4,Small
1,4,Medium
2,4,Small
3,4,Large
4,4,Large
5,4,Medium
6,4,Large
7,4,Small
8,4,Medium


In [8]:
stores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   StoreCode   9 non-null      int64 
 1   StoreGroup  9 non-null      object
dtypes: int64(1), object(1)
memory usage: 272.0+ bytes


In [9]:
product_groups = ["A","B","C","D"] * 25
shuffle(product_groups)
products = pd.DataFrame({
   "ProductCode": np.arange(1,101),
   "ProductGroup": product_groups
})
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ProductCode   100 non-null    int64 
 1   ProductGroup  100 non-null    object
dtypes: int64(1), object(1)
memory usage: 1.7+ KB


In [10]:
products

Unnamed: 0,ProductCode,ProductGroup
0,1,C
1,2,A
2,3,D
3,4,A
4,5,A
...,...,...
95,96,A
96,97,A
97,98,A
98,99,A


In [11]:
comp = [
    "comp1",
"comp2",
"comp3",
"comp4",
"comp5",
"comp6",
"comp7",
"comp8",
"comp9",
"comp10",
"comp11",
"comp12",
"comp13",
"comp14",
"comp15",
"comp16",
"comp17",
"comp18",
"comp19",
"comp20",
"comp21",
"comp22",
"comp23",
"comp24",
"comp25"]
date_range = pd.date_range(start = "2015-01-01 06:00:00", end = "2015-12-31 06:00:00", freq="D")
shuffle(comp)
machineID = np.arange(1,101) #rand
# shuffle(machineID)

index = pd.MultiIndex.from_product(
   [date_range, machineID, comp],
   names = ["datetime", "machineID", "comp"]
)
maint = pd.DataFrame(index = index)
maint.reset_index(inplace=True)
maint.head()
maint.tail(105)

Unnamed: 0,datetime,machineID,comp
912395,2015-12-31 06:00:00,96,comp2
912396,2015-12-31 06:00:00,96,comp1
912397,2015-12-31 06:00:00,96,comp25
912398,2015-12-31 06:00:00,96,comp19
912399,2015-12-31 06:00:00,96,comp20
...,...,...,...
912495,2015-12-31 06:00:00,100,comp2
912496,2015-12-31 06:00:00,100,comp1
912497,2015-12-31 06:00:00,100,comp25
912498,2015-12-31 06:00:00,100,comp19


In [12]:
maint.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912500 entries, 0 to 912499
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   datetime   912500 non-null  datetime64[ns]
 1   machineID  912500 non-null  int64         
 2   comp       912500 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 20.9+ MB


In [13]:

np.random.seed(10)

remove_n = 151863
drop_indices = np.random.choice(maint.index, remove_n, replace=False)
maint = maint.drop(drop_indices)

In [14]:
maint.info()
maint.to_csv("maint_mock.csv", sep='\t')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 760637 entries, 1 to 912499
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   datetime   760637 non-null  datetime64[ns]
 1   machineID  760637 non-null  int64         
 2   comp       760637 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 23.2+ MB


In [15]:
maint.head()

Unnamed: 0,datetime,machineID,comp
1,2015-01-01 06:00:00,1,comp22
2,2015-01-01 06:00:00,1,comp13
4,2015-01-01 06:00:00,1,comp11
6,2015-01-01 06:00:00,1,comp10
7,2015-01-01 06:00:00,1,comp24


In [16]:
maint.tail(105)

Unnamed: 0,datetime,machineID,comp
912373,2015-12-31 06:00:00,95,comp19
912374,2015-12-31 06:00:00,95,comp20
912375,2015-12-31 06:00:00,96,comp15
912377,2015-12-31 06:00:00,96,comp13
912378,2015-12-31 06:00:00,96,comp17
...,...,...,...
912493,2015-12-31 06:00:00,100,comp14
912494,2015-12-31 06:00:00,100,comp23
912495,2015-12-31 06:00:00,100,comp2
912498,2015-12-31 06:00:00,100,comp19


In [17]:
pd.set_option('display.max_rows', 500)
rows = maint['machineID'].groupby(maint['datetime']).value_counts()
rows.head(500)

datetime             machineID
2015-01-01 06:00:00  26           24
                     43           24
                     89           24
                     16           23
                     21           23
                     45           23
                     50           23
                     75           23
                     80           23
                     87           23
                     95           23
                     99           23
                     100          23
                     1            22
                     15           22
                     22           22
                     23           22
                     31           22
                     33           22
                     34           22
                     37           22
                     38           22
                     54           22
                     55           22
                     56           22
                     58           22
       

In [18]:
#delete random

In [19]:
comp = [
    "comp1",
"comp2",
"comp3",
"comp4",
"comp5",
"comp6",
"comp7",
"comp8",
"comp9",
"comp10",
"comp11",
"comp12",
"comp13",
"comp14",
"comp15",
"comp16",
"comp17",
"comp18",
"comp19",
"comp20",
"comp21",
"comp22",
"comp23",
"comp24",
"comp25"]
date_range = pd.date_range(start = "2015-01-01 06:00:00", end = "2015-12-31 06:00:00", freq="D")
shuffle(comp)
machineID = np.arange(1,101) #rand
# shuffle(machineID)

index = pd.MultiIndex.from_product(
   [date_range, machineID, comp],
   names = ["datetime", "machineID", "comp"]
)
failures = pd.DataFrame(index = index)
failures.reset_index(inplace=True)
failures.head()
failures.tail(105)
failures.info()
failures.to_csv("failures_mock.csv", sep='\t')

np.random.seed(10)

remove_n = 451863
drop_indices = np.random.choice(failures.index, remove_n, replace=False)
failures = failures.drop(drop_indices)
failures.info()

failures
# rows = failures['machineID'].groupby(failures['comp']).value_counts()
# rows.head(500)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912500 entries, 0 to 912499
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   datetime   912500 non-null  datetime64[ns]
 1   machineID  912500 non-null  int64         
 2   comp       912500 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 20.9+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 460637 entries, 1 to 912498
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   datetime   460637 non-null  datetime64[ns]
 1   machineID  460637 non-null  int64         
 2   comp       460637 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 14.1+ MB


Unnamed: 0,datetime,machineID,comp
1,2015-01-01 06:00:00,1,comp2
2,2015-01-01 06:00:00,1,comp25
6,2015-01-01 06:00:00,1,comp5
10,2015-01-01 06:00:00,1,comp9
12,2015-01-01 06:00:00,1,comp10
...,...,...,...
912490,2015-12-31 06:00:00,100,comp13
912491,2015-12-31 06:00:00,100,comp22
912494,2015-12-31 06:00:00,100,comp20
912495,2015-12-31 06:00:00,100,comp14


In [20]:
failures

Unnamed: 0,datetime,machineID,comp
1,2015-01-01 06:00:00,1,comp2
2,2015-01-01 06:00:00,1,comp25
6,2015-01-01 06:00:00,1,comp5
10,2015-01-01 06:00:00,1,comp9
12,2015-01-01 06:00:00,1,comp10
...,...,...,...
912490,2015-12-31 06:00:00,100,comp13
912491,2015-12-31 06:00:00,100,comp22
912494,2015-12-31 06:00:00,100,comp20
912495,2015-12-31 06:00:00,100,comp14


In [21]:
date_range = pd.date_range(start = "2015-01-01 06:00:00", end = "2015-12-31 06:00:00", freq="D")
date_range

DatetimeIndex(['2015-01-01 06:00:00', '2015-01-02 06:00:00',
               '2015-01-03 06:00:00', '2015-01-04 06:00:00',
               '2015-01-05 06:00:00', '2015-01-06 06:00:00',
               '2015-01-07 06:00:00', '2015-01-08 06:00:00',
               '2015-01-09 06:00:00', '2015-01-10 06:00:00',
               ...
               '2015-12-22 06:00:00', '2015-12-23 06:00:00',
               '2015-12-24 06:00:00', '2015-12-25 06:00:00',
               '2015-12-26 06:00:00', '2015-12-27 06:00:00',
               '2015-12-28 06:00:00', '2015-12-29 06:00:00',
               '2015-12-30 06:00:00', '2015-12-31 06:00:00'],
              dtype='datetime64[ns]', length=365, freq='D')

In [22]:

min_date = pd.to_datetime("2015-01-01 06:00:00")
max_date = pd.to_datetime("2015-12-31 06:00:00")

d = (max_date - min_date).days + 1

date_range = min_date + pd.to_timedelta(pd.np.random.randint(d,size=88799), unit='d')

  


In [23]:
date_range.value_counts()

2015-08-03 06:00:00    292
2015-10-29 06:00:00    283
2015-11-25 06:00:00    281
2015-02-28 06:00:00    281
2015-08-27 06:00:00    281
2015-06-05 06:00:00    281
2015-08-25 06:00:00    279
2015-02-24 06:00:00    278
2015-05-18 06:00:00    275
2015-06-23 06:00:00    275
2015-03-20 06:00:00    273
2015-03-14 06:00:00    272
2015-08-01 06:00:00    272
2015-03-18 06:00:00    272
2015-12-19 06:00:00    271
2015-03-16 06:00:00    270
2015-04-30 06:00:00    270
2015-02-04 06:00:00    269
2015-11-28 06:00:00    269
2015-07-22 06:00:00    267
2015-08-29 06:00:00    267
2015-10-22 06:00:00    266
2015-01-31 06:00:00    266
2015-07-17 06:00:00    266
2015-04-05 06:00:00    266
2015-04-19 06:00:00    265
2015-11-14 06:00:00    265
2015-01-21 06:00:00    265
2015-10-08 06:00:00    265
2015-11-02 06:00:00    264
2015-02-27 06:00:00    264
2015-01-05 06:00:00    264
2015-03-25 06:00:00    264
2015-06-22 06:00:00    264
2015-05-07 06:00:00    264
2015-09-07 06:00:00    264
2015-01-09 06:00:00    264
2

In [24]:
comp = [
    "comp1",
"comp2",
"comp3",
"comp4",
"comp5",
"comp6",
"comp7",
"comp8",
"comp9",
"comp10",
"comp11",
"comp12",
"comp13",
"comp14",
"comp15",
"comp16",
"comp17",
"comp18",
"comp19",
"comp20",
"comp21",
"comp22",
"comp23",
"comp24",
"comp25"]
min_date = pd.to_datetime("2015-01-01 06:00:00")
max_date = pd.to_datetime("2015-12-31 06:00:00")

d = (max_date - min_date).days + 1

date_range = min_date + pd.to_timedelta(pd.np.random.randint(d,size=889), unit='d')
# shuffle(comp)
machineID = np.arange(1,101) #rand
# shuffle(machineID)

index = pd.MultiIndex.from_product(
   [date_range, machineID, comp],
   names = ["datetime", "machineID", "comp"]
)
failures = pd.DataFrame(index = index)
failures.reset_index(inplace=True)
failures.head()
failures.tail(105)
failures.info()
# failures.to_csv("failures_mock.csv", sep='\t')

np.random.seed(10)

remove_n = 451863
drop_indices = np.random.choice(failures.index, remove_n, replace=False)
failures = failures.drop(drop_indices)
failures.info()


rows = failures['machineID'].groupby(failures['comp']).value_counts()
rows.head(500)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2222500 entries, 0 to 2222499
Data columns (total 3 columns):
 #   Column     Dtype         
---  ------     -----         
 0   datetime   datetime64[ns]
 1   machineID  int64         
 2   comp       object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 50.9+ MB




In [25]:
failures.head()

Unnamed: 0,datetime,machineID,comp
0,2015-02-27 06:00:00,1,comp1
1,2015-02-27 06:00:00,1,comp2
2,2015-02-27 06:00:00,1,comp3
3,2015-02-27 06:00:00,1,comp4
4,2015-02-27 06:00:00,1,comp5


In [26]:
comp = [
    "comp1",
"comp2",
"comp3",
"comp4",
"comp5",
"comp6",
"comp7",
"comp8",
"comp9",
"comp10",
"comp11",
"comp12",
"comp13",
"comp14",
"comp15",
"comp16",
"comp17",
"comp18",
"comp19",
"comp20",
"comp21",
"comp22",
"comp23",
"comp24",
"comp25"]
date = pd.date_range(start = "2015-01-01 06:00:00", end = "2015-01-31 06:00:00", freq="D")

machineID = np.arange(1,101)
shuffle(comp)
shuffle(machineID)
index = pd.MultiIndex.from_product(
   [date, machineID, comp],
   names = ["datetime", "machineID", "comp"]
)
failures = pd.DataFrame(index = index)

In [27]:
failures

datetime,machineID,comp
2015-01-01 06:00:00,100,comp13
2015-01-01 06:00:00,100,comp21
2015-01-01 06:00:00,100,comp4
2015-01-01 06:00:00,100,comp15
2015-01-01 06:00:00,100,comp17
...,...,...
2015-01-31 06:00:00,84,comp1
2015-01-31 06:00:00,84,comp11
2015-01-31 06:00:00,84,comp2
2015-01-31 06:00:00,84,comp22


In [36]:
import pandas as pd
df = pd.DataFrame({'client':['123AASD45', '2345OPU78', '763LKJ90'], 'frequency':[10,9,2]})

def date_range(n, start='1/1/2011', end='4/1/2011'):
    date_range = pd.date_range(start, end)
    return list(pd.Series(date_range).sample(n))

In [37]:
df

Unnamed: 0,client,frequency
0,123AASD45,10
1,2345OPU78,9
2,763LKJ90,2


In [38]:
df['dates'] = df['frequency'].apply(lambda x: date_range(x))
df_dates = df['dates'].apply(pd.Series).reset_index()
df_dates = df_dates.melt(id_vars='index').dropna().drop(['variable'], axis=1).set_index('index')

In [35]:
df

Unnamed: 0,client,frequency,dates
0,123AASD45,10,"[2011-03-14 00:00:00, 2011-03-01 00:00:00, 201..."
1,2345OPU78,9,"[2011-01-26 00:00:00, 2011-03-08 00:00:00, 201..."
2,763LKJ90,2,"[2011-03-14 00:00:00, 2011-03-25 00:00:00]"


In [57]:
# np.random.seed(seed=1111)
# data = np.random.randint(1, high=100, size=len(date))
date = pd.date_range(start = "2015-01-01 06:00:00", end = "2015-01-31 06:00:00", freq="W")
machineID = np.random.randint(1,101)
df = pd.DataFrame({ 'machineID': machineID, 'date' :date})
df = df.set_index('date')
print(df)

                     machineID
date                          
2015-01-04 06:00:00         92
2015-01-11 06:00:00         92
2015-01-18 06:00:00         92
2015-01-25 06:00:00         92


In [83]:
comp = [
    "comp1",
"comp2",
"comp3",
"comp4",
"comp5",
"comp6",
"comp7",
"comp8",
"comp9",
"comp10",
"comp11",
"comp12",
"comp13",
"comp14",
"comp15",
"comp16",
"comp17",
"comp18",
"comp19",
"comp20",
"comp21",
"comp22",
"comp23",
"comp24",
"comp25"]
min_date = pd.to_datetime("2015-01-01 06:00:00")
max_date = pd.to_datetime("2015-12-31 06:00:00")

d = (max_date - min_date).days + 1

# date_range = min_date + pd.to_timedelta(pd.np.random.randint(d,size=10), unit='d')
date_range = pd.date_range(start = "2015-01-01 06:00:00", end = "2015-01-31 06:00:00", freq="W")

# shuffle(comp)
machineID = np.arange(1,101) #rand
# shuffle(date_range)
shuffle(machineID)
shuffle(comp)

index = pd.MultiIndex.from_product(
   [date_range, machineID, comp],
   names = ["datetime", "machineID", "comp"]
)
failures = pd.DataFrame(index = index)
failures.reset_index(inplace=True)
# failures.head()
# failures.tail(105)
failures.info()
# failures.to_csv("failures_mock.csv", sep='\t')

# np.random.seed(10)

# remove_n = 451863
# drop_indices = np.random.choice(failures.index, remove_n, replace=False)
# failures = failures.drop(drop_indices)
# failures.info()


# rows = failures['machineID'].groupby(failures['comp']).value_counts()
# rows.head(500)




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   datetime   10000 non-null  datetime64[ns]
 1   machineID  10000 non-null  int64         
 2   comp       10000 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 234.5+ KB


In [84]:
failures.head(40)

Unnamed: 0,datetime,machineID,comp
0,2015-01-04 06:00:00,50,comp14
1,2015-01-04 06:00:00,50,comp10
2,2015-01-04 06:00:00,50,comp23
3,2015-01-04 06:00:00,50,comp7
4,2015-01-04 06:00:00,50,comp2
5,2015-01-04 06:00:00,50,comp3
6,2015-01-04 06:00:00,50,comp9
7,2015-01-04 06:00:00,50,comp15
8,2015-01-04 06:00:00,50,comp19
9,2015-01-04 06:00:00,50,comp21


In [88]:
failures=failures.iloc[::2496, :]
failures.info()
failures.head(25)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 9984
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   datetime   5 non-null      datetime64[ns]
 1   machineID  5 non-null      int64         
 2   comp       5 non-null      object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 248.0+ bytes


Unnamed: 0,datetime,machineID,comp
0,2015-01-04 06:00:00,50,comp14
2496,2015-01-04 06:00:00,22,comp13
4992,2015-01-11 06:00:00,22,comp16
7488,2015-01-18 06:00:00,22,comp5
9984,2015-01-25 06:00:00,22,comp21


In [81]:
comp = [
    "comp1",
"comp2",
"comp3",
"comp4",
"comp5",
"comp6",
"comp7",
"comp8",
"comp9",
"comp10",
"comp11",
"comp12",
"comp13",
"comp14",
"comp15",
"comp16",
"comp17",
"comp18",
"comp19",
"comp20",
"comp21",
"comp22",
"comp23",
"comp24",
"comp25"]
min_date = pd.to_datetime("2015-01-01 06:00:00")
max_date = pd.to_datetime("2015-12-31 06:00:00")

d = (max_date - min_date).days + 1

date_range = min_date + pd.to_timedelta(pd.np.random.randint(d,size=1), unit='w')
# shuffle(comp)
machineID = np.arange(1,101) #rand
# shuffle(machineID)

index = pd.MultiIndex.from_product(
   [date_range, machineID, comp],
   names = ["datetime", "machineID", "comp"]
)
failures = pd.DataFrame(index = index)
failures.reset_index(inplace=True)
failures.head()
failures.tail(105)
failures.info()
# failures.to_csv("failures_mock.csv", sep='\t')

# np.random.seed(10)

# remove_n = 40
# drop_indices = np.random.choice(failures.index, remove_n, replace=False)
# failures = failures.drop(drop_indices)
# failures.info()


# rows = failures['machineID'].groupby(failures['comp']).value_counts()
# rows.head(500)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   datetime   2500 non-null   datetime64[ns]
 1   machineID  2500 non-null   int64         
 2   comp       2500 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 58.7+ KB




In [82]:
failures

Unnamed: 0,datetime,machineID,comp
0,2021-10-14 06:00:00,1,comp1
1,2021-10-14 06:00:00,1,comp2
2,2021-10-14 06:00:00,1,comp3
3,2021-10-14 06:00:00,1,comp4
4,2021-10-14 06:00:00,1,comp5
...,...,...,...
2495,2021-10-14 06:00:00,100,comp21
2496,2021-10-14 06:00:00,100,comp22
2497,2021-10-14 06:00:00,100,comp23
2498,2021-10-14 06:00:00,100,comp24


Collecting Faker
  Downloading Faker-10.0.0-py3-none-any.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 4.1 MB/s eta 0:00:01
[?25hCollecting typing-extensions>=3.10.0.2
  Downloading typing_extensions-4.0.1-py3-none-any.whl (22 kB)
Collecting text-unidecode==1.3
  Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 924 kB/s s eta 0:00:01
Installing collected packages: typing-extensions, text-unidecode, Faker
  Attempting uninstall: typing-extensions
    Found existing installation: typing-extensions 3.7.4.3
    Uninstalling typing-extensions-3.7.4.3:
      Successfully uninstalled typing-extensions-3.7.4.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aiobotocore 1.3.3 requires botocore<1.20.107,>=1.20.106, but you have botocore 1.21.13 which is incompatible.
detectron2 0.5 require

NameError: name 'Fake_date' is not defined

In [11]:
comp = [
    "comp1",
"comp2",
"comp3",
"comp4",
"comp5",
"comp6",
"comp7",
"comp8",
"comp9",
"comp10",
"comp11",
"comp12",
"comp13",
"comp14",
"comp15",
"comp16",
"comp17",
"comp18",
"comp19",
"comp20",
"comp21",
"comp22",
"comp23",
"comp24",
"comp25"]
# date = pd.date_range(start = "2015-01-01 06:00:00", end = "2015-01-31 06:00:00", freq="W")

# Fake_date = np.random.choice(pd.date_range("2015-01-01 06:00:00", "2015-01-31 06:00:00"), len(df))
comp = comp * 4
shuffle(comp)

MachineID = np.arange(1,101)
shuffle(MachineID)
products = pd.DataFrame({
   "MachineID": MachineID,
   "comp": comp
   
})
products["Fake_date"]=np.random.choice(pd.date_range("2015-01-01 06:00:00", "2015-03-31 06:00:00"), len(products))
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   MachineID  100 non-null    int64         
 1   comp       100 non-null    object        
 2   Fake_date  100 non-null    datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 2.5+ KB


In [156]:
import numpy 
from numpy.random import default_rng
comp = [
    "comp1",
"comp2",
"comp3",
"comp4",
"comp5",
"comp6",
"comp7",
"comp8",
"comp9",
"comp10",
"comp11",
"comp12",
"comp13",
"comp14",
"comp15",
"comp16",
"comp17",
"comp18",
"comp19",
"comp20",
"comp21",
"comp22",
"comp23",
"comp24",
"comp25"]
# comp = comp * 4
comp = rng.choice(comp, size=7000)
# shuffle(comp)

# MachineID = np.arange(1,101)
# shuffle(MachineID)


rng = default_rng()
MachineID = rng.choice(100, size=7000)

maint = pd.DataFrame({
   "datetime": np.random.choice(pd.date_range("2015-01-01 06:00:00", "2015-12-31 06:00:00"), 7000),
   "machineID" : MachineID,
    "comp" : comp
   
})
maint = maint.sort_values(by='datetime')

maint.set_index('datetime', inplace=True)
maint.info()
# maint.to_csv("maint_mock.csv", sep='\t')

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7000 entries, 2015-01-01 06:00:00 to 2015-12-31 06:00:00
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   machineID  7000 non-null   int64 
 1   comp       7000 non-null   object
dtypes: int64(1), object(1)
memory usage: 164.1+ KB


In [157]:
# remove_n = 2252
# drop_indices = np.random.choice(maint.index, remove_n, replace=True)
# maint = maint.drop(drop_indices)
# maint.info()


In [158]:
removelist = ['2015-01-01 06:00:00','2015-01-03 06:00:00','2015-01-12 06:00:00','2015-01-27 06:00:00','2015-01-29 06:00:00',
              '2015-02-11 06:00:00','2015-02-13 06:00:00','2015-02-26 06:00:00',
              '2015-03-08 06:00:00','2015-03-10 06:00:00','2015-03-26 06:00:00','2015-03-30 06:00:00',
              '2015-04-12 06:00:00','2015-04-23 06:00:00','2015-04-25 06:00:00','2015-04-28 06:00:00',
              
              '2015-05-12 06:00:00','2015-05-14 06:00:00','2015-05-25 06:00:00','2015-05-26 06:00:00','2015-05-28 06:00:00','2015-05-27 06:00:00',
              '2015-06-07 06:00:00','2015-06-08 06:00:00','2015-06-09 06:00:00','2015-06-11 06:00:00','2015-06-28 06:00:00',
              '2015-07-07 06:00:00','2015-07-09 06:00:00','2015-07-11 06:00:00','2015-07-13 06:00:00','2015-07-24 06:00:00','2015-07-26 06:00:00','2015-07-28 06:00:00',
              '2015-08-12 06:00:00','2015-08-22 06:00:00','2015-08-23 06:00:00','2015-08-26 06:00:00','2015-08-27 06:00:00',
              '2015-09-07 06:00:00','2015-09-09 06:00:00','2015-09-11 06:00:00','2015-09-24 06:00:00','2015-09-25 06:00:00','2015-09-27 06:00:00',
              
              '2015-10-07 06:00:00','2015-10-09 06:00:00','2015-10-11 06:00:00','2015-10-16 06:00:00','2015-10-22 06:00:00','2015-10-26 06:00:00',
              '2015-11-05 06:00:00','2015-11-08 06:00:00','2015-11-21 06:00:00','2015-11-22 06:00:00','2015-11-23 06:00:00',
              
              '2015-12-21 06:00:00','2015-12-10 06:00:00','2015-12-08 06:00:00','2015-12-06 06:00:00']

# df[[d.date() not in pd.to_datetime(removelist) for d in df.index]]
# failures[[d.date() not in pd.to_datetime(removelist) for d in failures.index]]
mask = ~np.in1d(maint.index.date, pd.to_datetime(removelist).date)
maint = maint.loc[mask, :]
maint

Unnamed: 0_level_0,machineID,comp
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-02 06:00:00,93,comp17
2015-01-02 06:00:00,69,comp15
2015-01-02 06:00:00,88,comp14
2015-01-02 06:00:00,26,comp21
2015-01-02 06:00:00,33,comp23
...,...,...
2015-12-31 06:00:00,45,comp16
2015-12-31 06:00:00,48,comp15
2015-12-31 06:00:00,12,comp3
2015-12-31 06:00:00,97,comp6


In [161]:
maint.info()
# maint["datetime"].value_counts()
maint["machineID"].value_counts()
maint["comp"].value_counts()
maint.to_csv("maint_mock_test.csv", sep='\t')

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5858 entries, 2015-01-02 06:00:00 to 2015-12-31 06:00:00
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   machineID  5858 non-null   int64 
 1   comp       5858 non-null   object
dtypes: int64(1), object(1)
memory usage: 137.3+ KB


In [111]:
maint

Unnamed: 0,datetime,machineID,comp
4533,2015-01-01 06:00:00,39,comp15
6912,2015-01-01 06:00:00,54,comp3
6033,2015-01-01 06:00:00,22,comp9
1126,2015-01-01 06:00:00,98,comp10
6239,2015-01-01 06:00:00,30,comp22
...,...,...,...
3257,2015-12-31 06:00:00,80,comp23
5364,2015-12-31 06:00:00,5,comp2
6105,2015-12-31 06:00:00,62,comp18
3099,2015-12-31 06:00:00,30,comp22


In [112]:
maint["comp"].value_counts()

comp23    224
comp16    222
comp22    219
comp25    217
comp24    208
comp19    207
comp13    207
comp20    206
comp5     205
comp11    205
comp18    205
comp17    204
comp4     204
comp7     202
comp9     199
comp21    199
comp1     198
comp3     198
comp14    197
comp10    192
comp8     190
comp2     189
comp12    189
comp15    188
comp6     187
Name: comp, dtype: int64

In [113]:
maint.groupby(['datetime','machineID','comp'])['comp'].count().head(35)

datetime             machineID  comp  
2015-01-01 06:00:00  17         comp12    1
                     20         comp10    1
                     22         comp9     1
                     30         comp22    1
                     39         comp15    1
                     54         comp3     1
                     71         comp14    1
                     81         comp22    1
                     86         comp12    1
                     98         comp10    1
2015-01-02 06:00:00  4          comp6     1
                     6          comp19    1
                     9          comp24    1
                                comp3     1
                     19         comp24    1
                     22         comp12    1
                     31         comp5     1
                     32         comp20    1
                     35         comp23    1
                     54         comp22    1
                     68         comp10    1
                     72         comp1

In [114]:
#  maint.to_csv("maint_mock.csv", sep='\t')

In [144]:
comp = [
    "comp1",
"comp2",
"comp3",
"comp4",
"comp5",
"comp6",
"comp7",
"comp8",
"comp9",
"comp10",
"comp11",
"comp12",
"comp13",
"comp14",
"comp15",
"comp16",
"comp17",
"comp18",
"comp19",
"comp20",
"comp21",
"comp22",
"comp23",
"comp24",
"comp25"]
# comp = comp * 4
comp = rng.choice(comp, size=4000)
# shuffle(comp)

# MachineID = np.arange(1,101)
# shuffle(MachineID)


rng = default_rng()
MachineID = rng.choice(100, size=4000)

failures = pd.DataFrame({
   "datetime": np.random.choice(pd.date_range("2015-01-01 06:00:00", "2015-12-31 06:00:00"), 4000),
   "machineID" : MachineID,
    "failure" : comp
   
})
failures = failures.sort_values(by='datetime')

failures.set_index('datetime', inplace=True)


In [145]:
# remove_n = 1541
# drop_indices = np.random.choice(failures.index, remove_n, replace=True)
# failures = failures.drop(drop_indices)
failures.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4000 entries, 2015-01-01 06:00:00 to 2015-12-31 06:00:00
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   machineID  4000 non-null   int64 
 1   failure    4000 non-null   object
dtypes: int64(1), object(1)
memory usage: 93.8+ KB


In [146]:
# 

In [147]:
# removelist = ['2015-01-04', '2015-01-24' ,'2015-01-29']
removelist = ['2015-01-01 06:00:00','2015-01-03 06:00:00','2015-01-12 06:00:00','2015-01-27 06:00:00',
              '2015-02-11 06:00:00','2015-02-13 06:00:00','2015-02-26 06:00:00','2015-02-28 06:00:00',
              '2015-03-08 06:00:00','2015-03-10 06:00:00','2015-03-26 06:00:00','2015-03-30 06:00:00',
              '2015-04-12 06:00:00','2015-04-23 06:00:00','2015-04-25 06:00:00','2015-04-28 06:00:00',
              '2015-05-12 06:00:00','2015-05-14 06:00:00','2015-05-25 06:00:00','2015-05-26 06:00:00','2015-05-28 06:00:00',
              '2015-06-07 06:00:00','2015-06-08 06:00:00','2015-06-09 06:00:00','2015-06-10 06:00:00','2015-06-11 06:00:00','2015-06-28 06:00:00',
              '2015-07-07 06:00:00','2015-07-09 06:00:00','2015-07-11 06:00:00','2015-07-13 06:00:00','2015-07-24 06:00:00','2015-07-26 06:00:00','2015-07-28 06:00:00',
              '2015-08-12 06:00:00','2015-08-22 06:00:00','2015-08-23 06:00:00','2015-08-26 06:00:00','2015-08-27 06:00:00',
              '2015-09-07 06:00:00','2015-09-09 06:00:00','2015-09-11 06:00:00','2015-09-24 06:00:00','2015-09-25 06:00:00','2015-09-27 06:00:00',
              
              '2015-10-07 06:00:00','2015-10-09 06:00:00','2015-10-11 06:00:00','2015-10-16 06:00:00','2015-10-22 06:00:00','2015-10-26 06:00:00',
              '2015-11-05 06:00:00','2015-11-08 06:00:00','2015-11-21 06:00:00','2015-11-22 06:00:00','2015-11-23 06:00:00',
              
              '2015-12-21 06:00:00','2015-12-10 06:00:00','2015-12-08 06:00:00','2015-12-06 06:00:00']

# df[[d.date() not in pd.to_datetime(removelist) for d in df.index]]
# failures[[d.date() not in pd.to_datetime(removelist) for d in failures.index]]
mask = ~np.in1d(failures.index.date, pd.to_datetime(removelist).date)
failures = failures.loc[mask, :]
failures

Unnamed: 0_level_0,machineID,failure
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-02 06:00:00,95,comp14
2015-01-02 06:00:00,81,comp12
2015-01-02 06:00:00,79,comp11
2015-01-02 06:00:00,3,comp4
2015-01-02 06:00:00,80,comp24
...,...,...
2015-12-31 06:00:00,67,comp7
2015-12-31 06:00:00,41,comp23
2015-12-31 06:00:00,93,comp23
2015-12-31 06:00:00,26,comp14


In [154]:
failures.to_csv("failures_mock_test.csv", sep='\t')

In [153]:
failures.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3282 entries, 2015-01-02 06:00:00 to 2015-12-31 06:00:00
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   machineID  3282 non-null   int64 
 1   failure    3282 non-null   object
dtypes: int64(1), object(1)
memory usage: 156.9+ KB


In [99]:
failures.groupby(['datetime','machineID','failure'])['failure'].count().head(35)

datetime             machineID  failure
2015-01-01 06:00:00  54         comp13     1
                     59         comp8      1
2015-01-02 06:00:00  20         comp18     1
2015-01-03 06:00:00  40         comp11     1
2015-01-04 06:00:00  30         comp17     1
                     31         comp1      1
                     68         comp19     1
                     72         comp7      1
                     86         comp23     1
2015-01-05 06:00:00  18         comp16     1
                     21         comp12     1
                     51         comp1      1
2015-01-06 06:00:00  16         comp25     1
                     31         comp10     1
2015-01-07 06:00:00  28         comp18     1
                     64         comp21     1
2015-01-08 06:00:00  48         comp5      1
                     66         comp22     1
                     98         comp4      1
2015-01-09 06:00:00  24         comp9      1
                     26         comp16     1
               