In [1]:
import warnings
from tqdm import tqdm
warnings.filterwarnings('ignore')

In [2]:
from scipy.optimize import leastsq
from scipy.optimize import least_squares
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from pathlib import Path
import itertools
import sys
from sklearn.preprocessing import MinMaxScaler
import statistics
from IPython.display import clear_output
import time
import pickle
from scipy import spatial
from scipy.signal import find_peaks

In [75]:
plt.rcParams["figure.figsize"] = (20,20)

In [3]:
#load igor clusters file
clustered_techs_dict={}
with open('clustered_techs_dict.pickle', 'rb') as handle:
    clustered_techs_dict = pickle.load(handle)

In [5]:
#create a reverse dict, for each tech list possible anchors
techs_dict={}
for key,val in clustered_techs_dict.items():
    for tech in val:
        if tech not in techs_dict:
            techs_dict[tech]={}
        for anchor in key[0]:
            if anchor not in techs_dict[tech]:
                techs_dict[tech][anchor]=0
            techs_dict[tech][anchor]=techs_dict[tech][anchor]+key[1]
        techs_dict[tech]=techs_dict[tech]

In [76]:
#test, plot anchores histogram
lst=[]
for key,val in techs_dict.items():
#     print(key)
    lst=lst+list(val.keys())
plt.hist(lst, len(list(set(lst))))

In [84]:
filepath="./features/"
scaler = MinMaxScaler()

anchors=['Lithium ion battery',
 'Cryptocurrency',
 '5g',
 'Open source model',
 'Facial recognition',
 'Cloud computing',
 'Streaming television',
 'e-Reader',
 'Capsule endoscopy',
 'Quadcopter',
 'Electronic cigarette',
 'Miniaturized satellite',
 'Electric vehicle']
anchors = [x.lower() for x in anchors]

In [89]:
#remove selected anchors
for item in [
    'streaming television', 
             'miniaturized satellite',
             "Open source model",
            "Electronic cigarette",
            "e-Reader",
            "Facial recognition"]:
    anchors.remove(item)

ValueError: list.remove(x): x not in list

In [45]:
#features smoothing loops
#for each remporal feature set the X axis as montly date from 1.1.10 to 1.11.21

f_lst={}
idx = pd.date_range('01-01-2010', '11-1-2021', freq='M')+ pd.DateOffset(1)
files = Path(filepath).rglob("*.csv")
for file in list(files)[:]:
    df=pd.read_csv(file,index_col=0,header=0).T
    df.iloc[:,:]=scaler.fit_transform(df.iloc[:,:])
    df.index = pd.DatetimeIndex(df.index)
    df = df.reindex(idx, fill_value=0)
    df=df.T
    df.index=df.index.str.lower()
    anc_len=len(list(set(anchors)-set(list(df.index))))
    if anc_len<1:
        print(file.name[:-4],df.shape)
        f_lst[file.name[:-4]]=df.copy()
    else:
        pass
#         print("csv error:",anc_len,file.name[:-4],df.shape,)


files = Path(filepath).rglob("*.xlsx")    
for file in list(files)[:]:
    
    df=pd.read_excel(file,index_col=0).T
    
    df.iloc[:,:]=scaler.fit_transform(df.iloc[:,:])
    df.index = pd.DatetimeIndex(df.index)
    df = df.reindex(idx, fill_value=0)
    df=df.T
    df.index=df.index.str.lower()
    anc_len=len(list(set(anchors)-set(list(df.index))))
    if anc_len<1:
        print(file.name[:-4],df.shape)
        f_lst[file.name[:-4]]=df.copy()
    else:
        print("excel error:",anc_len,file.name[:-4],df.shape,)


wiki_edits_monthly_count (419, 142)
wiki_edits_monthly_max_size (419, 142)
news_R&D (226, 142)
scientific_emergence (185, 142)
news_China (231, 142)
news_India (204, 142)
news_Israel (110, 142)
All_14112021210257. (430, 142)
Business & Industrial_14112021210433. (428, 142)
Finance_14112021210413. (411, 142)
Law & Government_14112021210335. (422, 142)
News_14112021210354. (415, 142)
No_publications_26102021212111. (440, 142)
Science_14112021210316. (428, 142)
wiki_views. (351, 142)
word_freq_techs_table. (430, 142)


In [10]:
#remove china news due to bias found on spark databases
del f_lst["news_China"]

In [46]:
list(f_lst.keys())

['wiki_edits_monthly_count',
 'wiki_edits_monthly_max_size',
 'news_R&D',
 'scientific_emergence',
 'news_China',
 'news_India',
 'news_Israel',
 'All_14112021210257.',
 'Business & Industrial_14112021210433.',
 'Finance_14112021210413.',
 'Law & Government_14112021210335.',
 'News_14112021210354.',
 'No_publications_26102021212111.',
 'Science_14112021210316.',
 'wiki_views.',
 'word_freq_techs_table.']

In [48]:
def plot_bass(sales,t,name,plot=True,overfit=False,graph=False):
    c_sales=np.cumsum(sales)
    # initial variables(M, P & Q)
    m=sales.sum()
    #[P,Q,M]
    vars = [0.003,0.038,m]

    # residual (error) function
    
    def residual(vars,m, t, sales):
        
        M=m
        if overfit:
            M = vars[2]
        P = vars[0]
        Q = vars[1]
        Bass = M * (((P+Q)**2/P)*np.exp(-(P+Q)*t))/(1+(Q/P)*np.exp(-(P+Q)*t))**2 
        return (Bass - (sales))

    # non linear least square fitting
    #varfinal,success
    if overfit:
        res= least_squares(residual, vars,
#                            bounds=([0.001, 0.01,0], [0.01,0.1,np.inf]), 
                           args=(m,t, sales))
    else:
        res= least_squares(residual, vars,
                           bounds=([0.001, 0.01,0], [0.01,0.1,np.inf]), 
                           args=(m,t, sales))
    

    # estimated coefficients
    
    p = res.x[0]
    q = res.x[1]
    
    m=m
    if overfit:
        m = res.x[2]
    factor=10
    if plot:
        print(residual([p,q,m],m, t, sales).mean())
        print(f"m:{m} \np:{p} \nq:{q} \n")
    
#     print(res)
#     sys.exit()
    
    
    #sales plot (pdf)
    #time interpolation
    if plot:
        plt.subplot(1, 2, 1)
        tp=(np.linspace(1.0, sales.shape[0]*factor, num=sales.shape[0]*factor))/factor
        cofactor= np.exp(-(p+q) * tp)
        sales_pdf= m* (((p+q)**2/p)*cofactor)/(1+(q/p)*cofactor)**2
        plt.plot(tp, sales_pdf,t,sales)
        plt.title(f'{name} pdf')
        plt.legend(['Fit', 'True'])


        # Cumulative sales (cdf)
        plt.subplot(1, 2, 2)
        sales_cdf= m*(1-cofactor)/(1+(q/p)*cofactor)
        plt.plot(tp, sales_cdf,t,c_sales)
        plt.title('Sales cdf')
        plt.legend(['Fit', 'True'])

        plt.show()
    if graph:
        return sales_pdf,res.x,res.cost
    else:
        return res.cost

In [49]:
#create every possible features subset with a max kength of 4
subset_lst=[]
for L in range(1,4):# len(list(f_lst.keys()))+1):
    for subset in itertools.combinations(list(f_lst.keys()), L):
        subset_lst.append(list(subset))
subset_lst

[['wiki_edits_monthly_count'],
 ['wiki_edits_monthly_max_size'],
 ['news_R&D'],
 ['scientific_emergence'],
 ['news_China'],
 ['news_India'],
 ['news_Israel'],
 ['All_14112021210257.'],
 ['Business & Industrial_14112021210433.'],
 ['Finance_14112021210413.'],
 ['Law & Government_14112021210335.'],
 ['News_14112021210354.'],
 ['No_publications_26102021212111.'],
 ['Science_14112021210316.'],
 ['wiki_views.'],
 ['word_freq_techs_table.'],
 ['wiki_edits_monthly_count', 'wiki_edits_monthly_max_size'],
 ['wiki_edits_monthly_count', 'news_R&D'],
 ['wiki_edits_monthly_count', 'scientific_emergence'],
 ['wiki_edits_monthly_count', 'news_China'],
 ['wiki_edits_monthly_count', 'news_India'],
 ['wiki_edits_monthly_count', 'news_Israel'],
 ['wiki_edits_monthly_count', 'All_14112021210257.'],
 ['wiki_edits_monthly_count', 'Business & Industrial_14112021210433.'],
 ['wiki_edits_monthly_count', 'Finance_14112021210413.'],
 ['wiki_edits_monthly_count', 'Law & Government_14112021210335.'],
 ['wiki_edits

In [50]:
#collect and sort every possible anchor subset from the clustering dict

anchor_subset=[]
for val in techs_dict.values():
    anchor_subset.append(list(val.keys()))

anchor_subset.sort()
anchor_subset=list(anchor_subset for anchor_subset,_ in itertools.groupby(anchor_subset))
len(anchor_subset)

239

In [52]:
#for each anchor/features subset pairs calculate the features with best possible score
#to avoid overfit we filter by the ratio of the normal score to the overfit score


%%time
errors=[]
cost_list=[]
N=60
j=0
top_n=10

start=time.time()
j+=1
for anchor_list in tqdm(anchor_subset):
    for subset in subset_lst:
        cost=[]
        of_cost=[]
        for name in anchor_list:

            sales=[]
            skip_subset=False
            for i in subset:
                df=f_lst[i]
                if df.loc[df.index.str.lower()==name.lower(),:].shape[0]>0:
                    sales.append(df.loc[df.index.str.lower()==name.lower(),:].values[0])
                else:
                    skip_subset=True
                    break
            if skip_subset:
                break

            sales=np.add.reduce(sales)
            if sales.sum()>0:
                sales=np.trim_zeros(sales, 'f').astype(float)
                sales=np.convolve(sales, np.ones(N)/N, mode='valid')
                t= np.linspace(1.0, sales.shape[0], num=sales.shape[0])
                cost.append(plot_bass(sales,t,name,plot=False))
                of_cost.append(plot_bass(sales,t,name,plot=False,overfit=True))

            else:
                errors.append(f"subset:{subset} name:{name}")
                cost.append(10.0)
                of_cost.append(10.0)
        if cost:
            cost_list.append({"anchor_list":anchor_list,"subset":subset,"length":len(subset),"cost":statistics.mean(cost),"overfit cost":statistics.mean(of_cost)}) 

100%|█████████████████████████████████████████████████████████████████████████████| 239/239 [6:44:49<00:00, 101.63s/it]

Wall time: 6h 44min 49s





In [53]:
top_n=50

In [54]:
#sort and take top 50 results for each feature subset
df_metric=pd.DataFrame(cost_list).sort_values(by=["cost","length"])
df_metric["ratio"]=df_metric["overfit cost"]/df_metric["cost"]
pd.set_option('display.max_colwidth', 0)
conds=((df_metric["ratio"]>0.0) & (df_metric["length"]>1))
df_temp_metric=df_metric.loc[conds]
df_temp_metric

Unnamed: 0,anchor_list,subset,length,cost,overfit cost,ratio
14633,[capsule endoscopy],"[wiki_edits_monthly_count, news_R&D]",2,0.000004,0.000001,0.345481
14634,[capsule endoscopy],"[wiki_edits_monthly_count, scientific_emergence]",2,0.000004,0.000001,0.345481
14635,[capsule endoscopy],"[wiki_edits_monthly_count, news_China]",2,0.000004,0.000001,0.345481
14636,[capsule endoscopy],"[wiki_edits_monthly_count, news_India]",2,0.000004,0.000001,0.345481
14637,[capsule endoscopy],"[wiki_edits_monthly_count, news_Israel]",2,0.000004,0.000001,0.345481
...,...,...,...,...,...,...
149414,"[streaming television, reusable launch system, miniaturized satellite]","[News_14112021210354., Science_14112021210316., word_freq_techs_table.]",3,10.000000,10.000000,1.000000
149415,"[streaming television, reusable launch system, miniaturized satellite]","[News_14112021210354., wiki_views., word_freq_techs_table.]",3,10.000000,10.000000,1.000000
149417,"[streaming television, reusable launch system, miniaturized satellite]","[No_publications_26102021212111., Science_14112021210316., word_freq_techs_table.]",3,10.000000,10.000000,1.000000
149418,"[streaming television, reusable launch system, miniaturized satellite]","[No_publications_26102021212111., wiki_views., word_freq_techs_table.]",3,10.000000,10.000000,1.000000


In [56]:
exclude_features=[
    "news_R&D" ,
"scientific_emergence" ,
"news_China" ,
"news_India" ,
"news_Israel" ,
]

In [None]:
df_list=[]

In [59]:

# for anchor_list in tqdm(anchor_subset):
#     df_temp=df_temp_metric[df_temp_metric["anchor_list"].astype(str)==str(anchor_list)].head(10)
#     for row in df_temp_metric.iterrows():
#         if len(set(row[1]["subset"]).intersection(exclude_features))==0:
#             df_list.append(row[1].to_frame().T)
#             break

In [60]:
for anchor_list in tqdm(anchor_subset):
    df_list.append(df_temp_metric[df_temp_metric["anchor_list"].astype(str)==str(anchor_list)].head(5))

100%|████████████████████████████████████████████████████████████████████████████████| 239/239 [00:29<00:00,  8.08it/s]


In [61]:
df_temp_metric_ver_2=pd.concat(df_list).reset_index()
df_temp_metric_ver_2

Unnamed: 0,index,anchor_list,subset,length,cost,overfit cost,ratio
0,17,[5g],"[wiki_edits_monthly_count, news_R&D]",2,0.000199,0.000058,0.289277
1,18,[5g],"[wiki_edits_monthly_count, scientific_emergence]",2,0.000199,0.000058,0.289277
2,19,[5g],"[wiki_edits_monthly_count, news_China]",2,0.000199,0.000058,0.289277
3,20,[5g],"[wiki_edits_monthly_count, news_India]",2,0.000199,0.000058,0.289277
4,21,[5g],"[wiki_edits_monthly_count, news_Israel]",2,0.000199,0.000058,0.289277
...,...,...,...,...,...,...,...
1190,149135,"[streaming television, reusable launch system, miniaturized satellite]","[wiki_edits_monthly_count, news_China]",2,0.000049,0.000010,0.205252
1191,149145,"[streaming television, reusable launch system, miniaturized satellite]","[wiki_edits_monthly_max_size, news_China]",2,0.007977,0.000067,0.008461
1192,149134,"[streaming television, reusable launch system, miniaturized satellite]","[wiki_edits_monthly_count, wiki_edits_monthly_max_size]",2,0.009172,0.000590,0.064291
1193,149200,"[streaming television, reusable launch system, miniaturized satellite]","[wiki_edits_monthly_count, wiki_edits_monthly_max_size, news_China]",3,0.009192,0.000061,0.006603


In [62]:
subsets=df_metric.loc[conds,"subset"].head(top_n).values

In [63]:
step=max(df_metric.loc[conds,"length"].head(top_n).values)
step

3

In [65]:
#manualy define the resolution for each feature step
step=4
step=1/step
start=0+step
stop=1+step
weights=np.arange (start, stop, step)
weights

array([0.25, 0.5 , 0.75, 1.  ])

In [68]:
#for each anchor/features subset pairs calculate the best weigth for each feature from the given feature subset
#to avoid overfit we filter by the ratio of the normal score to the overfit score

%%time
cost_weighted_list=[]
j=0
for row in df_temp_metric_ver_2.iterrows():
    print(row[0])
    j+=1
    subset=row[1]["subset"]
    for weight in itertools.permutations(weights, len(subset)):
        cost=[]
        of_cost=[]
        for name in row[1]["anchor_list"]:
            sales=[]
            j=0
            for i in subset:
                df=f_lst[i]
#                 print(df.loc[df.index.str.lower()==name.lower(),:].shape)
                if df.loc[df.index.str.lower()==name.lower(),:].shape[0]>0:
                    sales.append(df.loc[df.index.str.lower()==name.lower(),:].values[0]*weight[j])
                    j+=1
            sales=np.add.reduce(sales)
            if sales.sum()>0:
                sales=np.trim_zeros(sales, 'f').astype(float)
                sales=np.convolve(sales, np.ones(N)/N, mode='valid')
                t= np.linspace(1.0, sales.shape[0], num=sales.shape[0])
                cost.append(plot_bass(sales,t,name,plot=False))
                of_cost.append(plot_bass(sales,t,name,plot=False,overfit=True))
            else:
                errors.append(f"subset:{subset} name:{name}")
                cost.append(10.0)
                of_cost.append(10.0)
        if cost:
            cost_weighted_list.append({"anchor_list":row[1]["anchor_list"],"subset":subset,"cost":statistics.median(cost),"overfit cost":statistics.median(of_cost),"weights":weight,"length":len(subset)})

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [69]:
#sort dataframe and save
df_w_metric=pd.DataFrame(cost_weighted_list).sort_values(by=["cost","length"])
df_w_metric["ratio"]=df_w_metric["overfit cost"]/df_w_metric["cost"]
pd.set_option('display.max_colwidth', 0)
# conds=(df_w_metric["ratio"]>0.2) & (df_w_metric["length"]>1)
df_w_metric

Unnamed: 0,anchor_list,subset,cost,overfit cost,weights,length,ratio
1380,[capsule endoscopy],"[wiki_edits_monthly_count, news_R&D]",2.207747e-07,7.626780e-08,"(0.25, 0.5)",2,0.345455
1381,[capsule endoscopy],"[wiki_edits_monthly_count, news_R&D]",2.207747e-07,7.626780e-08,"(0.25, 0.75)",2,0.345455
1382,[capsule endoscopy],"[wiki_edits_monthly_count, news_R&D]",2.207747e-07,7.626780e-08,"(0.25, 1.0)",2,0.345455
1392,[capsule endoscopy],"[wiki_edits_monthly_count, scientific_emergence]",2.207747e-07,7.626780e-08,"(0.25, 0.5)",2,0.345455
1393,[capsule endoscopy],"[wiki_edits_monthly_count, scientific_emergence]",2.207747e-07,7.626780e-08,"(0.25, 0.75)",2,0.345455
...,...,...,...,...,...,...,...
10503,"[open source model, miniaturized satellite, electric vehicle, cloud computing, electronic cigarette]","[news_R&D, news_China, Finance_14112021210413.]",3.358826e-01,4.735256e-03,"(0.25, 0.75, 1.0)",3,0.014098
10507,"[open source model, miniaturized satellite, electric vehicle, cloud computing, electronic cigarette]","[news_R&D, news_China, Finance_14112021210413.]",3.358826e-01,4.735256e-03,"(0.5, 0.25, 1.0)",3,0.014098
10509,"[open source model, miniaturized satellite, electric vehicle, cloud computing, electronic cigarette]","[news_R&D, news_China, Finance_14112021210413.]",3.358826e-01,4.735256e-03,"(0.5, 0.75, 1.0)",3,0.014098
10513,"[open source model, miniaturized satellite, electric vehicle, cloud computing, electronic cigarette]","[news_R&D, news_China, Finance_14112021210413.]",3.358826e-01,4.735256e-03,"(0.75, 0.25, 1.0)",3,0.014098


In [70]:
with open('weighted_features_10t.pickle', 'wb') as handle:
    pickle.dump(df_w_metric, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [None]:
#saving with diffrent sorting parameters, can be ignored

In [None]:
df_w_metric_v2=pd.concat([cost_frame,df_w_metric])
df_w_metric_v2=df_w_metric_v2.sort_values(by=["cost","ratio","length"],ascending=[True,False,False])
df_w_metric_v2['anchor_list_tup']  = df_w_metric_v2['anchor_list'].apply(lambda x : tuple(x) if type(x) is list else x)
df_w_metric_v2['subset_tup']  = df_w_metric_v2['subset'].apply(lambda x : tuple(x) if type(x) is list else x)
df_w_metric_v2=df_w_metric_v2.drop_duplicates(["anchor_list_tup","subset_tup","length"])

In [None]:
df_list=[]

In [None]:
df1=df_w_metric.sort_values(by=["cost","ratio","length"],ascending=[True,False,False])
df1['anchor_list_tup']  = df1['anchor_list'].apply(lambda x : tuple(x) if type(x) is list else x)
df1['subset_tup']  = df1['subset'].apply(lambda x : tuple(x) if type(x) is list else x)
df1=df1.drop_duplicates(["anchor_list_tup","subset_tup","length"])
df1=df1.sort_values(by=["cost","ratio","length"],ascending=[True,False,False])
for anchor_list in tqdm(anchor_subset):
    df_list.append(df1[df1["anchor_list"].astype(str)==str(anchor_list)])

In [None]:
df1=cost_frame.sort_values(by=["cost","ratio","length"],ascending=[True,False,False])
df1['anchor_list_tup']  = df1['anchor_list'].apply(lambda x : tuple(x) if type(x) is list else x)
df1['subset_tup']  = df1['subset'].apply(lambda x : tuple(x) if type(x) is list else x)
df1=df1.drop_duplicates(["anchor_list_tup","subset_tup","length"])
df1=df1.sort_values(by=["cost","ratio","length"],ascending=[True,False,False])
for anchor_list in tqdm(anchor_subset):
    df_list.append(df1[df1["anchor_list"].astype(str)==str(anchor_list)])

In [None]:
df_w_metric_v2=pd.concat(df_list)
df_w_metric_v2=df_w_metric_v2.sort_values(by=["cost","ratio","length"],ascending=[True,False,False])
# df_w_metric_v2['anchor_list']  = df_w_metric_v2['anchor_list'].apply(lambda x : tuple(x) if type(x) is list else x)
# df_w_metric_v2['subset']  = df_w_metric_v2['subset'].apply(lambda x : tuple(x) if type(x) is list else x)
# df_w_metric_v2.drop_duplicates(["anchor_list","subset","length"])
df_w_metric_v2

In [None]:
with open('weighted_features_2.pickle', 'wb') as handle:
    pickle.dump(df_w_metric_v2, handle, protocol=pickle.HIGHEST_PROTOCOL)

## 

In [None]:
cost_frame={}
with open('weighted_features_wit_spark.pickle', 'rb') as handle:
    cost_frame = pickle.load(handle)

In [None]:
cost_frame

In [None]:
%%time
errors=[]
# subsets=
cost_weighted_list=[]
#     try:
N=60

ii=0


subset=df_w_metric.loc[conds,"subset"].values[ii]
weight=df_w_metric.loc[conds,"weights"].values[ii]

print(subset)
print(weight)
cost=[]
of_cost=[]
final_bass=0
pqm=[]
err=100
for name in anchors:
    sales=[]
    j=0
    for w,s in zip(weight,subset):
        df=f_lst[s]
        sales.append(df.loc[df.index.str.lower()==name.lower(),:].values[0]*w)
        j+=1
#             sales.append(df.loc[df.index.str.lower()==name.lower(),:].values[0])
    sales=np.add.reduce(sales)
    if sales.sum()>0:
        sales=np.trim_zeros(sales, 'f').astype(float)
        sales=np.convolve(sales, np.ones(N)/N, mode='valid')
        t= np.linspace(1.0, sales.shape[0], num=sales.shape[0])
        temp_final_bass,temp_pqm,temp_err=plot_bass(sales,t,name,graph=True)
        print(temp_final_bass.shape)
        peaks, _ = find_peaks(temp_final_bass, height=0)
        if peaks:
            if temp_err<err:
                final_bass=temp_final_bass
                err=temp_err

    else:
        errors.append(f"subset:{subset} name:{name}")

In [None]:
g_anchor_bass=scaler.fit_transform(final_bass.reshape(-1, 1))
g_anchor_bass=(g_anchor_bass-g_anchor_bass[1][0])/(1-g_anchor_bass[1][0])


In [None]:
tech_name="hydroponics"

In [None]:
g_tech=[]
for w,s in zip(weight,subset):
        df=f_lst[s]
        print()
        g_tech.append(df.loc[df.index.str.lower()==tech_name.lower(),:].values[0]*w)
        j+=1
g_tech=np.add.reduce(g_tech)
g_tech=np.trim_zeros(g_tech, 'f').astype(float)
g_tech=np.convolve(g_tech, np.ones(N)/N, mode='valid')
t= np.linspace(1.0, g_tech.shape[0], num=g_tech.shape[0])
plt.plot(g_tech)
g_tech=scaler.fit_transform(g_tech.reshape(-1, 1))

In [None]:
test_case=[
    "artificial brain", #mid diffusion
    "asteroid mining",#* good fit low diffusion
    "cortical implant", #lack of inforamtion
    "neural networks",#problem with anchors
    "dna computing", #**right time wrong shape
    "hydroponics", #* 
    "smart grid", #already diffused
    "gas generator", #* 66% diffused
    "medicinal fungi", #* right answer
    "inflatable space habitat", #diffused
    "carbon nanotube",#diffused
    "holography", #misift
    "electronic nose"
]

tech_name="gas generator"
print(techs_dict[tech_name])
# f_lst["wiki_views."].loc[tech_name].plot()

In [None]:
fit_x=0
fit_y=0
diff=0.1
steps=np.arange(0.5,1+diff,diff)
sim=np.inf

for i in tqdm(range(12,len(g_anchor_bass)-len(g_tech))):
    diff=0.1
    start=0.3
    
    low_fit_flag=True
    while low_fit_flag:
        
        steps=np.arange(start,1+diff,diff)
        for step in steps:
            a=g_tech[:,0].copy()
            a=np.interp(np.arange(0, len(a), len(a)/(len(a)+i)), np.arange(0, len(a)), a)
            a=a*step
            result = np.mean(np.square(a - g_anchor_bass[:len(a),0]))
            if sim>result:
                fit_x=i
                fit_y=step
                sim=result
        if fit_y>(start+diff*2):
            low_fit_flag=False
        else:
            start=start+diff
        if start>1:
            break
print(fit_x,fit_y)
a=g_tech[:,0].copy()*fit_y
a=np.interp(np.arange(0, len(a), len(a)/(len(a)+fit_x)), np.arange(0, len(a)), a)

In [None]:
res=1 - spatial.distance.cosine(a , g_anchor_bass[:len(a),0])
print(f"Tech: {tech_name} ,MSE:{sim} Cosine similarity: {res}")
plt.plot(g_anchor_bass)
plt.plot(a)

In [None]:
res=1 - spatial.distance.cosine(a , g_anchor_bass[:len(a),0])
print(f"Tech: {tech_name} ,MSE:{sim} Cosine similarity: {res}")
plt.plot(g_anchor_bass)
plt.plot(a)

In [None]:
res=1 - spatial.distance.cosine(a , g_anchor_bass[:len(a),0])
print(f"Tech: {tech_name} ,MSE:{sim} Cosine similarity: {res}")
plt.plot(g_anchor_bass)
plt.plot(a)

In [None]:
res=1 - spatial.distance.cosine(a , g_anchor_bass[:len(a),0])
print(f"Tech: {tech_name} ,MSE:{sim} Cosine similarity: {res}")
plt.plot(g_anchor_bass)
plt.plot(a)

In [None]:
res=1 - spatial.distance.cosine(a , g_anchor_bass[:len(a),0])
print(f"Tech: {tech_name} ,MSE:{sim} Cosine similarity: {res}")
plt.plot(g_anchor_bass)
plt.plot(a)

In [None]:
res=1 - spatial.distance.cosine(a , g_anchor_bass[:len(a),0])
print(f"Tech: {tech_name} ,MSE:{sim} Cosine similarity: {res}")
plt.plot(g_anchor_bass)
plt.plot(a)

In [None]:
tech_name