### This 

In [14]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression

In [15]:
encoded_file = '../processed_data/training.pkl'

### Reading training data

In [16]:
#Reading training file

data = pd.read_pickle(encoded_file)
data.head()

Unnamed: 0,GID,1A_1145442,1A_1158042,1A_1158055,1A_1230946,1A_1232964,1A_1236254,1A_1238074,1A_1238114,1A_1588248,...,UN_473655057,UN_473655073,UN_474138840,UN_474138845,UN_474138866,UN_474767314,UN_475092207,UN_475092295,UN_476929267,UN_476929292
0,GID6680533,-1,-1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,GID6681889,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,0,1,1,1,1
4,GID7401114,0,-1,1,1,1,1,1,1,1,...,1,1,0,0,1,-1,1,1,1,1
6,GID6415858,1,1,1,1,1,1,1,1,1,...,1,1,0,0,1,1,1,1,1,1
7,GID7629157,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [17]:
data.shape

(2199, 40768)

### Mapping GID with CID and SID to integrate env

In [18]:
# reading all gid and cid

base_dir = '../gid_cid_map/'
all_gid_cid = pd.DataFrame()
with os.scandir(base_dir) as prime_dir:
    for entry in prime_dir:
        x = pd.read_csv(entry)
        all_gid_cid = all_gid_cid.append(x, ignore_index = True)
all_gid_cid['GID'] = 'GID' + all_gid_cid['GID'].astype('str')

In [19]:
all_gid_cid.head()

Unnamed: 0,CID,SID,GID
0,61665,1,GID304660
1,525424,84,GID6341870
2,557028,13,GID6933502
3,568630,33,GID7400769
4,583298,79,GID8048669


In [20]:
#mergin gid, cid and sid with genotype data

data_combined = all_gid_cid.merge(data, how='inner', on='GID')

In [21]:
data_combined.drop_duplicates(inplace=True)

In [22]:
data_combined.head()

Unnamed: 0,CID,SID,GID,1A_1145442,1A_1158042,1A_1158055,1A_1230946,1A_1232964,1A_1236254,1A_1238074,...,UN_473655057,UN_473655073,UN_474138840,UN_474138845,UN_474138866,UN_474767314,UN_475092207,UN_475092295,UN_476929267,UN_476929292
0,61665,1,GID304660,1,1,1,1,1,1,1,...,1,1,0,1,1,0,1,1,1,1
255,525424,84,GID6341870,1,1,1,1,1,1,1,...,1,1,1,1,1,1,0,0,1,1
263,583298,79,GID8048669,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
266,584975,34,GID8049754,1,1,1,1,1,1,1,...,1,1,1,1,1,-1,0,0,1,1
268,585095,93,GID8050267,1,1,1,1,1,1,1,...,1,1,1,1,1,-1,-1,-1,1,1


In [23]:
data_combined.shape

(2199, 40770)

### Reading yield data for all locations

In [24]:


base_dir = '../wheat_all/done/'
yield_data = pd.DataFrame()
with os.scandir(base_dir) as prime_dir:
    for entry in prime_dir:
        trial_name = entry.name
        yield_file = base_dir + trial_name + '/GrnYld.xls'
        x = pd.read_csv(yield_file, delimiter='\t')
        x['trial'] = trial_name[2:]
        yield_data = yield_data.append(x, ignore_index = True)


In [25]:
yield_data.shape

(229357, 26)

In [26]:
def is_float(x):
    try:
        float(x)
    except ValueError:
        return False
    return True


In [27]:
yield_data = yield_data[yield_data.Value.apply(lambda x: is_float(x))]

In [28]:
yield_data.shape

(204742, 26)

In [29]:
yield_data.head()

Unnamed: 0,Trial name,Occ,Loc_no,Country,Loc_desc,Cycle,Cid,Sid,Gen_name,Trait_no,...,GID,Plot,Entry,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23
0,17TH HIGH RAINFALL WHEAT YT,1,51009,ARGENTINA,PARANA,2009,61665,1,LOCAL CHECK,144,...,,,,,,,,,,
1,17TH HIGH RAINFALL WHEAT YT,1,51009,ARGENTINA,PARANA,2009,7414,205,PROINTA FEDERAL,144,...,,,,,,,,,,
2,17TH HIGH RAINFALL WHEAT YT,1,51009,ARGENTINA,PARANA,2009,88241,1,KLEIN CACIQUE,144,...,,,,,,,,,,
3,17TH HIGH RAINFALL WHEAT YT,1,51009,ARGENTINA,PARANA,2009,444913,1,FUNDACEP 30,144,...,,,,,,,,,,
4,17TH HIGH RAINFALL WHEAT YT,1,51009,ARGENTINA,PARANA,2009,7896,410,HUAYUN INIA,144,...,,,,,,,,,,


In [30]:
yield_data['trial']

0         HRWYT
1         HRWYT
2         HRWYT
3         HRWYT
4         HRWYT
          ...  
229252    IBWSN
229253    IBWSN
229254    IBWSN
229255    IBWSN
229256    IBWSN
Name: trial, Length: 204742, dtype: object

In [31]:
yield_data.shape

(204742, 26)

In [32]:
yield_data.columns

Index(['Trial name', 'Occ', 'Loc_no', 'Country', 'Loc_desc', 'Cycle', 'Cid',
       'Sid', 'Gen_name', 'Trait_no', 'Trait name', 'Value', 'EMS', 'SE',
       'Unit', 'trial', 'GID', 'Plot', 'Entry', 'Unnamed: 17', 'Unnamed: 18',
       'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [33]:
yield_data['Cycle']

0         2009
1         2009
2         2009
3         2009
4         2009
          ... 
229252    2015
229253    2015
229254    2015
229255    2015
229256    2015
Name: Cycle, Length: 204742, dtype: int64

In [34]:
def calc_average(x):
    x = np.array(x, dtype=float)
    avg = np.average(x)
    
    return avg

### Rearranging yield data by trial, cid, sid, location and cycle

In [35]:


yield_data_groupby = yield_data.groupby(['Cid', 'Sid', 'Loc_no', 'trial', 'Cycle']).agg({
    'Value': lambda x: calc_average(x)
})

In [36]:
yield_data_groupby.reset_index(inplace=True)

In [37]:
yield_data_groupby.head()

Unnamed: 0,Cid,Sid,Loc_no,trial,Cycle,Value
0,1382,1,10002,IBWSN,2004,3.2
1,1382,1,10026,IBWSN,2003,7.41
2,1382,1,10029,IBWSN,2004,4.773
3,1382,1,10032,IBWSN,2004,2.529
4,1382,1,11011,IBWSN,2005,1.864


In [38]:
yield_data_groupby.shape

(191804, 6)

### Mapping GID, CID and SID with environment specific yield by location and cycle

In [39]:
locations = yield_data_groupby['Loc_no'].unique()
cycles = yield_data_groupby['Cycle'].unique()

In [40]:
print(len(locations))
print(len(cycles))

390
17


In [41]:
yield_by_env = pd.DataFrame()

gid_cid = data_combined.loc[:, ['GID', 'CID', 'SID']]
print(gid_cid.shape)

(2199, 3)


In [42]:
for index, row in gid_cid.iterrows():
    d = {}
    d['GID'] = str(row['GID'])
    d['CID'] = str(row['CID'])
    d['SID'] = str(row['SID'])
  
    specific_loc_yield = yield_data_groupby[
        np.logical_and(yield_data_groupby['Cid'] == row['CID'],
                       yield_data_groupby['Sid'] == row['SID'])]
    for ind, y_row in specific_loc_yield.iterrows():
        d[ y_row['trial'] + '_' + str(y_row['Loc_no']) + '_' + str(y_row['Cycle'])] = y_row['Value']
    yield_by_env = yield_by_env.append(d, ignore_index=True)
    

In [43]:
yield_by_env.head()

Unnamed: 0,CID,ESWYT_10002_2015,ESWYT_10009_2012,ESWYT_10026_2008,ESWYT_10026_2009,ESWYT_10026_2010,ESWYT_10026_2013,ESWYT_10026_2014,ESWYT_10026_2015,ESWYT_10401_2014,...,IBWSN_22607_2003,IBWSN_19303_2003,IBWSN_50201_2003,IBWSN_65301_2012,IBWSN_65301_2005,IBWSN_65307_2015,HRWYT_51501_2016,HRWYT_21220_2017,HRWYT_42221_2017,IBWSN_65307_2016
0,61665,3.568,12.579,4.561,5.833,5.1,6.581,5.393,5.145,3.73,...,,,,,,,,,,
1,525424,,,,,,6.348,,,,...,,,,,,,,,,
2,583298,,,,,,,,,,...,,,,,,,,,,
3,584975,,,,,,,,,,...,,,,,,,,,,
4,585095,,,,,,,,,,...,,,,,,,,,,


In [44]:
yield_by_env['avg'] = yield_by_env.drop(['CID', 'SID', 'GID'], axis = 1).mean(axis=1)

In [45]:
yield_by_env['avg']

0       5.010275
1       5.000011
2       5.020649
3       4.687880
4       4.565360
          ...   
2194    4.909309
2195    5.644088
2196    5.509803
2197    5.362700
2198    5.718943
Name: avg, Length: 2199, dtype: float64

In [46]:
weather_df = pd.read_csv('../processed_data/env_cluster.csv')
weather_df.head()

In [47]:
weather_df = weather_df['location', 'cluster']

Unnamed: 0,location,0,1,2,3,4,5,6,7,8,...,16,17,18,19,20,21,22,23,24,cluster
0,10001,17,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10002,0,0,0,0,0,0,0,0,0,...,0,0,0,0,4,0,0,0,0,9
2,10004,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,11
3,10006,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,11
4,10007,0,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,11


In [None]:
yield_by_env = yield_by_env.merge(weather_df, how='inner', left_on=)

In [266]:
yield_by_env.shape

(2199, 1835)

### Finding features on average phenotype (Global Features)

In [267]:
data_combined['GID'] = data_combined['GID'].astype('str')
data_combined['CID'] = data_combined['CID'].astype('str')
data_combined['SID'] = data_combined['SID'].astype('str')

In [268]:
gid_cid = yield_by_env[['GID', 'CID', 'SID']]
data_combined = gid_cid.merge(data_combined, how='inner', on=['GID', 'CID', 'SID'])

In [269]:
data_combined.shape

(2199, 40770)

In [270]:
yield_by_env.shape

(2199, 1835)

In [271]:
mi_global = mutual_info_regression(data_combined.drop(['GID', 'CID', 'SID'], axis=1), yield_by_env['avg'])

In [272]:
# entry_greater_than_50 = 0
# for loc in locations:
#     specific_loc_yield = yield_data_groupby[yield_data_groupby['Loc_no'] == loc]
#     loc_specific_data = specific_loc_yield.merge(data_combined, how='inner', left_on=['Cid', 'Sid'], right_on=['CID', 'SID'])
#     loc_specific_data = loc_specific_data.drop(['Cid', 'Sid'], axis=1)
#     if loc_specific_data.shape[0] >= 50:
#         entry_greater_than_50 += 1
#         train_data, test_data = train_test_split(loc_specific_data, random_state=100, test_size=0.15)
        
#         train_data.to_pickle('../wheat_all/train/' + str(loc) + '.pkl')
#         test_data.to_pickle('../wheat_all/test/' + str(loc) + '.pkl')
        
    
# print('Locations with more than 50 genotypes: ', entry_greater_than_50)

In [273]:
# loc_specific_data.head()

In [274]:
selected_global_feat = mi_global > 0.01

In [275]:
print(sum(selected_global_feat))

4869


In [276]:
selected_global_feat = data_combined.drop(['GID', 'CID', 'SID'], axis=1).columns[selected_global_feat]

In [277]:
len(selected_global_feat)

4869

In [278]:
geno_without_global = data_combined.drop(selected_global_feat, axis=1)

In [279]:
geno_without_global.shape

(2199, 35901)

In [280]:
locations = yield_by_env.drop(['GID', 'CID', 'SID'], axis=1).columns

In [281]:
mi_by_loc = []

In [282]:
for location in locations:
    phenos = yield_by_env[location]
    not_null_ind = phenos.notnull()
    phenos = phenos[not_null_ind]
    loc_spec_geno =  geno_without_global.loc[not_null_ind]
    
    if loc_spec_geno.shape[0] >= 30:
        
        mi_local = mutual_info_regression(loc_spec_geno.drop(['GID', 'CID', 'SID'], axis=1), phenos)
        mi_by_loc.append(mi_local)
   
    print(sum(mi_local > 0.05))
    

2466
2721
5373
3948
3986
3152
2514
2818
3164
2585
3469
2831
3493
2303
2599
2870
2880
3043
1958
4084
2925
3046
3934
3472
2371
3499
3548
2777
2322
3004
3407
3944
2748
2639
4811
3066
2998
3562
3868
3491
2359
3367
2588
2673
2404
3032
2882
2079
3336
2170
3527
3276
2759
4322
3008
3451
3054
2538
2949
3308
2336
2911
3147
1989
2563
4311
3099
2881
3212
2320
2405
3483
2381
3301
2789
3027
2103
3101
3105
3198
3290
1914
4384
2252
3214
3732
2335
4355
2640
2357
3334
3031
3564
1756
2088
2863
3627
3005
2403
2327
2086
2363
2371
2761
2736
3549
3545
3919
2401
2637
2548
3265
3550
2164
2898
3330
3153
3171
2507
3737
2629
3553
2388
4418
2971
2974
4288
2808
2077
2464
3363
3530
2852
2579
2663
1953
2857
2318
3329
2420
2289
2969
2976
3241
3498
3278
1883
3043
2739
2680
3233
3742
2225
2465
4392
2839
3774
2594
1457
2412
3410
3950
2833
2631
3945
2836
3275
4041
2487
2455
2063
4195
2794
5294
2453
4500
3281
2710
3651
2723
4376
3039
2413
1475
4114
2850
2284
2692
3334
2405
2744
3351
3090
2894
2185
1963
2984
2618
2922
2918


In [283]:
all_local_markers = []
markers = geno_without_global.drop(['GID', 'CID', 'SID'], axis=1).columns.tolist()
markers = np.array(markers, dtype= str)
for local_mi in mi_by_loc:
#     markers = geno_without_global.drop(['GID', 'CID', 'SID'], axis=1).columns.tolist()
#     markers = np.array(markers, )
#     print(local_mi > 0.05)
    local_mi = np.array(local_mi)
    sorted_ind = np.argsort(local_mi)[::-1]
    selected_markers = markers[sorted_ind[0:200]]
    
    all_local_markers += selected_markers.tolist()
#     print(len(all_local_markers))
all_local_markers = set(all_local_markers)
# print(all_local_markers)

In [284]:
unique_local_markers = list(set(all_local_markers))
print(len(unique_local_markers))

24278
