In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy
import xlrd
import plotly.graph_objs as go
import functools

In [2]:
df = pd.read_excel("2003-2016 Seed Potato Cert data v20191204_NO FL lines_Rioux 5AUG2020.xlsx", sheet_name="2003-2016 Seed Potato Cert")

In [3]:
df.shape

(7502, 147)

In [4]:
df.head()

Unnamed: 0,SummerID,CY,CERT_N,LNAME,SNAME,GCODE,VARIETY,VAR,V2,V3AR,...,winter_TBR,winter_TOTV,winter_CLASS,winter_GEN,winter_PAYING,winter_SF_PROG,winter_FY,winter_DIP,winter_AC_REJ,winter_CY
0,227,2007,107150,"Bula Potato Farms, Inc.",Bula Po,114,1975-11-15 00:00:00,1975-11-15 00:00:00,1975-11-15 00:00:00,1975-11-15 00:00:00,...,,0.0,,,,75-11-15 Summit Labs 2007,1.0,N,,2007.0
1,272,2003,103207,"Bula Potato Farms, Inc.",Bula Po,114,A-88338-1,A-88338-1,A-88338-1,A-88338-1,...,,0.0,,,,A-88338-1 Manhattan 2003,4.0,,0.0,2003.0
2,282,2003,103217,"Bula Potato Farms, Inc.",Bula Po,114,A-90586-11,A-90586-11,A-90586-11,A-90586-11,...,,3.75,Certified,,,A-90586-11 Manhattan 2003,4.0,,0.0,2003.0
3,3,2009,109003,"Guenthner Farms, Inc.",Guen Farm,149,A-91814-5,A-91814-5,A-91814-5,A-91814-5,...,,0.0,Foundation,FY-2,,A-91814-5 State Farm 2009,,N,,2009.0
4,10,2009,109003,"Guenthner Farms, Inc.",Guen Farm,149,A-91814-5,A-91814-5,A-91814-5,A-91814-5,...,,0.0,Foundation,FY-2,,A-91814-5 State Farm 2009,,N,,2009.0


### Data Validation

#### Inspection day

Hypothesis: **DASP1** should be smaller than **DASP2**

In [5]:
df.loc[df["DAPS1"] > df["DAPS2"], df.columns.str.contains("DA")]

Unnamed: 0,DATE_1ST,DAPS1,DATE_2ND,DAPS2
6182,6/30/2005,38,6/16/2005,24


#### Source Year

Hypothesis: **CY** should be larger than or equal to **S_YR** <br>

-CY: year entered certification program <br>
-S_YR: source year 

In [34]:
df[(df["CY"] <df["S_YR"]) | (df["CY"] <df["winter_S_YR"])]

Unnamed: 0,SummerID,CY,CERT_N,LNAME,SNAME,GCODE,VARIETY,VAR,V2,V3AR,...,winter_TBR,winter_TOTV,winter_CLASS,winter_GEN,winter_PAYING,winter_SF_PROG,winter_FY,winter_DIP,winter_AC_REJ,winter_CY


#### Check one-to-one relationship between source grower and source grower code

Hypothesis: One source grower should only have one source grower code

In [7]:
# One source code correspond 5 different source grower

grw_count = df[["S_GRW","S_GCODE"]].groupby("S_GCODE").apply(lambda x: x["S_GRW"].nunique())
grw_count[grw_count > 2]

S_GCODE
243    5
dtype: int64

In [8]:
# It seems like these 5 source grower are actually one source grower in different format
df[df["S_GCODE"] == 243]["S_GRW"].unique()

array(['Felix Zeloski Farms-Eagle River', 'Eagle River Seed Farm LLC',
       ' Zeloski Farms-Eagle River', 'Zeloski Farms-Eagle River',
       'Zeloski, Felix Farms-Eagle River'], dtype=object)

In [9]:
# Standardize name: 'Felix Zeloski Farms-Eagle River'
prob_name = df[df["S_GCODE"] == 243]["S_GRW"].unique().tolist()
prob_name
df["S_GRW"] = df["S_GRW"].apply(lambda x: 'Felix Zeloski Farms-Eagle River' if x in prob_name else x)

# Double check after fixing the problem
grw_count = df[["S_GRW","S_GCODE"]].groupby("S_GCODE").apply(lambda x: x["S_GRW"].nunique())
grw_count[grw_count > 2]

Series([], dtype: int64)

In [10]:
# Three source growers have 2 source grower code

gcode_count = df[["S_GRW","S_GCODE"]].groupby("S_GRW").apply(lambda x: x["S_GCODE"].nunique())
gcode_count[gcode_count > 1]



S_GRW
CSS                         2
Jorde Certified Seed LLC    2
Thompson Seed Potatoes      2
dtype: int64

In [11]:
# Look into these three problematic grower
prob_grower = gcode_count[gcode_count > 1].index.tolist()
prob_grower

for grower in prob_grower:
    print(str(grower) +":" + str(df[df["S_GRW"] == grower]["S_GCODE"].unique()))
    print()
# prob_grw = gcode_count[gcode_count > 1].index

CSS:['co26' 'ne06']

Jorde Certified Seed LLC:['nd14' nan 'nd44']

Thompson Seed Potatoes:['ne07' 'ne20']



#### Possible redundant info for summer and winter

Summer info and winter info should match for source info

In [12]:
summer_columns = ["CERT_N",
"SNAME",
"GCODE",
"VARIETY",
"S_GRW",
"S_G",
"S_YR",
"S_GCODE",
"S_STATE"]

winter_columns = ["winter_{}".format(x) for x in summer_columns]
winter_columns

combined_columns =[]
for i in range(len(summer_columns)):
    combined_columns.append(summer_columns[i])
    combined_columns.append(winter_columns[i])
    
combined_columns

['CERT_N',
 'winter_CERT_N',
 'SNAME',
 'winter_SNAME',
 'GCODE',
 'winter_GCODE',
 'VARIETY',
 'winter_VARIETY',
 'S_GRW',
 'winter_S_GRW',
 'S_G',
 'winter_S_G',
 'S_YR',
 'winter_S_YR',
 'S_GCODE',
 'winter_S_GCODE',
 'S_STATE',
 'winter_S_STATE']

In [29]:
df[combined_columns]

Unnamed: 0,CERT_N,winter_CERT_N,SNAME,winter_SNAME,GCODE,winter_GCODE,VARIETY,winter_VARIETY,S_GRW,winter_S_GRW,S_G,winter_S_G,S_YR,winter_S_YR,S_GCODE,winter_S_GCODE,S_STATE,winter_S_STATE
0,107150,107150.0,Bula Po,Bula Po,114,114.0,1975-11-15 00:00:00,1975-11-15 00:00:00,"Summit Plant Labs, Inc.","Summit Plant Labs, Inc.",Summit Labs,Summit Labs,2007,2007.0,co02,co02,CO,CO
1,103207,103207.0,Bula Po,Bula Po,114,114.0,A-88338-1,A-88338-1,Kamps Seed Farm,Kamps Seed Farm,Manhattan,Manhattan,2003,2003.0,mt14,mt14,MT,MT
2,103217,103217.0,Bula Po,Bula Po,114,114.0,A-90586-11,A-90586-11,Kamps Seed Farm,Kamps Seed Farm,Manhattan,Manhattan,2003,2003.0,mt14,mt14,MT,MT
3,109003,109003.0,Guen Farm,Guen Farm,149,149.0,A-91814-5,A-91814-5,State Farm,State Farm,State Farm,State Farm,2009,2009.0,0,0,WI,WI
4,109003,109003.0,Guen Farm,Guen Farm,149,149.0,A-91814-5,A-91814-5,State Farm,State Farm,State Farm,State Farm,2009,2009.0,0,0,WI,WI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7497,110462,110462.0,Northsand,Northsand,196,196.0,Zebra,Zebra,"CETS, LLC Astrotuber","CETS, LLC Astrotuber",CETS,CETS,2008,2008.0,wi,wi,WI,WI
7498,108292,108292.0,Bula Po,Bula Po,114,114.0,Zebra,Zebra,"CETS, LLC Astrotuber","CETS, LLC Astrotuber",CETS,CETS,2008,2008.0,wi,wi,WI,WI
7499,108369,108369.0,Northsand,Northsand,196,196.0,Zebra,Zebra,"CETS, LLC Astrotuber","CETS, LLC Astrotuber",CETS,CETS,2008,2008.0,wi,wi,WI,WI
7500,108291,108291.0,Bula Po,Bula Po,114,114.0,Zeus,Zeus,"CETS, LLC Astrotuber","CETS, LLC Astrotuber",CETS,CETS,2008,2008.0,wi,wi,WI,WI


##### Check the reason for mismatch

Some of mismatch result from missing value

Solution: fill the missing value by its equivalent

In [14]:
conditions = [(df.loc[:, combined_columns[i]] != df.loc[:,combined_columns[i+1]]) for i in range(0,len(combined_columns),2)]
conditions

df.loc[conditions[0] | conditions[1] | conditions[2] | conditions[3] | conditions[4] | conditions[5] | conditions[6] | conditions[7] | conditions[8],combined_columns
      ] 

Unnamed: 0,CERT_N,winter_CERT_N,SNAME,winter_SNAME,GCODE,winter_GCODE,VARIETY,winter_VARIETY,S_GRW,winter_S_GRW,S_G,winter_S_G,S_YR,winter_S_YR,S_GCODE,winter_S_GCODE,S_STATE,winter_S_STATE
10,110329,,Bula Po,,114,,A008-ITE,,UI/Teutonia,,UI/Teutonia,,2010,,id01,,ID,
11,115135,115135.0,Bula Po,Bula Po,114,114.0,A02507-2LB - Payette Ru,A02507-2LB,UI/Teutonia,UI/Teutonia,UI/Teutonia,UI/Teutonia,2015,2015.0,id01,id01,ID,ID
13,112348,112348.0,Bula Po,Bula Po,114,114.0,A95109-1RUS,A95409-1RUS,UI/Teutonia,UI/Teutonia,UI/Teutonia,UI/Teutonia,2010,2010.0,id01,id01,ID,ID
16,110333,,Bula Po,,114,,A95409-1RUS,,UI/Teutonia,,UI/Teutonia,,2010,,id01,,ID,
35,116158,116158.0,Mattek,Mattek,190,190.0,Accumulator,Accumulator,Felix Zeloski Farms-Eagle River,Eagle River Seed Farm LLC,Eagle River Se,Eagle River Se,2016,2016.0,243,243,WI,WI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7453,110223,,Vermont,,340,,Yukon Gold,,State Farm,,State Farm,,2009,,0,,WI,
7472,108351,108351.0,Fleischman,Fleischman,133,133.0,Yukon Gold,Yukon Gold,State Farm,State Farm,State Farm,State Farm,1072,2006.0,0,0,WI,WI
7473,108351,108351.0,Fleischman,Fleischman,133,133.0,Yukon Gold,Yukon Gold,State Farm,State Farm,State Farm,State Farm,1072,2006.0,0,0,WI,WI
7495,110452,,Northsand,,196,,Zebra,,"CETS, LLC Astrotuber",,CETS,,2010,,wi,,WI,


Fill na value in either summer or winter columns by the other one

In [40]:
for i in range(0, len(combined_columns),2):
    df[combined_columns[i]] = df[combined_columns[i]].fillna(df[combined_columns[i+1]])
    df[combined_columns[i]] = df[combined_columns[i]].mask(df[combined_columns[i]]==0).fillna(df[combined_columns[i+1]])

for i in range(1, len(combined_columns),2):
    df[combined_columns[i]] = df[combined_columns[i]].fillna(df[combined_columns[i-1]])
    df[combined_columns[i]] =df[combined_columns[i]].mask(df[combined_columns[i]]==0).fillna(df[combined_columns[i-1]])


##### Mismatch Analysis

- Reason1: Typo 
- Reason2: full name and shorthand notation in variety
- Reason3: Extra comma (source grower)
 

In [41]:
new_conditions = [(df.loc[:, combined_columns[i]] != df.loc[:,combined_columns[i+1]]) for i in range(0,len(combined_columns),2)]
new_conditions

df.loc[new_conditions[0] | new_conditions[1] | new_conditions[2] | new_conditions[3] | new_conditions[4] | new_conditions[5] | new_conditions[6] | new_conditions[7] | new_conditions[8],combined_columns
      ] 

Unnamed: 0,CERT_N,winter_CERT_N,SNAME,winter_SNAME,GCODE,winter_GCODE,VARIETY,winter_VARIETY,S_GRW,winter_S_GRW,S_G,winter_S_G,S_YR,winter_S_YR,S_GCODE,winter_S_GCODE,S_STATE,winter_S_STATE
11,115135,115135.0,Bula Po,Bula Po,114,114.0,A02507-2LB - Payette Ru,A02507-2LB,UI/Teutonia,UI/Teutonia,UI/Teutonia,UI/Teutonia,2015,2015.0,id01,id01,ID,ID
13,112348,112348.0,Bula Po,Bula Po,114,114.0,A95109-1RUS,A95409-1RUS,UI/Teutonia,UI/Teutonia,UI/Teutonia,UI/Teutonia,2010,2010.0,id01,id01,ID,ID
35,116158,116158.0,Mattek,Mattek,190,190.0,Accumulator,Accumulator,Felix Zeloski Farms-Eagle River,Eagle River Seed Farm LLC,Eagle River Se,Eagle River Se,2016,2016.0,243,243,WI,WI
42,115254,115254.0,Bula Po,Bula Po,114,114.0,Actrice,Actrice,Phytocultures Ltd.,Phytocultures Ltd.,Phytocu,Phytocu,2015,2015.0,pei05,pei05,PEI,PE
127,104379,104379.0,Gall Dave,Gall Dave,145,145.0,All Blue,All Blue,Uihlein Farm,Uihlein Farm,Uihlein Fm,Uihlein Fm,2000,2002.0,ny01,ny01,NY,NY
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7263,107152,107152.0,Bula Po,Bula Po,114,114.0,White Pearl,White Pearl,"CETS, LLC Astrotuber","CETSTECH, LLC",CETS,Cets,2007,2007.0,wi,wi,WI,WI
7306,103418,103418.0,Vermont,D. Perkins,340,340.0,Yukon Gold,Yukon Gold,State Farm,State Farm,State Farm,State Farm,2003,2003.0,0,0,WI,WI
7394,113057,113057.0,Bula Po,Bula Po,114,114.0,Yukon Gold,Yukon Gold,James Gallenberg,James Gallenberg,J Gallenberg,Gallenberg J,2013,2013.0,142,142,WI,WI
7472,108351,108351.0,Fleischman,Fleischman,133,133.0,Yukon Gold,Yukon Gold,State Farm,State Farm,State Farm,State Farm,1072,2006.0,0,0,WI,WI


In [42]:
# Number of mismatch after filling missing value

for i in range(len(new_conditions)):
    print(combined_columns[i*2] + ":" + str(len(df.loc[new_conditions[i], combined_columns])))

CERT_N:0
SNAME:125
GCODE:1
VARIETY:109
S_GRW:91
S_G:59
S_YR:62
S_GCODE:23
S_STATE:11


In [43]:
df.loc[new_conditions[2],combined_columns] 

Unnamed: 0,CERT_N,winter_CERT_N,SNAME,winter_SNAME,GCODE,winter_GCODE,VARIETY,winter_VARIETY,S_GRW,winter_S_GRW,S_G,winter_S_G,S_YR,winter_S_YR,S_GCODE,winter_S_GCODE,S_STATE,winter_S_STATE
2716,114558,114558.0,Rineridge,Rineridge,ny01,259.0,Lamoka,Lamoka,Cornell Uihlein Farm,Cornell Uihlein Farm,Uihlein Fm,Uihlein Fm,2014,2014.0,ny01,ny01,NY,NY


In [18]:
def conjunction(*conditions):
    return functools.reduce(np.logical_or, conditions)

condition
np.lo

source_info[conjunction(condition[0], condition[1])]

NameError: name 'condition' is not defined

In [None]:
df.loc[:, (df.columns.str.contains("SR2")) | (df.columns.str.contains("SR1")) ]


In [None]:
frequent_state = df["S_STATE"].value_counts()[:8].index.to_list()
frequent_state

In [None]:
df.columns[df.columns.str.contains("SR1")]

In [None]:
target_virus = ['SR1_ST','SR1_MIX',"SR1_LR"]

In [None]:
fig, ax = plt.subplots(figsize = (10,6))
df[df["S_STATE"].isin(frequent_state)].groupby("S_STATE").mean()[target_virus].plot(kind = "barh", ax = ax)
ax.tick_params(axis='x', rotation=0)
ax.set_title("Virus across state")
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.tight_layout()

plt.savefig("Virus_across_state")

In [None]:
frequent_variety = df["VARIETY"].value_counts()[:15].index.tolist()
frequent_variety

In [None]:
fig, ax = plt.subplots(figsize = (10,6))
df[df["VARIETY"].isin(frequent_variety)].groupby("VARIETY").mean()[target_virus].plot(kind = "barh", ax = ax)
ax.tick_params(axis='x', rotation=0)
ax.tick_params(axis="y", rotation = 0)
ax.set_title("Virus across potato variety")
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.tight_layout()
plt.savefig("Virus_across_potato_variety")