In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import xlrd
# import plotly.graph_objs as go
import functools
import re
import plotly.graph_objects as go
pd.set_option('display.max_rows', 150)

In [2]:
df = pd.read_excel("2003-2016 Seed Potato Cert data v20191204_NO FL lines_Rioux 5AUG2020.xlsx", sheet_name="2003-2016 Seed Potato Cert")

In [3]:
df.shape

(7502, 147)

In [4]:
df.head()

Unnamed: 0,SummerID,CY,CERT_N,LNAME,SNAME,GCODE,VARIETY,VAR,V2,V3AR,...,winter_TBR,winter_TOTV,winter_CLASS,winter_GEN,winter_PAYING,winter_SF_PROG,winter_FY,winter_DIP,winter_AC_REJ,winter_CY
0,227,2007,107150,"Bula Potato Farms, Inc.",Bula Po,114,1975-11-15 00:00:00,1975-11-15 00:00:00,1975-11-15 00:00:00,1975-11-15 00:00:00,...,,0.0,,,,75-11-15 Summit Labs 2007,1.0,N,,2007.0
1,272,2003,103207,"Bula Potato Farms, Inc.",Bula Po,114,A-88338-1,A-88338-1,A-88338-1,A-88338-1,...,,0.0,,,,A-88338-1 Manhattan 2003,4.0,,0.0,2003.0
2,282,2003,103217,"Bula Potato Farms, Inc.",Bula Po,114,A-90586-11,A-90586-11,A-90586-11,A-90586-11,...,,3.75,Certified,,,A-90586-11 Manhattan 2003,4.0,,0.0,2003.0
3,3,2009,109003,"Guenthner Farms, Inc.",Guen Farm,149,A-91814-5,A-91814-5,A-91814-5,A-91814-5,...,,0.0,Foundation,FY-2,,A-91814-5 State Farm 2009,,N,,2009.0
4,10,2009,109003,"Guenthner Farms, Inc.",Guen Farm,149,A-91814-5,A-91814-5,A-91814-5,A-91814-5,...,,0.0,Foundation,FY-2,,A-91814-5 State Farm 2009,,N,,2009.0


### Data Validation

#### Inspection day

Hypothesis: **DASP1** should be smaller than **DASP2**

In [5]:
# df[["DONE_PLTG","DATE_1ST", "DATE_2ND"]] = df[["DONE_PLTG","DATE_1ST", "DATE_2ND"]].astype('datetime64[ns]')
# df[["DONE_PLTG","DATE_1ST", "DATE_2ND"]]

In [6]:
df.loc[df["DAPS1"] > df["DAPS2"], df.columns.str.contains("DA")]

Unnamed: 0,DATE_1ST,DAPS1,DATE_2ND,DAPS2
6182,6/30/2005,38,6/16/2005,24


In [7]:
# df["DATE_1ST"] - df["DONE_PLTG"]

#### Source Year

Hypothesis: **CY** should be larger than or equal to **S_YR** <br>

-CY: year entered certification program <br>
-S_YR: source year 

In [8]:
df[(df["CY"] <df["S_YR"]) | (df["CY"] <df["winter_S_YR"])]

Unnamed: 0,SummerID,CY,CERT_N,LNAME,SNAME,GCODE,VARIETY,VAR,V2,V3AR,...,winter_TBR,winter_TOTV,winter_CLASS,winter_GEN,winter_PAYING,winter_SF_PROG,winter_FY,winter_DIP,winter_AC_REJ,winter_CY


#### Check one-to-one relationship between source grower and source grower code

Hypothesis: One source grower should only have one source grower code

In [9]:
# One source code correspond 5 different source grower

grw_count = df[["S_GRW","S_GCODE"]].groupby("S_GCODE").apply(lambda x: x["S_GRW"].nunique())
grw_count[grw_count > 2]

S_GCODE
243    5
dtype: int64

In [10]:
winter_grw_count = df[["winter_S_GRW","winter_S_GCODE"]].groupby("winter_S_GCODE").apply(lambda x: x["winter_S_GRW"].nunique())
winter_grw_count[winter_grw_count > 2]

winter_S_GCODE
243    5
dtype: int64

In [11]:
grw_count = df[["S_GRW","S_GCODE"]].groupby("S_GCODE").apply(lambda x: x["S_GRW"].nunique())
grw_count[grw_count > 2]

S_GCODE
243    5
dtype: int64

In [12]:
# It seems like these 5 source grower are actually one source grower in different format
df[df["S_GCODE"] == 243]["S_GRW"].value_counts()

Felix Zeloski Farms-Eagle River     200
Eagle River Seed Farm LLC            11
Zeloski Farms-Eagle River             6
Zeloski, Felix Farms-Eagle River      2
 Zeloski Farms-Eagle River            1
Name: S_GRW, dtype: int64

In [13]:
df[df["winter_S_GCODE"] == 243]["winter_S_GRW"].value_counts()

Felix Zeloski Farms-Eagle River     196
Eagle River Seed Farm LLC            11
Zeloski Farms-Eagle River             6
Zeloski, Felix Farms-Eagle River      2
 Zeloski Farms-Eagle River            1
Name: winter_S_GRW, dtype: int64

In [14]:
# Standardize name with the most common one: 'Felix Zeloski Farms-Eagle River'
prob_name = df[df["S_GCODE"] == 243]["S_GRW"].unique().tolist()
prob_name
df["S_GRW"] = df["S_GRW"].apply(lambda x: 'Felix Zeloski Farms-Eagle River' if x in prob_name else x)
df["winter_S_GRW"] = df["winter_S_GRW"].apply(lambda x: 'Felix Zeloski Farms-Eagle River' if x in prob_name else x)

# Double check after fixing the problem
grw_count = df[["S_GRW","S_GCODE"]].groupby("S_GCODE").apply(lambda x: x["S_GRW"].nunique())
grw_count[grw_count > 2]

Series([], dtype: int64)

In [15]:
winter_grw_count = df[["winter_S_GRW","winter_S_GCODE"]].groupby("winter_S_GCODE").apply(lambda x: x["winter_S_GRW"].nunique())
winter_grw_count[winter_grw_count > 2]

Series([], dtype: int64)

In [16]:
df["S_GRW"].value_counts()

State Farm                             5172
UW Breeding Sta.                        511
Felix Zeloski Farms-Eagle River         220
Uihlein Farm                            159
Valley TCulture                         133
Summit Plant Labs, Inc.                 119
Sklarczyk Seed Farm, LLC                103
CETS, LLC Astrotuber                    102
CSS                                      97
Schroeder Bros., Farms, Inc.             90
Droge                                    80
Cornell Uihlein Farm                     66
John  Miller Farms. Inc.                 39
Wild Seed Farms, Inc                     34
Seidl Farms, Inc.                        32
J.W. Mattek & Sons, Inc.                 30
Sowinski Farms, Inc.-Cert. Seed          28
Kent Farms                               27
Schroeder Farms, Ltd                     26
Ted Baginski and Sons, Inc.              25
Rine Ridge Farms, Inc.                   25
Phytocultures Ltd.                       22
Jorde, Jim                      

In [17]:
# Three source growers have 2 source grower code

gcode_count = df[["S_GRW","S_GCODE"]].groupby("S_GRW").apply(lambda x: x["S_GCODE"].nunique())
gcode_count[gcode_count > 1]



S_GRW
CSS                         2
Jorde Certified Seed LLC    2
Thompson Seed Potatoes      2
dtype: int64

In [18]:
# Look into these three problematic grower
prob_grower = gcode_count[gcode_count > 1].index.tolist()
prob_grower

for grower in prob_grower:
    print(str(grower) +":" + str(df[df["S_GRW"] == grower]["S_GCODE"].unique()))
    print()
# prob_grw = gcode_count[gcode_count > 1].index

CSS:['co26' 'ne06']

Jorde Certified Seed LLC:['nd14' nan 'nd44']

Thompson Seed Potatoes:['ne07' 'ne20']



#### Possible redundant info for summer and winter

Summer info and winter info should match for source info

In [19]:
summer_columns = ["CERT_N",
"SNAME",
"GCODE",
"VARIETY",
"S_GRW",
"S_G",
"S_YR",
"S_GCODE",
"S_STATE"]

winter_columns = ["winter_{}".format(x) for x in summer_columns]
winter_columns

combined_columns =[]
for i in range(len(summer_columns)):
    combined_columns.append(summer_columns[i])
    combined_columns.append(winter_columns[i])
    
combined_columns

['CERT_N',
 'winter_CERT_N',
 'SNAME',
 'winter_SNAME',
 'GCODE',
 'winter_GCODE',
 'VARIETY',
 'winter_VARIETY',
 'S_GRW',
 'winter_S_GRW',
 'S_G',
 'winter_S_G',
 'S_YR',
 'winter_S_YR',
 'S_GCODE',
 'winter_S_GCODE',
 'S_STATE',
 'winter_S_STATE']

In [20]:
df[combined_columns]

Unnamed: 0,CERT_N,winter_CERT_N,SNAME,winter_SNAME,GCODE,winter_GCODE,VARIETY,winter_VARIETY,S_GRW,winter_S_GRW,S_G,winter_S_G,S_YR,winter_S_YR,S_GCODE,winter_S_GCODE,S_STATE,winter_S_STATE
0,107150,107150.0,Bula Po,Bula Po,114,114.0,1975-11-15 00:00:00,1975-11-15 00:00:00,"Summit Plant Labs, Inc.","Summit Plant Labs, Inc.",Summit Labs,Summit Labs,2007,2007.0,co02,co02,CO,CO
1,103207,103207.0,Bula Po,Bula Po,114,114.0,A-88338-1,A-88338-1,Kamps Seed Farm,Kamps Seed Farm,Manhattan,Manhattan,2003,2003.0,mt14,mt14,MT,MT
2,103217,103217.0,Bula Po,Bula Po,114,114.0,A-90586-11,A-90586-11,Kamps Seed Farm,Kamps Seed Farm,Manhattan,Manhattan,2003,2003.0,mt14,mt14,MT,MT
3,109003,109003.0,Guen Farm,Guen Farm,149,149.0,A-91814-5,A-91814-5,State Farm,State Farm,State Farm,State Farm,2009,2009.0,0,0,WI,WI
4,109003,109003.0,Guen Farm,Guen Farm,149,149.0,A-91814-5,A-91814-5,State Farm,State Farm,State Farm,State Farm,2009,2009.0,0,0,WI,WI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7497,110462,110462.0,Northsand,Northsand,196,196.0,Zebra,Zebra,"CETS, LLC Astrotuber","CETS, LLC Astrotuber",CETS,CETS,2008,2008.0,wi,wi,WI,WI
7498,108292,108292.0,Bula Po,Bula Po,114,114.0,Zebra,Zebra,"CETS, LLC Astrotuber","CETS, LLC Astrotuber",CETS,CETS,2008,2008.0,wi,wi,WI,WI
7499,108369,108369.0,Northsand,Northsand,196,196.0,Zebra,Zebra,"CETS, LLC Astrotuber","CETS, LLC Astrotuber",CETS,CETS,2008,2008.0,wi,wi,WI,WI
7500,108291,108291.0,Bula Po,Bula Po,114,114.0,Zeus,Zeus,"CETS, LLC Astrotuber","CETS, LLC Astrotuber",CETS,CETS,2008,2008.0,wi,wi,WI,WI


##### Check the reason for mismatch

Some of mismatch result from missing value

Solution: fill the missing value by its equivalent

In [21]:
conditions = [(df.loc[:, combined_columns[i]] != df.loc[:,combined_columns[i+1]]) for i in range(0,len(combined_columns),2)]
conditions

df.loc[conditions[0] | conditions[1] | conditions[2] | conditions[3] | conditions[4] | conditions[5] | conditions[6] | conditions[7] | conditions[8],combined_columns
      ] 

Unnamed: 0,CERT_N,winter_CERT_N,SNAME,winter_SNAME,GCODE,winter_GCODE,VARIETY,winter_VARIETY,S_GRW,winter_S_GRW,S_G,winter_S_G,S_YR,winter_S_YR,S_GCODE,winter_S_GCODE,S_STATE,winter_S_STATE
10,110329,,Bula Po,,114,,A008-ITE,,UI/Teutonia,,UI/Teutonia,,2010,,id01,,ID,
11,115135,115135.0,Bula Po,Bula Po,114,114.0,A02507-2LB - Payette Ru,A02507-2LB,UI/Teutonia,UI/Teutonia,UI/Teutonia,UI/Teutonia,2015,2015.0,id01,id01,ID,ID
13,112348,112348.0,Bula Po,Bula Po,114,114.0,A95109-1RUS,A95409-1RUS,UI/Teutonia,UI/Teutonia,UI/Teutonia,UI/Teutonia,2010,2010.0,id01,id01,ID,ID
16,110333,,Bula Po,,114,,A95409-1RUS,,UI/Teutonia,,UI/Teutonia,,2010,,id01,,ID,
42,115254,115254.0,Bula Po,Bula Po,114,114.0,Actrice,Actrice,Phytocultures Ltd.,Phytocultures Ltd.,Phytocu,Phytocu,2015,2015.0,pei05,pei05,PEI,PE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7453,110223,,Vermont,,340,,Yukon Gold,,State Farm,,State Farm,,2009,,0,,WI,
7472,108351,108351.0,Fleischman,Fleischman,133,133.0,Yukon Gold,Yukon Gold,State Farm,State Farm,State Farm,State Farm,1072,2006.0,0,0,WI,WI
7473,108351,108351.0,Fleischman,Fleischman,133,133.0,Yukon Gold,Yukon Gold,State Farm,State Farm,State Farm,State Farm,1072,2006.0,0,0,WI,WI
7495,110452,,Northsand,,196,,Zebra,,"CETS, LLC Astrotuber",,CETS,,2010,,wi,,WI,


Fill na value in either summer or winter columns by the other one

In [22]:
for i in range(0, len(combined_columns),2):
    df[combined_columns[i]] = df[combined_columns[i]].fillna(df[combined_columns[i+1]])
    df[combined_columns[i]] = df[combined_columns[i]].mask(df[combined_columns[i]]==0).fillna(df[combined_columns[i+1]])

for i in range(1, len(combined_columns),2):
    df[combined_columns[i]] = df[combined_columns[i]].fillna(df[combined_columns[i-1]])
    df[combined_columns[i]] =df[combined_columns[i]].mask(df[combined_columns[i]]==0).fillna(df[combined_columns[i-1]])


##### Mismatch Analysis

- Reason1: Typo 
- Reason2: full name and shorthand notation in variety
- Reason3: Extra comma (source grower)
 

In [23]:
new_conditions = [(df.loc[:, combined_columns[i]] != df.loc[:,combined_columns[i+1]]) for i in range(0,len(combined_columns),2)]
new_conditions

df.loc[new_conditions[0] | new_conditions[1] | new_conditions[2] | new_conditions[3] | new_conditions[4] | new_conditions[5] | new_conditions[6] | new_conditions[7] | new_conditions[8],combined_columns
      ] 

Unnamed: 0,CERT_N,winter_CERT_N,SNAME,winter_SNAME,GCODE,winter_GCODE,VARIETY,winter_VARIETY,S_GRW,winter_S_GRW,S_G,winter_S_G,S_YR,winter_S_YR,S_GCODE,winter_S_GCODE,S_STATE,winter_S_STATE
11,115135,115135.0,Bula Po,Bula Po,114,114.0,A02507-2LB - Payette Ru,A02507-2LB,UI/Teutonia,UI/Teutonia,UI/Teutonia,UI/Teutonia,2015,2015.0,id01,id01,ID,ID
13,112348,112348.0,Bula Po,Bula Po,114,114.0,A95109-1RUS,A95409-1RUS,UI/Teutonia,UI/Teutonia,UI/Teutonia,UI/Teutonia,2010,2010.0,id01,id01,ID,ID
42,115254,115254.0,Bula Po,Bula Po,114,114.0,Actrice,Actrice,Phytocultures Ltd.,Phytocultures Ltd.,Phytocu,Phytocu,2015,2015.0,pei05,pei05,PEI,PE
127,104379,104379.0,Gall Dave,Gall Dave,145,145.0,All Blue,All Blue,Uihlein Farm,Uihlein Farm,Uihlein Fm,Uihlein Fm,2000,2002.0,ny01,ny01,NY,NY
206,103159,103159.0,Wild,Wild,235,235.0,Atlantic,Atlantic,"Sklarczyk Seed Farm, LLC",Sklarczyk,Sklarczyk,Sklarczyk,2003,2003.0,mi01,mi01,MI,MI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7306,103418,103418.0,Vermont,D. Perkins,340,340.0,Yukon Gold,Yukon Gold,State Farm,State Farm,State Farm,State Farm,2003,2003.0,0,0,WI,WI
7394,113057,113057.0,Bula Po,Bula Po,114,114.0,Yukon Gold,Yukon Gold,James Gallenberg,James Gallenberg,J Gallenberg,Gallenberg J,2013,2013.0,142,142,WI,WI
7453,110223,110223.0,Vermont,Vermont,340,340.0,Yukon Gold,Yukon Gold,State Farm,State Farm,State Farm,State Farm,2009,2009.0,,,WI,WI
7472,108351,108351.0,Fleischman,Fleischman,133,133.0,Yukon Gold,Yukon Gold,State Farm,State Farm,State Farm,State Farm,1072,2006.0,0,0,WI,WI


In [24]:
# Number of mismatch after filling missing value

for i in range(len(new_conditions)):
    print(combined_columns[i*2] + ":" + str(len(df.loc[new_conditions[i], combined_columns])))

CERT_N:0
SNAME:125
GCODE:1
VARIETY:109
S_GRW:71
S_G:59
S_YR:62
S_GCODE:32
S_STATE:11


In [25]:
df.loc[df["S_G"] != df["winter_S_G"], combined_columns]

Unnamed: 0,CERT_N,winter_CERT_N,SNAME,winter_SNAME,GCODE,winter_GCODE,VARIETY,winter_VARIETY,S_GRW,winter_S_GRW,S_G,winter_S_G,S_YR,winter_S_YR,S_GCODE,winter_S_GCODE,S_STATE,winter_S_STATE
339,107359,107359.0,Northsand,Northsand,196,196.0,Atlantic,Atlantic,"CETS, LLC Astrotuber","CETSTECH, LLC",CETS,Cets,2007,2007.0,wi,wi,WI,WI
460,111261,111261.0,Sowinski,Sowinski,269,269.0,Atlantic,Atlantic,Felix Zeloski Farms-Eagle River,State Farm,Zeloski -ER,State Farm,2010,2010.0,243,243,WI,WI
462,111261,111261.0,Sowinski,Sowinski,269,269.0,Atlantic,Atlantic,Felix Zeloski Farms-Eagle River,State Farm,Zeloski -ER,State Farm,2010,2010.0,243,243,WI,WI
584,113190,113190.0,Schroeder,Schroeder,216,216.0,Atlantic,Atlantic,"Schroeder Farms, Ltd",Schroder Farms Ltd.,Schroeder Farm,Schroder Farms,2013,2013.0,217,217,WI,WI
585,113190,113190.0,Schroeder,Schroeder,216,216.0,Atlantic,Atlantic,"Schroeder Farms, Ltd",Schroder Farms Ltd.,Schroeder Farm,Schroder Farms,2013,2013.0,217,217,WI,WI
586,113190,113190.0,Schroeder,Schroeder,216,216.0,Atlantic,Atlantic,"Schroeder Farms, Ltd",Schroder Farms Ltd.,Schroeder Farm,Schroder Farms,2013,2013.0,217,217,WI,WI
587,113190,113190.0,Schroeder,Schroeder,216,216.0,Atlantic,Atlantic,"Schroeder Farms, Ltd",Schroder Farms Ltd.,Schroeder Farm,Schroder Farms,2013,2013.0,217,217,WI,WI
588,113190,113190.0,Schroeder,Schroeder,216,216.0,Atlantic,Atlantic,"Schroeder Farms, Ltd",Schroder Farms Ltd.,Schroeder Farm,Schroder Farms,2013,2013.0,217,217,WI,WI
589,113190,113190.0,Schroeder,Schroeder,216,216.0,Atlantic,Atlantic,"Schroeder Farms, Ltd",Schroder Farms Ltd.,Schroeder Farm,Schroder Farms,2013,2013.0,217,217,WI,WI
590,113190,113190.0,Schroeder,Schroeder,216,216.0,Atlantic,Atlantic,"Schroeder Farms, Ltd",Schroder Farms Ltd.,Schroeder Farm,Schroder Farms,2013,2013.0,217,217,WI,WI


In [26]:
df["S_G"].value_counts()

State Farm        5172
UW Breeding        511
Uihlein Fm         225
Zeloski -ER        207
Val TCulture       133
Summit Labs        119
Sklarczyk          103
CETS               102
CSS                 97
Schroeder Bros      90
Droge               80
John Miller         39
Wild                34
Seidl               32
Mattek              30
Sowinski            28
Kent Farms          27
J Jorde             27
Schroeder Farm      26
Rine Ridge          25
Baginski            25
Phytocu             23
Schutter            16
Thompson            16
Gallenberg Fms      16
H Miller            15
Wirz                15
Kakes               14
J Gallenberg        13
Sunny Valley        13
UI/Teutonia         11
Hafner              11
Eagle River Se      11
Fleischman D         9
Summit Farms         8
Sunnydale            8
Jorde Mike           7
Skogman              7
Bula Potato          7
Childstock           6
Droge Farms          6
Johnson              6
Sunrain Variet       6
Tetonia    

Fix the problem of S_G:

- Upper case and lower case (CETS and Cets) Cets => CETS
- Plural and singular forms (Nilson Farms & Nilson Farm)
- Spelling error (Schroeder Farm, Schroder Farms) Schroder Farms => Schroeder Farm
- Order (J Gallenberg, Gallenberg J)

In [27]:
df["winter_S_G"] = df["winter_S_G"].apply(lambda x: "CETS" if x == "Cets" else x)

df["winter_S_G"] = df["winter_S_G"].apply(lambda x: "Schroeder Farm" if x == "Schroder Farms" else x)

df["winter_S_G"] = df["winter_S_G"].apply(lambda x: "J Gallenberg" if x == "Gallenberg J" else x)


In [28]:
df.loc[df["S_G"] != df["winter_S_G"], combined_columns]

Unnamed: 0,CERT_N,winter_CERT_N,SNAME,winter_SNAME,GCODE,winter_GCODE,VARIETY,winter_VARIETY,S_GRW,winter_S_GRW,S_G,winter_S_G,S_YR,winter_S_YR,S_GCODE,winter_S_GCODE,S_STATE,winter_S_STATE
460,111261,111261.0,Sowinski,Sowinski,269,269.0,Atlantic,Atlantic,Felix Zeloski Farms-Eagle River,State Farm,Zeloski -ER,State Farm,2010,2010.0,243,243,WI,WI
462,111261,111261.0,Sowinski,Sowinski,269,269.0,Atlantic,Atlantic,Felix Zeloski Farms-Eagle River,State Farm,Zeloski -ER,State Farm,2010,2010.0,243,243,WI,WI
996,106004,106004.0,Kakes,Kakes,162,162.0,Dakota Crisp,Dakota Crisp,Nilson Farms,Nilson Farms,Nilson Farms,Nilson Farm,2005,2005.0,nd39,nd35,ND,ND
1115,104119,104119.0,Mattek,Mattek,190,190.0,Dark Red Norland,Dark Red Norland,"Ted Baginski and Sons, Inc.","Ted Baginski and Sons, Inc.",Baginski,T Baginski,2004,2004.0,108,108,WI,WI
1116,104119,104119.0,Mattek,Mattek,190,190.0,Dark Red Norland,Dark Red Norland,"Ted Baginski and Sons, Inc.","Ted Baginski and Sons, Inc.",Baginski,T Baginski,2004,2004.0,108,108,WI,WI
1550,114228,114228.0,Gall Dave &,Gall Dave &,145,145.0,Dark Red Norland,Dark Red Norland,James Gallenberg,James Gallenberg,Gallenberg J,J Gallenberg,2013,2013.0,142,142,WI,WI
1701,106348,106348.0,Schroeder,Schroeder,216,216.0,Dark Red Norland,Dark Red Norland,Felix Zeloski Farms-Eagle River,State Farm,Zeloski -ER,State Farm,2004,2004.0,243,243,WI,WI
1702,106348,106348.0,Schroeder,Schroeder,216,216.0,Dark Red Norland,Dark Red Norland,Felix Zeloski Farms-Eagle River,State Farm,Zeloski -ER,State Farm,2004,2004.0,243,243,WI,WI
1932,103025,103025.0,Gall Farm,Gall Farm,144,144.0,German Butterball,German Butterball,David Gallenberg,David Gallenberg,Gallenberg D,D Gallenberg,2003,2003.0,145,145,WI,WI
2012,104374,104374.0,Hartman,Hartman,154,154.0,Goldrush,Goldrush,"Ted Baginski and Sons, Inc.","Ted Baginski and Sons, Inc.",Baginski,T Baginski,2004,2004.0,108,108,WI,WI


Fix the problem of S_STATE: 

- change PEI to PE

In [29]:
df["S_STATE"] = df["S_STATE"].apply(lambda x:"PE" if x == "PEI" else x)
df["S_STATE"]

0       CO
1       MT
2       MT
3       WI
4       WI
        ..
7497    WI
7498    WI
7499    WI
7500    WI
7501    WI
Name: S_STATE, Length: 7502, dtype: object

Fix the problem of S_YR: 

- change 1072 (Typo) to 2006

In [30]:
df["S_YR"] = df["S_YR"].apply(lambda x:2006 if x == 1072 else x)
df["S_YR"]

0       2007
1       2003
2       2003
3       2009
4       2009
        ... 
7497    2008
7498    2008
7499    2008
7500    2008
7501    2008
Name: S_YR, Length: 7502, dtype: int64

In [31]:
df.loc[df["S_YR"] != df["winter_S_YR"], combined_columns]

Unnamed: 0,CERT_N,winter_CERT_N,SNAME,winter_SNAME,GCODE,winter_GCODE,VARIETY,winter_VARIETY,S_GRW,winter_S_GRW,S_G,winter_S_G,S_YR,winter_S_YR,S_GCODE,winter_S_GCODE,S_STATE,winter_S_STATE
127,104379,104379.0,Gall Dave,Gall Dave,145,145.0,All Blue,All Blue,Uihlein Farm,Uihlein Farm,Uihlein Fm,Uihlein Fm,2000,2002.0,ny01,ny01,NY,NY
405,111009,111009.0,Hafner,State Farm,151,151.0,Atlantic,Dark Red Norland Z2,State Farm,State Farm,State Farm,State Farm,2010,2009.0,0,0,WI,WI
407,111002,111002.0,Hafner,State Farm,151,151.0,Atlantic,Nicolet,State Farm,State Farm,State Farm,State Farm,2011,2008.0,0,0,WI,WI
489,111041,111041.0,Wirz,State Farm,236,236.0,Atlantic,Snowden,State Farm,State Farm,State Farm,State Farm,2011,2010.0,0,0,WI,WI
490,111041,111041.0,Wirz,State Farm,236,236.0,Atlantic,Snowden,State Farm,State Farm,State Farm,State Farm,2011,2010.0,0,0,WI,WI
514,112798,112798.0,Northsand,Northsand,196,196.0,Atlantic,Atlantic,State Farm,State Farm,State Farm,State Farm,2011,2010.0,0,0,WI,WI
515,112798,112798.0,Northsand,Northsand,196,196.0,Atlantic,Atlantic,State Farm,State Farm,State Farm,State Farm,2011,2010.0,0,0,WI,WI
2206,111127,111127.0,Gall Farm,Gall Farm,144,144.0,Goldrush,Goldrush,State Farm,State Farm,State Farm,State Farm,2009,2010.0,0,0,WI,WI
2212,111012,111012.0,Hafner,State Farm,151,151.0,Goldrush,Langlade,State Farm,State Farm,State Farm,State Farm,2010,2009.0,0,0,WI,WI
2214,111004,111004.0,Hafner,State Farm,151,151.0,Goldrush,Red Pontiac,State Farm,State Farm,State Farm,State Farm,2011,2008.0,0,0,WI,WI


In [32]:
df.loc[df["winter_S_STATE"] == "PE", combined_columns]

Unnamed: 0,CERT_N,winter_CERT_N,SNAME,winter_SNAME,GCODE,winter_GCODE,VARIETY,winter_VARIETY,S_GRW,winter_S_GRW,S_G,winter_S_G,S_YR,winter_S_YR,S_GCODE,winter_S_GCODE,S_STATE,winter_S_STATE
42,115254,115254.0,Bula Po,Bula Po,114,114.0,Actrice,Actrice,Phytocultures Ltd.,Phytocultures Ltd.,Phytocu,Phytocu,2015,2015.0,pei05,pei05,PE,PE
92,105291,105291.0,Bula Po,Bula Po,114,114.0,Adora,Adora,PEI Produce,PEI Produce,PEI Produce,PEI Produce,2004,2004.0,pei2,pei2,PE,PE
93,105297,105297.0,Bula Po,Bula Po,114,114.0,Adora,Adora,Phytocultures Ltd.,Phytocultures Ltd.,Phytocu,Phytocu,2005,2005.0,pei05,pei05,PE,PE
94,104021,104021.0,Bula Po,Bula Po,114,114.0,Adora,Adora,PEI Produce,PEI Produce,PEI Produce,PEI Produce,2004,2004.0,pei2,pei2,PE,PE
1034,112195,112195.0,Schroeder,Schroeder,216,216.0,Dark Red Chieftain,Dark Red Chieftain,Martin Visser and Sons,Martin Visser and Sons,Martin,Martin,2012,2012.0,pei10,pei10,PE,PE
1792,105290,105290.0,Bula Po,Bula Po,114,114.0,Fabula,Fabula,Phytocultures Ltd.,Phytocultures Ltd.,Phytocu,Phytocu,2004,2004.0,pei05,pei05,PE,PE
1793,105298,105298.0,Bula Po,Bula Po,114,114.0,Fabula,Fabula,Phytocultures Ltd.,Phytocultures Ltd.,Phytocu,Phytocu,2005,2005.0,pei05,pei05,PE,PE
1794,107123,107123.0,Bula Po,Bula Po,114,114.0,Fabula,Fabula,Phytocultures Ltd.,Phytocultures Ltd.,Phytocu,Phytocu,2005,2005.0,pei05,pei05,PE,PE
1795,107123,107123.0,Bula Po,Bula Po,114,114.0,Fabula,Fabula,Phytocultures Ltd.,Phytocultures Ltd.,Phytocu,Phytocu,2005,2005.0,pei05,pei05,PE,PE
1796,109225,109225.0,Bula Po,Bula Po,114,114.0,Fabula,Fabula,Phytocultures Ltd.,Phytocultures Ltd.,Phytocu,Phytocu,2008,2008.0,pei05,pei05,PE,PE


### Data Visualization

#### Virus after each inspection

In [33]:
SR1 = df.columns[df.columns.str.contains("SR1")].tolist()
SR2 = df.columns[df.columns.str.contains("SR2")].tolist()

SR1, SR2

(['SR1_LR', 'SR1_MOS', 'SR1_ST', 'SR1_TOTV', 'SR1_MIX'],
 ['SR2_LR', 'SR2_MOS', 'SR2_ST', 'SR2_TOTV', 'SR2_BRR', 'SR2_MIX'])

In [34]:
df.loc[:, (df.columns.str.contains("SR2")) | (df.columns.str.contains("SR1")) ]

Unnamed: 0,SR1_LR,SR1_MOS,SR1_ST,SR1_TOTV,SR1_MIX,SR2_LR,SR2_MOS,SR2_ST,SR2_TOTV,SR2_BRR,SR2_MIX
0,0.0,0.00,0.0,0.00,0.00000,0.0,0.00,0.0,0.00,0.0,0.00000
1,0.0,0.00,0.0,0.00,0.20202,0.0,0.00,0.0,0.00,0.0,0.00000
2,0.0,0.44,0.0,0.44,0.00000,0.0,0.44,0.0,0.44,0.0,0.03788
3,0.0,0.00,0.0,0.00,0.00000,0.0,0.00,0.0,0.00,0.0,0.00000
4,0.0,0.00,0.0,0.00,0.00000,0.0,0.00,0.0,0.00,0.0,0.00000
...,...,...,...,...,...,...,...,...,...,...,...
7497,0.0,0.00,0.0,0.00,0.00000,0.0,0.00,0.0,0.00,0.0,0.00000
7498,0.0,0.00,0.0,0.00,0.00000,0.0,0.00,0.0,0.00,0.0,0.00000
7499,0.0,0.00,0.0,0.00,0.00000,0.0,0.00,0.0,0.00,0.0,0.00000
7500,0.0,0.00,0.0,0.00,0.00000,0.0,0.00,0.0,0.00,0.0,0.00000


#### Relationship between source and virus

Group by categories:

- S_GRW
- S_YR
- S_STATE
- Variety


Virus Type
- LR (Leaf Roll Virus)
- MOS (Mosaic Virus)
- ST (Spindle Tuber viroid)
- MIX (Foreign Variety)

Disease type

In [35]:
df.columns[df.columns.str.contains("ST")]

Index(['S_STATE', 'START_PLTG', 'DATE_1ST', 'NO_LR_1ST', 'NO_MOS_1ST',
       'NO_ST_1ST', 'SR1_ST', 'NO_MIX_1ST', 'AC_MIX_1ST', 'SR2_ST', 'SRF_ST',
       'ASTRYELOS', 'STAND', 'PSTV_F', 'LAST_MOD', 'LAST_TIM', 'STOP', 'STATE',
       'winter_S_STATE', 'winter_PSTV'],
      dtype='object')

In [36]:
# LR = ['SR1_LR', 'SR2_LR', 'winter_LR']
# MOS = ['SR1_MOS', 'SR2_MOS', 'winter_MOS']

In [37]:
import re
word = 'SR_LR'
regexp = re.compile(r'[SR1|SR2|winter]_LR$')
if regexp.search(word):
      print (word)

SR_LR


In [38]:
def find_virus_columns(virus):
    return [x for x in df.columns.tolist() if re.compile(r'[SR1|SR2|winter]_P*{virus}V*$'.format(virus = virus)).search(x)]
    
find_virus_columns("LR")

['SR1_LR', 'SR2_LR', 'winter_LR']

In [39]:
find_virus_columns("MOS")

['SR1_MOS', 'SR2_MOS', 'winter_MOS']

In [40]:
find_virus_columns("ST")

['SR1_ST', 'SR2_ST', 'winter_PSTV']

In [41]:
find_virus_columns("MIX")

['SR1_MIX', 'SR2_MIX', 'winter_MIX']

In [42]:
a = list(np.sort(df["S_YR"].unique()))
a.append("all")
a

NameError: name 'np' is not defined

In [None]:
virus_columns = find_virus_columns("LR")
frequent_state = df["S_STATE"].value_counts()[:8].index.to_list()
temp = df[df["S_STATE"].isin(frequent_state)].groupby("S_STATE").mean()[virus_columns]

In [None]:
    
def plot_virus_by_state(state_number, virus, year = "all"):
    virus_columns = find_virus_columns(virus)
    if year == "all":
        temp = df  
    else:  
        temp = df[df["S_YR"] == year]
    frequent_state = temp["S_STATE"].value_counts()[:state_number].index.to_list()
    temp = temp[temp["S_STATE"].isin(frequent_state)].groupby("S_STATE").mean()[virus_columns]
    fig = go.Figure()
    fig.add_trace(go.Bar(x=temp.index,
                    y = temp.iloc[:,0],
                    text=np.round(temp.iloc[:,0],3),
                    textposition='outside',
                    name=temp.columns[0],
                    marker_color='rgb(55, 83, 109)'
                    ))
    fig.add_trace(go.Bar(x=temp.index,
                    y = temp.iloc[:,1],
                    text=np.round(temp.iloc[:,1],3),
                    textposition='outside',
                    name= temp.columns[1],
                    marker_color='rgb(26, 118, 255)'
                    ))

    fig.add_trace(go.Bar(x=temp.index,
                    y = temp.iloc[:,2],
                    text=np.round(temp.iloc[:,2],3),
                    textposition='outside',
                    name=temp.columns[2],
                    marker_color='crimson'
                    ))

    fig.update_layout(
        title='US Export of Plastic Scrap',
        xaxis_tickfont_size=14,
        yaxis=dict(
            title='USD (millions)',
            titlefont_size=16,
            tickfont_size=14,
        ),
        legend=dict(
            x=0,
            y=1.0,
            bgcolor='rgba(255, 255, 255, 0)',
            bordercolor='rgba(255, 255, 255, 0)'
        ),
        barmode='group',
        bargap=0.15, # gap between bars of adjacent location coordinates.
        bargroupgap=0.1 # gap between bars of the same location coordinate.
    )
    fig.show()

In [None]:
plot_virus_by_state(10, "LR", "all")

In [None]:
def plot_virus_inspection(virus):
    fig, ax = plt.subplots(figsize = (10,6))
    virus_columns = find_virus_columns(virus)
    df[df["S_STATE"].isin(frequent_state)].groupby("S_STATE").mean()[virus_columns].plot(kind = "barh", ax = ax)
    ax.tick_params(axis='x', rotation=0)
    ax.set_title("Virus across state")
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    plt.tight_layout()


plot_virus_inspection("MOS")

In [None]:
frequent_state = df["S_STATE"].value_counts()[:8].index.to_list()
frequent_state

In [None]:
df.columns[df.columns.str.contains("SR1")]

In [None]:
target_virus = ['SR1_ST','SR1_MIX',"SR1_LR"]

In [None]:
fig, ax = plt.subplots(figsize = (10,6))
df[df["S_STATE"].isin(frequent_state)].groupby("S_STATE").mean()[target_virus].plot(kind = "barh", ax = ax)
ax.tick_params(axis='x', rotation=0)
ax.set_title("Virus across state")
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.tight_layout()

plt.savefig("Virus_across_state")

In [None]:
frequent_variety = df["VARIETY"].value_counts()[:15].index.tolist()
frequent_variety

In [None]:
def plot_virus_by_variety(variety_number, virus, year = "all"):
    virus_columns = find_virus_columns(virus)
    if year == "all":
        temp = df  
    else:  
        temp = df[df["S_YR"] == year]
    
   
    
    frequent_variety = temp["VARIETY"].value_counts()[:variety_number].index.tolist()
    temp = temp[temp["VARIETY"].isin(frequent_variety)].groupby("VARIETY").mean()[virus_columns]
    fig = go.Figure()
    fig.add_trace(go.Bar(y=temp.index,
                    x = temp.iloc[:,0],
                    text=np.round(temp.iloc[:,0],3),
                    textposition='outside',
                    name=temp.columns[0],
                    marker_color='rgb(55, 83, 109)',
                    orientation='h'
                    ))
    fig.add_trace(go.Bar(y=temp.index,
                    x = temp.iloc[:,1],
                    text=np.round(temp.iloc[:,1],3),
                    textposition='outside',
                    name= temp.columns[1],
                    marker_color='rgb(26, 118, 255)',
                    orientation='h'
                    ))

    fig.add_trace(go.Bar(y=temp.index,
                    x = temp.iloc[:,2],
                    text=np.round(temp.iloc[:,2],3),
                    textposition='outside',
                    name=temp.columns[2],
                    marker_color='crimson',
                    orientation='h'
                    ))

    fig.update_layout(
        title='US Export of Plastic Scrap',
        xaxis_tickfont_size=14,
        yaxis=dict(
            title='USD (millions)',
            titlefont_size=16,
            tickfont_size=14,
        ),
        legend=dict(
            y=0,
            x=1.0,
            bgcolor='rgba(255, 255, 255, 0)',
            bordercolor='rgba(255, 255, 255, 0)'
        ),
        barmode='group',
        bargap=0.15, # gap between bars of adjacent location coordinates.
        bargroupgap=0.1 # gap between bars of the same location coordinate.
    )
    fig.show()

In [None]:
virus_columns = find_virus_columns("LR")
# if year == "all":
#     temp = df  
# else:  
#     temp = df[df["S_YR"] == year]


temp = df
frequent_variety = temp["VARIETY"].value_counts()[:15].index.tolist()
temp = temp[temp["VARIETY"].isin(frequent_variety)].groupby("VARIETY").mean()[virus_columns]
temp

In [None]:
plot_virus_by_variety(10, "LR", "all")

In [None]:
df["S_G"]

In [None]:
def plot_virus_by_variety(variety_number, virus, year = "all"):
    virus_columns = find_virus_columns(virus)
    if year == "all":
        temp = df  
    else:  
        temp = df[df["S_YR"] == year]
    
   
    
    frequent_variety = temp["S_G"].value_counts()[:variety_number].index.tolist()
    temp = temp[temp["S_G"].isin(frequent_variety)].groupby("S_G").mean()[virus_columns]
    fig = go.Figure()
    fig.add_trace(go.Bar(y=temp.index,
                    x = temp.iloc[:,0],
                    text=np.round(temp.iloc[:,0],3),
                    textposition='outside',
                    name=temp.columns[0],
                    marker_color='rgb(55, 83, 109)',
                    orientation='h'
                    ))
    fig.add_trace(go.Bar(y=temp.index,
                    x = temp.iloc[:,1],
                    text=np.round(temp.iloc[:,1],3),
                    textposition='outside',
                    name= temp.columns[1],
                    marker_color='rgb(26, 118, 255)',
                    orientation='h'
                    ))

    fig.add_trace(go.Bar(y=temp.index,
                    x = temp.iloc[:,2],
                    text=np.round(temp.iloc[:,2],3),
                    textposition='outside',
                    name=temp.columns[2],
                    marker_color='crimson',
                    orientation='h'
                    ))

    fig.update_layout(
        title='US Export of Plastic Scrap',
        xaxis_tickfont_size=14,
        yaxis=dict(
            title='USD (millions)',
            titlefont_size=16,
            tickfont_size=14,
        ),
        legend=dict(
            y=0,
            x=1.0,
            bgcolor='rgba(255, 255, 255, 0)',
            bordercolor='rgba(255, 255, 255, 0)'
        ),
        barmode='group',
        bargap=0.15, # gap between bars of adjacent location coordinates.
        bargroupgap=0.1 # gap between bars of the same location coordinate.
    )
    fig.show()

In [None]:
# fig, ax = plt.subplots(figsize = (10,6))
# df[df["VARIETY"].isin(frequent_variety)].groupby("VARIETY").mean()[find_virus_columns("LR")].plot(kind = "barh", ax = ax)
# ax.tick_params(axis='x', rotation=0)
# ax.tick_params(axis="y", rotation = 0)
# ax.set_title("Virus across potato variety")
# ax.spines['top'].set_visible(False)
# ax.spines['right'].set_visible(False)
# plt.tight_layout()
# plt.savefig("Virus_across_potato_variety")