# Imports 

In [137]:
## packages
import pandas as pd
import numpy as np
import os
from langdetect import detect, detect_langs

## define constants
GITHUB_DATA_PATH = "../data/raw_data/"

## repeated printouts
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Read in addendums data and combine

In [138]:
## read in multi-sheet excel file
all_foia_sheets = pd.read_excel("FOIA_2021-F-05932_raw_data.xlsx",
                               sheet_name= None)

print("There are %s FOIA sheets"% len(all_foia_sheets))

There are 2 FOIA sheets


In [139]:
## stored as dictionary with sheets as different keys; extract to inspect
fy20_addendums = all_foia_sheets['FOIA_2021-F-05932_FY2020']
fy21_q1_addendums = all_foia_sheets['FOIA_2021-F-05932_FY2021_Q1']

In [140]:
## confirm same columns
cols_20_not21 = set(fy20_addendums.columns).difference(fy21_q1_addendums.columns)
cols_21_not20 = set(fy21_q1_addendums.columns).difference(fy20_addendums.columns)

assert len(cols_20_not21) == 0
assert len(cols_21_not20) == 0 

## rowbind original dictionaries, drop fully duplicated rows, and remove index cols
addendum_init = pd.concat(all_foia_sheets).reset_index()
addendum_init2 = addendum_init.drop_duplicates(["CASE_NUMBER", "SECTION_DETAILS"]).copy()
addendum_init2['is_missing_all'] = (addendum_init2.SECTION_NAME.isnull()) & \
                                (addendum_init2.SECTION_NUMBER.isnull()) & \
                                (addendum_init2.SECTION_DETAILS.isnull())

## create filtered addendum df as: 
## (1) drop rows missing all details (later we'll do a left join to all disclosures) and 
## (2) remove levels columns from reset_index()
addendum = addendum_init2.loc[~addendum_init2.is_missing_all,
                             [col for col in addendum_init2.columns
                             if "levels" not in col]]

print("There are %s rows and %s unique job ids"% (addendum.shape[0], 
                                            len(addendum.CASE_NUMBER.unique())))

There are 99615 rows and 13530 unique job ids


# Clean fields

For now, leave unit of analysis as job-section dyad rather than concatenating / filtering to top sections at this phase

## Cleaning section name and section number

In [141]:

## explore relationship between section # and section name
addendum.SECTION_NAME.value_counts().head()
addendum.SECTION_NUMBER.value_counts().head()


Job Duties - null                         3596
Job Requirements                          2357
Job Duties                                1937
Job Duties - undefined                    1734
Inbound/Outbound Transportation - null    1641
Name: SECTION_NAME, dtype: int64

A.8a    35180
B.6     30519
F.2     12266
A.11    10285
F.1      4911
Name: SECTION_NUMBER, dtype: int64

In [142]:
## group by section # and see different section names
addendum.groupby('SECTION_NUMBER').agg({'SECTION_NAME': lambda x: "; ".join(sorted(x.astype(str).unique()))})

## later: use this to clean up section numbers and names --- eg B3 are different variants of drug screening

Unnamed: 0_level_0,SECTION_NAME
SECTION_NUMBER,Unnamed: 1_level_1
1,Contract Impossibility
2,Workers Compensation
3,Job Specifications
4,Transportation and Daily Subsistence
5,Transportation and Daily Subsistence continue..
6,Hours and Earnings Statements
16,JOB DESCRIPTION/REQUIREMENTS AND WORK RULES
-,- Continued from Section C
3.a.,Family Housing
8.a.,Job Duties - Production Standards


## Detecting Spanish language to figure out which ones to translate

Previous script used a manual list of Spanish keywords

Here, we should:
    
- First detect ones with any Spanish
- Use Google Cloud API to translate those

Started on step 1 with random sample and Eunice can pick up on step 2

In [193]:
sample_add = addendum.sample(n = 200, random_state = 91988)

## example true positive in spanish:
## CASE_NUMBER: H-300-20063-372516

## test language detection code on a couple examples
examples = sample_add.loc[sample_add.CASE_NUMBER.isin(["H-300-20063-372516",
                                                     "H-300-19316-139384"])].copy()
examples

### for eunice, not sure if robust enough 
### to deal with multiple languages in same part
### of text so might want to generalize
def detect_onestr(one_str):
    
    ## return list
    res = detect_langs(one_str)
    
    ## transform into a string and split on :
    split_res = str(res[0]).split(":")
    
    ## return split
    return(split_res)

## add language and probabilities to dataframe
examples['lang'] = [detect_onestr(one_str)[0] for one_str in examples.SECTION_DETAILS]

examples['lang_prob'] = [detect_onestr(one_str)[1] for one_str in examples.SECTION_DETAILS]

Unnamed: 0,level_0,level_1,CASE_NUMBER,SECTION_NAME,SECTION_NUMBER,SECTION_DETAILS,is_missing_all
4667,FOIA_2021-F-05932_FY2020,4667,H-300-19316-139384,Job Requirements,B.6,Housing and utilities are provided at no cost ...,False
53418,FOIA_2021-F-05932_FY2020,53418,H-300-20063-372516,Job Duties - Job Duties Spanish Version,A.8a,"Cosecha de Sandia a mano, el trabajador camina...",False


## Pseudo String Testing (More than One Language)

In [144]:
from langdetect import detect, DetectorFactory, detect_langs
import re

DetectorFactory.seed = 42

## first test the following strings has more than one language and 
## involving cases where there are 1-3 languages detected
detect_langs("hola, buenos dias! how are you")
detect_langs("muy bien. y tu? tengo hambre. i am good")
detect_langs("hola")

def detect_onestr(one_str):
    
    ## return list
    res = detect_langs(one_str)
    
    ## if the string has more than one language present
    if len(res)>1:
        split_res = str(res).split(":")
        split_res = str(split_res).split(",")
    
    else:
        ## transform into a string and split on :
        split_res = str(res).split(":")

    ## return split
    return(split_res)

## create a pseudo dataframe with the above three strings
data = {'String': ['hola, buenos dias! how are you', 'muy bien. y tu? tengo hambre. i am good', "hola"]}  
df = pd.DataFrame(data)  

## Run and clean the column (to avoid bracket and '' )
## seems to work well...
df['all']= [detect_onestr(one_str) for one_str in df.String]
df["lang1"] = df["all"].str[0].str.replace('[^\w\s]','')
df["lang1_prob"] = df["all"].str[1].str.replace('[^0-9,.]', '')
df["lang2"] = df["all"].str[2].str.replace('[^\w\s]','')
df["lang2_prob"] = df["all"].str[3].str.replace('[^0-9,.]', '')
df["lang3"] = df["all"].str[4].str.replace('[^\w\s]','')
df["lang3_prob"] = df["all"].str[5].str.replace('[^0-9,.]', '')
df



[es:0.8571420716089807, en:0.14285752156310932]

[es:0.42857157935666595, cy:0.4285702924315681, tl:0.1428566075204015]

[cy:0.9999943872975154]

  df["lang1"] = df["all"].str[0].str.replace('[^\w\s]','')
  df["lang1_prob"] = df["all"].str[1].str.replace('[^0-9,.]', '')
  df["lang2"] = df["all"].str[2].str.replace('[^\w\s]','')
  df["lang2_prob"] = df["all"].str[3].str.replace('[^0-9,.]', '')
  df["lang3"] = df["all"].str[4].str.replace('[^\w\s]','')
  df["lang3_prob"] = df["all"].str[5].str.replace('[^0-9,.]', '')


Unnamed: 0,String,all,lang1,lang1_prob,lang2,lang2_prob,lang3,lang3_prob
0,"hola, buenos dias! how are you","[['[es', '0.8571420716089807, en', '0.14285...",es,0.8571420716089807,en,0.1428575215631093,,
1,muy bien. y tu? tengo hambre. i am good,"[['[es', '0.42857157935666595, cy', '0.4285...",es,0.4285715793566659,cy,0.4285702924315681,tl,0.1428566075204015
2,hola,"[[cy, 0.9999943872975154]]",cy,0.9999943872975154,,,,


## Clean NaN out from Section_Details As Can't Run detect_langs on those

In [228]:
## Cant run detect_langs on NaN rows of SECTION_DETAILS 

## Examine the NaN rows
print(addendum[addendum['SECTION_DETAILS'].isnull()])
# 11 NaN rows
len(addendum[addendum['SECTION_DETAILS'].isnull()].index)
# 99615 rows originally with NaN
len(addendum.index)

## filter to non NaN one
addendum_wdetails = addendum[addendum['SECTION_DETAILS'].notna()]
# check the number of rows 99604 (correct!)
len(addendum_wdetails.index)

                        level_0  level_1         CASE_NUMBER  \
30371  FOIA_2021-F-05932_FY2020    30371  H-300-20021-266722   
30720  FOIA_2021-F-05932_FY2020    30720  H-300-20028-281642   
33855  FOIA_2021-F-05932_FY2020    33855  H-300-20031-289056   
34922  FOIA_2021-F-05932_FY2020    34922  H-300-20034-292488   
43789  FOIA_2021-F-05932_FY2020    43789  H-300-20038-305174   
53686  FOIA_2021-F-05932_FY2020    53686  H-300-20084-429274   
54044  FOIA_2021-F-05932_FY2020    54044  H-300-20085-434826   
58928  FOIA_2021-F-05932_FY2020    58928  H-300-20107-492731   
76987  FOIA_2021-F-05932_FY2020    76987  H-300-20184-692496   
81901  FOIA_2021-F-05932_FY2020    81901  H-300-20197-715946   
85390  FOIA_2021-F-05932_FY2020    85390  H-300-20247-804602   

                                            SECTION_NAME SECTION_NUMBER  \
30371  Inbound/Outbound Transportation - Inbound/Outb...            F.2   
30720  Job Requirements - Additional Information Rega...            B.6   
33855 

11

99615

99604

## Applied the detect_langs to all the job posting
- incorporate except because detect_langs wont run on strings that do not have features


In [230]:
def detect_onestr(one_str):
    
    try:
        ## if the string has more than one language present
        ## return list
        res = detect_langs(one_str)
        if len(res)>1:
            split_res = str(res).split(":")
            split_res = str(split_res).split(",")

        else:
            ## transform into a string and split on :
            split_res = str(res).split(":")
    except:
        split_res="notdetectable"
    ## return split
    return(split_res)

## Take me about 10 minutes to run
addendum_wdetails['all']= [detect_onestr(one_str) for one_str in addendum_wdetails.SECTION_DETAILS]




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  addendum_wdetails['all']= [detect_onestr(one_str) for one_str in addendum_wdetails.SECTION_DETAILS]


## Remove notdectable job postings

In [252]:
## 66 rows' sectiondetails are not detectable
addendum_wdetails['all']=addendum_wdetails['all'].astype("string")
addendum_wdetails.loc[addendum_wdetails['all'] == "notdetectable"]
## filter out the notdetectable one
addendum_wdetails_cleaned=addendum_wdetails.loc[addendum_wdetails['all'] != "notdetectable"]
## Check the rows (99604-66=99538)
len(addendum_wdetails_cleaned.index)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  addendum_wdetails['all']=addendum_wdetails['all'].astype("string")


Unnamed: 0,level_0,level_1,CASE_NUMBER,SECTION_NAME,SECTION_NUMBER,SECTION_DETAILS,is_missing_all,all
8925,FOIA_2021-F-05932_FY2020,8925,H-300-19361-219122,Job Requirements - NJ Housing Disclose,B.6,\n,False,notdetectable
28866,FOIA_2021-F-05932_FY2020,28866,H-300-20015-254586,Job Requirements - NJ Housing Disclose,B.6,\n,False,notdetectable
33821,FOIA_2021-F-05932_FY2020,33821,H-300-20031-288294,Pay Deductions - California Tax ID,A.11,51511293,False,notdetectable
34953,FOIA_2021-F-05932_FY2020,34953,H-300-20030-288157,Pay Deductions - California Tax ID,A.11,51511293,False,notdetectable
39537,FOIA_2021-F-05932_FY2020,39537,H-300-20042-312493,Pay Deductions - California Tax ID,A.11,515-50853,False,notdetectable
...,...,...,...,...,...,...,...,...
95317,FOIA_2021-F-05932_FY2021_Q1,9836,H-300-20303-892180,Pay Deductions - State Tax ID,A.11,013-6895-0,False,notdetectable
96854,FOIA_2021-F-05932_FY2021_Q1,11373,H-300-20318-911092,Pay Deductions - California Tax ID,A.11,51093243,False,notdetectable
97551,FOIA_2021-F-05932_FY2021_Q1,12070,H-300-20307-895674,Pay Deductions - California Tax ID,A.11,36811373,False,notdetectable
98132,FOIA_2021-F-05932_FY2021_Q1,12651,H-300-20311-903422,Referral and Hiring Instructions - CA Tax ID,G.1,120-9764-8,False,notdetectable


99538

## Generate test case to check whether the splitting and regex works well 


In [253]:
## Get the cases where there's more than 1 languages so we can check later after extraction
## H-300-19277-071813, H-300-19277-071813
addendum_wdetails_cleaned[addendum_wdetails_cleaned['all'].map(len) >2]

## test language detection code on a couple examples
test = addendum_wdetails_cleaned.loc[addendum_wdetails_cleaned.CASE_NUMBER.isin(["H-300-19277-071813",
                                                     "H-300-19277-071813"])].copy()
test.head()

## Clean out the punctuation
test["all"]=test["all"].str.replace('[^\w.]+',' ')

## replace more than one white space as one white space 
## (the reason why I am doing this is because if not then when extract
## the different values, the white space would count as one value)
test['all'] = test['all'].apply(lambda x: ' '.join(str(x).split()))

## Split by single white space so can ultimately extract three different lang
test["all"]=test["all"].str.split(" ")

test["lang1"] = test["all"].str[0]
test["lang1_prob"] = test["all"].str[1]
test["lang2"] = test["all"].str[2]
test["lang2_prob"] = test["all"].str[3]
## Seems to work well :)
test.head()


Unnamed: 0,level_0,level_1,CASE_NUMBER,SECTION_NAME,SECTION_NUMBER,SECTION_DETAILS,is_missing_all,all
0,FOIA_2021-F-05932_FY2020,0,H-300-19274-066174,Meal Provision,E.1,Employer will furnish free and convenient cook...,False,"['[en', '0.9999991220220414]']"
1,FOIA_2021-F-05932_FY2020,1,H-300-19274-066174,Job Requirements,B.6,Three (3) months experience with references re...,False,"['[en', '0.9999959646773686]']"
2,FOIA_2021-F-05932_FY2020,2,H-300-19274-066174,Daily Transportation,F.1,Living & laundry facilities available. Housing...,False,"['[en', '0.9999964269685038]']"
3,FOIA_2021-F-05932_FY2020,3,H-300-19274-066174,Job Requirements,B.6,Should the Employers worker’s compensation ins...,False,"['[en', '0.999994357030667]']"
4,FOIA_2021-F-05932_FY2020,4,H-300-19274-066174,Daily Transportation,F.1,The employer shall provide transportation in t...,False,"['[en', '0.9999982081048706]']"
...,...,...,...,...,...,...,...,...
102312,FOIA_2021-F-05932_FY2021_Q1,16831,H-300-20338-936807,Job Duties - REGLAS DE LA OCUPACIÓN DE LA VIVI...,A.8a,La unidad de vivienda puede ser inspeccionada ...,False,"['[es', '0.9999972389191564]']"
102313,FOIA_2021-F-05932_FY2021_Q1,16832,H-300-20344-946960,Job Duties - A.8a Additional Job Duties Inform...,A.8a,Other Job Specifications Include:\n1.\tThe wor...,False,"['[en', '0.9999959299056533]']"
102314,FOIA_2021-F-05932_FY2021_Q1,16833,H-300-20343-941481,Job Requirements - null,B.6,may experience occasional exposure to hazards ...,False,"['[en', '0.9999948822434462]']"
102315,FOIA_2021-F-05932_FY2021_Q1,16834,H-300-20343-941225,Inbound/Outbound Transportation - Transportation,F.2,Workers who qualify for inbound and/or outboun...,False,"['[en', '0.9999952697843925]']"


Unnamed: 0,level_0,level_1,CASE_NUMBER,SECTION_NAME,SECTION_NUMBER,SECTION_DETAILS,is_missing_all,all
100,FOIA_2021-F-05932_FY2020,100,H-300-19277-071813,Job Requirements,B.6,Employer guarantees to offer workers employmen...,False,"[""['[en'"", "" '0.5714275064811775"", "" es'"", "" '..."
101,FOIA_2021-F-05932_FY2020,101,H-300-19277-071813,Job Duties,A.8a,Si un trabajador abandona voluntariamente el e...,False,"['[es', '0.999996324226172]']"
102,FOIA_2021-F-05932_FY2020,102,H-300-19277-071813,Job Duties,A.8a,This seasonal housing is offered to you by the...,False,"['[en', '0.9999982462352098]']"
104,FOIA_2021-F-05932_FY2020,104,H-300-19277-071813,Job Duties,A.8a,El empleador acuerda pagar a los trabajadores ...,False,"['[es', '0.999998189717358]']"
105,FOIA_2021-F-05932_FY2020,105,H-300-19277-071813,Job Duties,A.8a,El empleador acepta proporcionar los siguiente...,False,"['[es', '0.999995977596861]']"


  test["all"]=test["all"].str.replace('[^\w\s]','')


Unnamed: 0,level_0,level_1,CASE_NUMBER,SECTION_NAME,SECTION_NUMBER,SECTION_DETAILS,is_missing_all,all,lang1,lang1_prob,lang2,lang2_prob
100,FOIA_2021-F-05932_FY2020,100,H-300-19277-071813,Job Requirements,B.6,Employer guarantees to offer workers employmen...,False,"[en, 05714275064811775, es, 04285709381703644]",en,5714275064811775,es,4285709381703644.0
101,FOIA_2021-F-05932_FY2020,101,H-300-19277-071813,Job Duties,A.8a,Si un trabajador abandona voluntariamente el e...,False,"[es, 0999996324226172]",es,999996324226172,,
102,FOIA_2021-F-05932_FY2020,102,H-300-19277-071813,Job Duties,A.8a,This seasonal housing is offered to you by the...,False,"[en, 09999982462352098]",en,9999982462352098,,
104,FOIA_2021-F-05932_FY2020,104,H-300-19277-071813,Job Duties,A.8a,El empleador acuerda pagar a los trabajadores ...,False,"[es, 0999998189717358]",es,999998189717358,,
105,FOIA_2021-F-05932_FY2020,105,H-300-19277-071813,Job Duties,A.8a,El empleador acepta proporcionar los siguiente...,False,"[es, 0999995977596861]",es,999995977596861,,


## Apply the Regex and Splitting to the Job Postings

In [254]:
# Clean out the punctuation
addendum_wdetails_cleaned["all"]=addendum_wdetails_cleaned["all"].str.replace('[^\w.]+',' ')
addendum_wdetails_cleaned.head()

## replace more than one white space as one white space 
## (the reason why I am doing this is because if not then when extract
## the different values, the white space would count as one value)
addendum_wdetails_cleaned['all'] = addendum_wdetails_cleaned['all'].apply(lambda x: ' '.join(str(x).split()))

## Split by white space so can ultimately extract three different lang
addendum_wdetails_cleaned["all"]=addendum_wdetails_cleaned["all"].str.split(" ")
addendum_wdetails_cleaned.head()

## Extract into different columns
addendum_wdetails_cleaned["lang1"] = addendum_wdetails_cleaned["all"].str[0]
addendum_wdetails_cleaned["lang1_prob"] = addendum_wdetails_cleaned["all"].str[1]
addendum_wdetails_cleaned["lang2"] = addendum_wdetails_cleaned["all"].str[2]
addendum_wdetails_cleaned["lang2_prob"] = addendum_wdetails_cleaned["all"].str[3]
addendum_wdetails_cleaned["lang3"] = addendum_wdetails_cleaned["all"].str[4]
addendum_wdetails_cleaned["lang3_prob"] = addendum_wdetails_cleaned["all"].str[5]
addendum_wdetails_cleaned.head()



  addendum_wdetails_cleaned["all"]=addendum_wdetails_cleaned["all"].str.replace('[^\w.]+',' ')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  addendum_wdetails_cleaned["all"]=addendum_wdetails_cleaned["all"].str.replace('[^\w.]+',' ')


Unnamed: 0,level_0,level_1,CASE_NUMBER,SECTION_NAME,SECTION_NUMBER,SECTION_DETAILS,is_missing_all,all
0,FOIA_2021-F-05932_FY2020,0,H-300-19274-066174,Meal Provision,E.1,Employer will furnish free and convenient cook...,False,en 0.9999991220220414
1,FOIA_2021-F-05932_FY2020,1,H-300-19274-066174,Job Requirements,B.6,Three (3) months experience with references re...,False,en 0.9999959646773686
2,FOIA_2021-F-05932_FY2020,2,H-300-19274-066174,Daily Transportation,F.1,Living & laundry facilities available. Housing...,False,en 0.9999964269685038
3,FOIA_2021-F-05932_FY2020,3,H-300-19274-066174,Job Requirements,B.6,Should the Employers worker’s compensation ins...,False,en 0.999994357030667
4,FOIA_2021-F-05932_FY2020,4,H-300-19274-066174,Daily Transportation,F.1,The employer shall provide transportation in t...,False,en 0.9999982081048706


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  addendum_wdetails_cleaned['all'] = addendum_wdetails_cleaned['all'].apply(lambda x: ' '.join(str(x).split()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  addendum_wdetails_cleaned["all"]=addendum_wdetails_cleaned["all"].str.split(" ")


Unnamed: 0,level_0,level_1,CASE_NUMBER,SECTION_NAME,SECTION_NUMBER,SECTION_DETAILS,is_missing_all,all
0,FOIA_2021-F-05932_FY2020,0,H-300-19274-066174,Meal Provision,E.1,Employer will furnish free and convenient cook...,False,"[en, 0.9999991220220414]"
1,FOIA_2021-F-05932_FY2020,1,H-300-19274-066174,Job Requirements,B.6,Three (3) months experience with references re...,False,"[en, 0.9999959646773686]"
2,FOIA_2021-F-05932_FY2020,2,H-300-19274-066174,Daily Transportation,F.1,Living & laundry facilities available. Housing...,False,"[en, 0.9999964269685038]"
3,FOIA_2021-F-05932_FY2020,3,H-300-19274-066174,Job Requirements,B.6,Should the Employers worker’s compensation ins...,False,"[en, 0.999994357030667]"
4,FOIA_2021-F-05932_FY2020,4,H-300-19274-066174,Daily Transportation,F.1,The employer shall provide transportation in t...,False,"[en, 0.9999982081048706]"


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  addendum_wdetails_cleaned["lang1"] = addendum_wdetails_cleaned["all"].str[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  addendum_wdetails_cleaned["lang1_prob"] = addendum_wdetails_cleaned["all"].str[1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  addendum_wdetails_cleaned["lang2"] = addendum_

Unnamed: 0,level_0,level_1,CASE_NUMBER,SECTION_NAME,SECTION_NUMBER,SECTION_DETAILS,is_missing_all,all,lang1,lang1_prob,lang2,lang2_prob,lang3,lang3_prob
0,FOIA_2021-F-05932_FY2020,0,H-300-19274-066174,Meal Provision,E.1,Employer will furnish free and convenient cook...,False,"[en, 0.9999991220220414]",en,0.9999991220220414,,,,
1,FOIA_2021-F-05932_FY2020,1,H-300-19274-066174,Job Requirements,B.6,Three (3) months experience with references re...,False,"[en, 0.9999959646773686]",en,0.9999959646773686,,,,
2,FOIA_2021-F-05932_FY2020,2,H-300-19274-066174,Daily Transportation,F.1,Living & laundry facilities available. Housing...,False,"[en, 0.9999964269685038]",en,0.9999964269685038,,,,
3,FOIA_2021-F-05932_FY2020,3,H-300-19274-066174,Job Requirements,B.6,Should the Employers worker’s compensation ins...,False,"[en, 0.999994357030667]",en,0.999994357030667,,,,
4,FOIA_2021-F-05932_FY2020,4,H-300-19274-066174,Daily Transportation,F.1,The employer shall provide transportation in t...,False,"[en, 0.9999982081048706]",en,0.9999982081048706,,,,


In [255]:
## Check again!
addendum_wdetails_cleaned[addendum_wdetails_cleaned['all'].map(len) >2]

Unnamed: 0,level_0,level_1,CASE_NUMBER,SECTION_NAME,SECTION_NUMBER,SECTION_DETAILS,is_missing_all,all,lang1,lang1_prob,lang2,lang2_prob,lang3,lang3_prob
100,FOIA_2021-F-05932_FY2020,100,H-300-19277-071813,Job Requirements,B.6,Employer guarantees to offer workers employmen...,False,"[en, 0.5714275064811775, es, 0.4285709381703644]",en,0.5714275064811775,es,0.4285709381703644,,
116,FOIA_2021-F-05932_FY2020,116,H-300-19277-071813,Job Duties,A.8a,El empleador asegura que esta oportunidad de t...,False,"[en, 0.5714285380388977, es, 0.4285694805272257]",en,0.5714285380388977,es,0.4285694805272257,,
119,FOIA_2021-F-05932_FY2020,119,H-300-19277-071813,Inbound/Outbound Transportation,F.2,For those workers recruited outside the area o...,False,"[en, 0.7142817868181305, es, 0.2857171496526972]",en,0.7142817868181305,es,0.2857171496526972,,
120,FOIA_2021-F-05932_FY2020,120,H-300-19277-071813,Job Requirements,B.6,"The employer will report workers who, a) volun...",False,"[en, 0.7142848289210836, es, 0.28571358768467287]",en,0.7142848289210836,es,0.28571358768467287,,
123,FOIA_2021-F-05932_FY2020,123,H-300-19277-071813,Daily Transportation,F.1,Transportation between living quarters and wor...,False,"[en, 0.5714281522430635, es, 0.428571099311984]",en,0.5714281522430635,es,0.428571099311984,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102226,FOIA_2021-F-05932_FY2021_Q1,16745,H-300-20338-936807,Job Duties - null,A.8a,Employer guarantees to offer workers employmen...,False,"[en, 0.57142746869369, es, 0.42857092914894424]",en,0.57142746869369,es,0.42857092914894424,,
102234,FOIA_2021-F-05932_FY2021_Q1,16753,H-300-20338-936807,Job Duties - null,A.8a,"Daily individual work assignments, crew assign...",False,"[es, 0.7142812999546606, en, 0.2857166927605691]",es,0.7142812999546606,en,0.2857166927605691,,
102262,FOIA_2021-F-05932_FY2021_Q1,16781,H-300-20339-938641,Job Duties - Other Terms and Conditions,A.8a,Material terms and conditions of the work cont...,False,"[es, 0.7142831831529348, en, 0.2857150822249374]",es,0.7142831831529348,en,0.2857150822249374,,
102286,FOIA_2021-F-05932_FY2021_Q1,16805,H-300-20335-930680,Job Duties - null,A.8a,SUBSTANCE ABUSE POLICY: The use or possession ...,False,"[es, 0.8571389803077235, en, 0.14285797864426947]",es,0.8571389803077235,en,0.14285797864426947,,
