Text analysis module difflib comes with Python 3.
If you don't already have pandas, install by running either "pip install pandas" or "conda install pandas" (if you're installing with Anaconda) in the command line.  See [here](https://docs.python.org/3/installing/index.html) for more info.

In [1]:
from difflib import SequenceMatcher
import difflib
import pandas as pd
import numpy as np

In [2]:
# Display all outputs from a chunk, not just last ouptut
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
# Import cleaned text descriptions
proj_descs = pd.read_csv("Text descriptions differ.csv")

In [68]:
# Create variable with match ratio of two strings

match_ratio = [] # empty list

for i in range(len(proj_descs)):
    # Fill empty list with dictionary containing Project ID and match ratio between two descriptions
    match_ratio.append({SequenceMatcher(None, proj_descs['Draft_desc'][i], proj_descs['Final_desc'][i]).ratio(), proj_descs['Projnum'][i]})
    
    # Convert dictionary to dataframe, with columns "Match_ratio" and "Projnum"
    ratio_df = pd.DataFrame(match_ratio,
                           columns = ['Match_ratio',
                                     'Projnum'])

In [69]:
# Check outputs
ratio_df

Unnamed: 0,Match_ratio,Projnum
0,0.970874,880M-CA-20A
1,0.970874,880M-CA-21A
2,0.970874,880M-CA-22A
3,0.660714,071-070-040
4,0.889908,086-619-034
5,0.855615,7102-135
6,0.80597,2710-47B
7,0.96,002-614-044
8,0.97019,027-681-035
9,0.980392,2750-89


Using a similar method to 'mutate' is preferable, since we want the original dataframe with the additional variable.  Do this with lambda.

In [5]:
proj_descs['match_ratio'] = proj_descs.apply(lambda proj_descs:
                                             SequenceMatcher(None,
                                                             proj_descs['Draft_desc'],
                                                             proj_descs['Final_desc']).ratio(), axis = 1)

In [6]:
proj_descs.head(10)

Unnamed: 0,Projnum,Draft_desc,Final_desc,Draft_total,Final_total,match_ratio
0,880M-CA-20A,17new districtwide setaside project delivery ...,17new districtwide setaside project delivery ...,5800000,5800000,0.970874
1,880M-CA-21A,17new districtwide setaside project delivery ...,17new districtwide setaside project delivery ...,3050000,3050000,0.970874
2,880M-CA-22A,17new districtwide setaside project delivery ...,17new districtwide setaside project delivery ...,3150000,3150000,0.970874
3,071-070-040,"sherburne csah 13, construct roundabout at cr ...","ac sherburne csah 13, construct roundabout at...",1000000,2250000,0.660714
4,086-619-034,"wright county csah 19, from lamplight dr to n ...","ac wright county csah 19, from lamplight dr t...",5000000,5000000,0.889908
5,7102-135,"us 10, from xenia ave st to norfolk ave in elk...","us 10, from xenia ave st to norfolk ave in elk...",8750000,8750000,0.855615
6,2710-47B,"cocii mn65, at bridge #2440 (3rd ave s) over ...","cmgc mn65, at bridge #2440 (3rd ave s) over m...",701552,32100000,0.80597
7,002-614-044,"ac csah 14, 0.15 mi e of csah 18, bridge 0201...","ac csah 14, 0.15 mi e of csah 18, bridge 0201...",800000,1500000,0.96
8,027-681-035,"csah 81, 0.04 mile n of 71st ave (csah 8) to 0...","csah 81, 0.04 mile n of 71st ave (csah 8) to 0...",20421000,20421000,0.97019
9,2750-89,"ada us169, at hennepin csah 130 (77th ave n/e...","us169, at hennepin csah 130 (77th ave n/elm cr...",977000,977000,0.980392


Is it possible to simply look at the differences between the two strings?

In [116]:
for line in enumerate(difflib.ndiff(proj_descs['Draft_desc'], proj_descs['Final_desc'])):
    print (line)

-  17new districtwide setaside project delivery fy20
+  17new districtwide setaside project delivery fy 2020
?                                                 +  ++

-  17new districtwide setaside project delivery fy21
+  17new districtwide setaside project delivery fy 2021
?                                                 +++

-  17new districtwide setaside project delivery fy22
+  17new districtwide setaside project delivery fy 2022
?                                                 +++

- sherburne csah 13, construct roundabout at cr 40 intersection in elk river
+  ac sherburne csah 13, construct roundabout at sherburne cr 40 intersection and construct roundabout at sherburne co csah 33 intersection in elk river
- wright county csah 19, from lamplight dr to n of 70th st in albertville, extend multilane roadway
+  ac wright county csah 19, from lamplight dr to n of 70th st in albertville, extend multilane roadway(ac payback in 2020)
? ++++                                              

There are quite a few strings where a couple additions at the beginning of the string shift the entire statement downward, making everything look like an addition.  Instead, collect blocks where text is the same, and then remove these blocks from the original strings to obtain the difference between the two.

In [168]:
output_list = [li for li in difflib.ndiff(proj_descs['Draft_desc'][3], proj_descs['Final_desc'][3]) if li[0] != ' ']
words = [w.replace(' ', '').replace('+', '') for w in output_list]
#words
merged_list = [''.join(words)]
merged_list

#str_no_plus = merged_list[0].replace('+', '')
#str_no_plus.replace(' ', '')

['acsherburneintersectionandconstructroundaboutatsherburnecocsah33']

In [102]:
string1 = proj_descs['Draft_desc'][8]
string2 = proj_descs['Final_desc'][8]
projnum = proj_descs['Projnum'][8]

matches = difflib.SequenceMatcher(
    None, string1, string2).get_matching_blocks()

match_lista = []
match_listb = []
for match in matches:
    match_lista.append(string1[match.a:match.a+match.size])
    match_listb.append(string2[match.b:match.b+match.size])

i = 0
while i <len(match_lista):
    string1 = string1.replace(match_lista[i], '')
    string2 = string2.replace(match_listb[i], '')
    i += 1
    
strings = pd.DataFrame({'Final_draft': [string1],
                        'Draft_final': [string2],
                        'Projnum': [projnum]})
strings

Unnamed: 0,Draft_final,Final_draft,Projnum
0,1n5th,04s3rd,027-681-035


In [103]:
match_lista

['csah 81, 0.04 mile n of 71st ave (csah 8) to 0.',
 ' mile ',
 ' of 8',
 ' ave in brooklyn park reconstruct from four lane divided rural roadway to six lane divided urban roadway, multi use trail',
 '']

That's great!  We got all the code to work for a single row.  Now let's create a function so we can iterate over all the rows in the input df.

In [90]:
# Create a function that can be iterated with
def find_difference(x):
    string1 = proj_descs['Draft_desc'][x]
    string2 = proj_descs['Final_desc'][x]
    projnum = proj_descs['Projnum'][x]

    matches = difflib.SequenceMatcher(
        None, string1, string2).get_matching_blocks()

    match_lista = []
    match_listb = []
    for match in matches:
        match_lista.append(string1[match.a:match.a+match.size])
        match_listb.append(string2[match.b:match.b+match.size])
    
    i = 0
    while i <len(match_lista):
        string1 = string1.replace(match_lista[i], '')
        string2 = string2.replace(match_listb[i], '')
        i += 1
    
    strings = pd.DataFrame({'Deletion_from_draft': [string1],
                        'Addition_in_final': [string2],
                        'Projnum': [projnum]})
    return(strings)

In [69]:
diff_21 = find_difference(21)

In [91]:
differences = []

for i in range(len(proj_descs)):
    differences.append(find_difference(i))

In [92]:
diff_df = pd.concat(differences)

In [93]:
diff_df

Unnamed: 0,Addition_in_final,Deletion_from_draft,Projnum
0,,,880M-CA-20A
0,20,,880M-CA-21A
0,20,,880M-CA-22A
0,ac sherburne intersection and construct roun...,,071-070-040
0,ac (ac payback in 2020),,086-619-034
0,(including bike/ped trail),,7102-135
0,mg 2440,oiidesign of major struture of,2710-47B
0,"is rehab pier caps, replace eck panels (ac pro...","is rehab pier caps, replace eck panels (ac pro...",002-614-044
0,1n5th,04s3rd,027-681-035
0,,ada,2750-89


In [94]:
proj_comparisons = pd.merge(proj_descs, diff_df, on = 'Projnum')

In [95]:
pd.DataFrame.to_csv(proj_comparisons, "Differing Descriptions Comparison.csv")