In [5]:
# modules we'll use
import pandas as pd
import numpy as np
import os
# helpful modules
import fuzzywuzzy
from fuzzywuzzy import process
import chardet

In [6]:
# sample the first 100,000 bytes of the file (the amount actually read below)
# and let chardet guess the character encoding from that sample
csv_path = os.path.join(os.getcwd(), "Data/Day4/PakistanSuicideAttacks Ver 11 (30-November-2017).csv")
with open(csv_path, 'rb') as rawdata:
    sample = rawdata.read(100000)
    result = chardet.detect(sample)

# show chardet's guess (encoding name plus a confidence score)
print(result)

{'confidence': 0.73, 'language': '', 'encoding': 'Windows-1252'}


In [7]:
# read in our data, decoding the file as Windows-1252
data_file = os.path.join(os.getcwd(), "Data/Day4/PakistanSuicideAttacks Ver 11 (30-November-2017).csv")
suicide_attacks = pd.read_csv(data_file, encoding='Windows-1252')

In [8]:
from IPython.display import display

In [9]:
# preview the first five rows (the default for head) with rich display
display(suicide_attacks.head(n=5))

Unnamed: 0,S#,Date,Islamic Date,Blast Day Type,Holiday Type,Time,City,Latitude,Longitude,Province,...,Targeted Sect if any,Killed Min,Killed Max,Injured Min,Injured Max,No. of Suicide Blasts,Explosive Weight (max),Hospital Names,Temperature(C),Temperature(F)
0,1,Sunday-November 19-1995,25 Jumaada al-THaany 1416 A.H,Holiday,Weekend,,Islamabad,33.718,73.0718,Capital,...,,14.0,15.0,,60,2.0,,,15.835,60.503
1,2,Monday-November 6-2000,10 SHa`baan 1421 A.H,Working Day,,,Karachi,24.9918,66.9911,Sindh,...,,,3.0,,3,1.0,,,23.77,74.786
2,3,Wednesday-May 8-2002,25 safar 1423 A.H,Working Day,,7:45 AM,Karachi,24.9918,66.9911,Sindh,...,Christian,13.0,15.0,20.0,40,1.0,2.5 Kg,1.Jinnah Postgraduate Medical Center 2. Civil ...,31.46,88.628
3,4,Friday-June 14-2002,3 Raby` al-THaany 1423 A.H,Working Day,,11:10:00 AM,Karachi,24.9918,66.9911,Sindh,...,Christian,,12.0,,51,1.0,,,31.43,88.574
4,5,Friday-July 4-2003,4 Jumaada al-awal 1424 A.H,Working Day,,,Quetta,30.2095,67.0182,Baluchistan,...,Shiite,44.0,47.0,,65,1.0,,1.CMH Quetta \n2.Civil Hospital 3. Boland Medi...,33.12,91.616


In [10]:
# collect every distinct value of the 'City' column, sorted alphabetically,
# so that inconsistent spellings end up next to each other
cities = np.sort(suicide_attacks['City'].unique())

# take a closer look
cities

array([u'ATTOCK', u'Attock ', u'Bajaur Agency', u'Bannu', u'Bhakkar ',
       u'Buner', u'Chakwal ', u'Chaman', u'Charsadda', u'Charsadda ',
       u'D. I Khan', u'D.G Khan', u'D.G Khan ', u'D.I Khan', u'D.I Khan ',
       u'Dara Adam Khel', u'Dara Adam khel', u'Fateh Jang',
       u'Ghallanai, Mohmand Agency ', u'Gujrat', u'Hangu', u'Haripur',
       u'Hayatabad', u'Islamabad', u'Islamabad ', u'Jacobabad',
       u'KURRAM AGENCY', u'Karachi', u'Karachi ', u'Karak', u'Khanewal',
       u'Khuzdar', u'Khyber Agency', u'Khyber Agency ', u'Kohat',
       u'Kohat ', u'Kuram Agency ', u'Lahore', u'Lahore ', u'Lakki Marwat',
       u'Lakki marwat', u'Lasbela', u'Lower Dir', u'MULTAN', u'Malakand ',
       u'Mansehra', u'Mardan', u'Mohmand Agency', u'Mohmand Agency ',
       u'Mohmand agency', u'Mosal Kor, Mohmand Agency', u'Multan',
       u'Muzaffarabad', u'North Waziristan', u'North waziristan',
       u'Nowshehra', u'Orakzai Agency', u'Peshawar', u'Peshawar ',
       u'Pishin', u'Poonch', 

In [11]:
# normalise the city names: convert to lower case, then trim surrounding
# white space (str.strip removes leading as well as trailing spaces)
suicide_attacks['City'] = suicide_attacks['City'].str.lower().str.strip()

In [12]:
# collect the distinct values of the 'City' column again, sorted
# alphabetically, to see which inconsistencies are still left
cities = np.sort(suicide_attacks['City'].unique())

# take a closer look
cities

array([u'attock', u'bajaur agency', u'bannu', u'bhakkar', u'buner',
       u'chakwal', u'chaman', u'charsadda', u'd. i khan', u'd.g khan',
       u'd.i khan', u'dara adam khel', u'fateh jang',
       u'ghallanai, mohmand agency', u'gujrat', u'hangu', u'haripur',
       u'hayatabad', u'islamabad', u'jacobabad', u'karachi', u'karak',
       u'khanewal', u'khuzdar', u'khyber agency', u'kohat',
       u'kuram agency', u'kurram agency', u'lahore', u'lakki marwat',
       u'lasbela', u'lower dir', u'malakand', u'mansehra', u'mardan',
       u'mohmand agency', u'mosal kor, mohmand agency', u'multan',
       u'muzaffarabad', u'north waziristan', u'nowshehra',
       u'orakzai agency', u'peshawar', u'pishin', u'poonch', u'quetta',
       u'rawalpindi', u'sargodha', u'sehwan town', u'shabqadar-charsadda',
       u'shangla', u'shikarpur', u'sialkot', u'south waziristan',
       u'sudhanoti', u'sukkur', u'swabi', u'swat', u'taftan',
       u'tangi, charsadda district', u'tank', u'taunsa', u'tirah 

In [13]:
# score every city name against "d.i khan" with the token-sort ratio
# and keep the ten best-scoring candidates
matches = fuzzywuzzy.process.extract(
    "d.i khan",
    cities,
    limit=10,
    scorer=fuzzywuzzy.fuzz.token_sort_ratio
)

# take a look at them: a list of (candidate, score) pairs
matches

[(u'd. i khan', 100),
 (u'd.i khan', 100),
 (u'd.g khan', 88),
 (u'khanewal', 50),
 (u'sudhanoti', 47),
 (u'hangu', 46),
 (u'kohat', 46),
 (u'dara adam khel', 45),
 (u'chaman', 43),
 (u'mardan', 43)]

In [14]:
def replace_matches_in_column(df, column, string_to_match, min_ratio = 90, limit = 10):
    """Replace near-duplicate spellings in ``df[column]`` with ``string_to_match``.

    The distinct, non-missing values of the column are scored against
    ``string_to_match`` with fuzzywuzzy's token-sort ratio. Every row whose
    value is one of the candidates scoring at least ``min_ratio`` is
    overwritten with ``string_to_match``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    column : str
        Name of the column holding the strings to standardise.
    string_to_match : str
        Canonical spelling; also the value written into matching rows.
    min_ratio : int, default 90
        Inclusive score threshold: candidates scoring ``>= min_ratio`` are
        replaced.
    limit : int, default 10
        Maximum number of best-scoring candidates requested from fuzzywuzzy.
        Raise it if more than this many variants of one name may exist.

    Returns
    -------
    None
        The function only mutates ``df`` and prints a completion message.
    """
    # distinct candidate spellings; missing values are not city names, so they
    # are left out of the comparison (and therefore are never overwritten)
    candidates = df[column].dropna().unique()

    # best-scoring candidates, each a tuple whose first item is the candidate
    # and whose second item is its score
    matches = fuzzywuzzy.process.extract(string_to_match, candidates,
                                         limit=limit, scorer=fuzzywuzzy.fuzz.token_sort_ratio)

    # keep the candidates whose score is >= min_ratio; the loop variable has
    # its own name so it cannot shadow or rebind the `matches` list
    close_matches = [match[0] for match in matches if match[1] >= min_ratio]

    # boolean mask of the rows that hold one of the close matches
    rows_with_matches = df[column].isin(close_matches)

    # overwrite those rows with the canonical spelling
    df.loc[rows_with_matches, column] = string_to_match

    # let us know the function's done
    print("All done!")

In [15]:
# use the function we just wrote to replace close matches to "d.i khan" with "d.i khan"
# fold the close spellings of "d.i khan" into that single spelling; with the
# default cutoff of 90, "d.g khan" (which scored 88 above) is left untouched
replace_matches_in_column(
    df=suicide_attacks,
    column='City',
    string_to_match="d.i khan"
)

All done!


In [16]:
# collect the distinct values of the 'City' column once more, sorted
# alphabetically, to check the result of the replacement
cities = np.sort(suicide_attacks['City'].unique())

# take a closer look
cities

array([u'attock', u'bajaur agency', u'bannu', u'bhakkar', u'buner',
       u'chakwal', u'chaman', u'charsadda', u'd.g khan', 'd.i khan',
       u'dara adam khel', u'fateh jang', u'ghallanai, mohmand agency',
       u'gujrat', u'hangu', u'haripur', u'hayatabad', u'islamabad',
       u'jacobabad', u'karachi', u'karak', u'khanewal', u'khuzdar',
       u'khyber agency', u'kohat', u'kuram agency', u'kurram agency',
       u'lahore', u'lakki marwat', u'lasbela', u'lower dir', u'malakand',
       u'mansehra', u'mardan', u'mohmand agency',
       u'mosal kor, mohmand agency', u'multan', u'muzaffarabad',
       u'north waziristan', u'nowshehra', u'orakzai agency', u'peshawar',
       u'pishin', u'poonch', u'quetta', u'rawalpindi', u'sargodha',
       u'sehwan town', u'shabqadar-charsadda', u'shangla', u'shikarpur',
       u'sialkot', u'south waziristan', u'sudhanoti', u'sukkur', u'swabi',
       u'swat', u'taftan', u'tangi, charsadda district', u'tank',
       u'taunsa', u'tirah valley', u'tota