# All code from Kaggle's Data Cleaning mini-course

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Seed NumPy's global random generator, then load the NFL play-by-play data.
np.random.seed(0)
nfl_data = pd.read_csv("../input/nflplaybyplay2009to2016/NFL Play by Play 2009-2017 (v4).csv")

In [None]:
# Preview the first rows to spot any NaN / None values in the leading columns.
nfl_data.head()

In [None]:
# Count the missing values in every column, then display the counts for the
# first ten columns only.
missing_vals = nfl_data.isna().sum()
missing_vals.iloc[:10]

In [None]:
# Add the per-column missing counts together to get the number of missing
# cells in the whole frame.
total_missing = missing_vals.sum()
total_missing

In [None]:
# Total number of cells = rows * columns. np.prod is used because the
# np.product alias is deprecated and no longer exists in NumPy 2.x.
total_cells = np.prod(nfl_data.shape)
total_cells

In [None]:
# (rows, columns) of the raw frame.
nfl_data.shape

In [None]:
# Manual cross-check of the total cell count.
# NOTE(review): presumably these are the row and column counts copied from
# nfl_data.shape — confirm they still match the loaded file.
407688*102

In [None]:
# Share of all cells that are missing, expressed as a percentage.
percent_missing = (total_missing / total_cells) * 100 
percent_missing

In [None]:
# Drop every row that contains at least one missing value. The result is
# only displayed; nfl_data itself is not reassigned.
nfl_data.dropna()

In [None]:
# Keep only the columns that have no missing values, then preview the result.
cols_with_NaN_dropped = nfl_data.dropna(axis="columns")
cols_with_NaN_dropped.head()

In [None]:
# Number of columns that survived the column-wise dropna.
cols_with_NaN_dropped.columns.size

In [None]:
# Names of the columns that survived the column-wise dropna.
cols_with_NaN_dropped.columns

In [None]:
# Number of columns in the original frame, for comparison.
nfl_data.columns.size

In [None]:
# Print every column name on its own line. Iterating the column index
# directly avoids hard-coding the column count.
for column_name in nfl_data.columns:
    print(column_name)

In [None]:
# Report the column count before and after dropping columns that contain NaN.
print("original dataset columns size %d\n" % len(nfl_data.columns))
print("after dropped cols with NaN columns size %d\n" % len(cols_with_NaN_dropped.columns))

In [None]:
# Small working sample: the first rows of the frame, restricted to the
# columns from 'EPA' through 'Season' (label slice, both ends included).
subset_nfl_data = nfl_data.head().loc[:, 'EPA':'Season']
subset_nfl_data

In [None]:
# Replace every missing value with 0. The result is only displayed;
# subset_nfl_data is not reassigned.
subset_nfl_data.fillna(0)

In [None]:
# Back-fill along axis 0: each missing value takes the next valid value
# further down the same column (i.e. from a following row). Anything still
# missing afterwards is replaced with 0.
# DataFrame.bfill replaces the deprecated fillna(method='bfill') spelling.
subset_nfl_data.bfill(axis=0).fillna(0)

scaling and normalization

In [None]:
from scipy import stats
from mlxtend.preprocessing import minmax_scaling
import seaborn as sns
import matplotlib.pyplot as plt
# Re-seed NumPy's global random generator before drawing the samples below.
np.random.seed(0)

* in scaling, you're changing the range of your data, while
* in normalization, you're changing the shape of the distribution of your data.

In [None]:
# Draw 1000 samples from an exponential distribution and min-max scale them.
original_data = np.random.exponential(size=1000)
scaled_data = minmax_scaling(original_data, columns=[0])

# Side-by-side comparison: raw samples on the left, scaled samples on the right.
# NOTE(review): distplot is deprecated in recent seaborn releases — confirm
# the installed version before migrating to histplot/displot.
fig, ax = plt.subplots(nrows=1, ncols=2)
raw_panel, scaled_panel = ax[0], ax[1]

sns.distplot(original_data, ax=raw_panel)
raw_panel.set_title("Original data")

sns.distplot(scaled_data, ax=scaled_panel)
scaled_panel.set_title("Scaled data")


**Normalization**
Scaling just changes the range of your data. Normalization is a more radical transformation. The point of normalization is to change your observations so that they can be described as a normal distribution.

*Normal distribution:* Also known as the "bell curve", this is a specific statistical distribution where roughly equal numbers of observations fall above and below the mean, the mean and the median are the same, and there are more observations closer to the mean. The normal distribution is also known as the Gaussian distribution.

In general, you'll only want to normalize your data if you're going to be using a machine learning or statistics technique that assumes your data is normally distributed. Some examples of these include **t-tests, ANOVAs, linear regression, linear discriminant analysis (LDA) and Gaussian naive Bayes**. (Pro tip: any method with "Gaussian" in the name probably assumes normality.)

The method we're using to normalize here is called the **Box-Cox Transformation**. Let's take a quick peek at what normalizing some data looks like:

In [None]:
# Apply the Box-Cox transformation. The printed type shows what boxcox
# returned; element 0 of that result is what gets plotted below.
normalized_data = stats.boxcox(original_data)
print(type(normalized_data))


# Side-by-side comparison: raw samples on the left, transformed on the right.
# NOTE(review): distplot is deprecated in recent seaborn releases — confirm
# the installed version before migrating to histplot/displot.
fig, ax = plt.subplots(nrows=1, ncols=2)
raw_panel, normalized_panel = ax[0], ax[1]

sns.distplot(original_data, ax=raw_panel)
raw_panel.set_title("Original data")

sns.distplot(normalized_data[0], ax=normalized_panel)
normalized_panel.set_title("Normalized data")

Parse Dates

In [None]:
import datetime

# Seed NumPy's global random generator, then load the landslide catalogue.
np.random.seed(0)
landslides = pd.read_csv("../input/landslide-events/catalog.csv")

In [None]:
# Preview the first rows of the landslide data.
landslides.head()

In [None]:
# List the available column names.
landslides.columns

In [None]:
# Look at the raw date values before parsing them.
landslides['date'].head()

In [None]:
landslides['date'].dtype # displays dtype('O'); 'O' is NumPy's type code for the generic object dtype, i.e. the dates are not yet parsed

In [None]:
# Parse the raw date strings into datetime values. The format is given
# explicitly (month/day/two-digit year) instead of relying on the deprecated
# infer_datetime_format flag, so a row that does not fit raises an error
# rather than being guessed at.
landslides['date_parsed'] = pd.to_datetime(landslides['date'], format="%m/%d/%y")
landslides['date_parsed'].head()

In [None]:
# Check the dtype of the parsed column.
landslides['date_parsed'].dtype

In [None]:
# Extract the day of the month from the date_parsed column via the .dt
# accessor, then preview the result.
day_of_month_landslides = landslides['date_parsed'].dt.day
day_of_month_landslides.head()

In [None]:
# Drop missing day values, then plot them as a 31-bin histogram with the
# KDE curve switched off (kde=False).
# NOTE(review): distplot is deprecated in recent seaborn releases — confirm
# the installed version before migrating to histplot/displot.
day_of_month_landslides = day_of_month_landslides.dropna()
sns.distplot(day_of_month_landslides, kde=False, bins=31)

In [None]:
# Load the earthquake data and tally how many Date strings have each length;
# a length that differs from the rest points at a differently formatted row.
earthquakes = pd.read_csv("../input/earthquake-database/database.csv")
date_lengths = earthquakes['Date'].str.len()
date_lengths.value_counts()

Here, three rows have a different (corrupted) date format.

In [None]:
# Positions of the rows whose Date string is 24 characters long.
has_long_date = date_lengths == 24
indices = np.where(has_long_date)[0]
print('Indices with corrupted data:', indices)
# Display those rows.
earthquakes.loc[indices]

In [None]:
# Hand-corrected Date values, keyed by row label.
corrected_dates = {
    3378: "02/23/1975",
    7512: "04/28/1985",
    20650: "03/13/2011",
}
for row_label, fixed_date in corrected_dates.items():
    earthquakes.loc[row_label, "Date"] = fixed_date

# With the bad rows repaired, parse the whole column with one explicit format.
earthquakes['date_parsed'] = pd.to_datetime(earthquakes['Date'], format="%m/%d/%Y")

In [None]:
# Parse the Date column. Per the original note, parsing did not work while
# the corrupted rows were still present, which is why they were corrected by
# hand. The explicit format (the same one used for the hand-corrected parse)
# replaces the deprecated infer_datetime_format flag and makes any remaining
# bad row raise instead of being guessed at.
earthquakes['date_parsed'] = pd.to_datetime(earthquakes['Date'], format="%m/%d/%Y")
# Show the parsed value for each previously corrupted row.
for i in indices:
    print(earthquakes['date_parsed'][i])

In [None]:
# Display the rows that were flagged as corrupted.
earthquakes.loc[indices]

In [None]:
# Check the dtype of the parsed column.
earthquakes['date_parsed'].dtype

In [None]:
# Load the volcano data and look at five randomly sampled values of the
# 'Last Known Eruption' column.
volcanos = pd.read_csv("../input/volcanic-eruptions/database.csv")
volcanos['Last Known Eruption'].sample(n=5)

https://www.kaggle.com/residentmario/time-series-plotting-optional

In [None]:
import chardet

In [None]:
# Start from an ordinary text string containing a non-ASCII character and
# display its type.
before = "This is the euro symbol: €"
type(before)

In [None]:
# Encode the text as UTF-8 (errors="replace" substitutes anything that
# cannot be encoded) and display the type of the result.
after = before.encode("utf-8", errors="replace")
type(after)

In [None]:
# Display the encoded value.
after

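`\xe2\x82\xac` is the three-byte UTF-8 encoding of the euro sign; decoding these bytes with the wrong encoding is what produces mojibake.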

In [None]:
# Decoding with the same encoding that was used to encode gives the text back.
print(after.decode("utf-8"))

In [None]:
print(after.decode("ascii")) # fails on purpose: `after` holds the UTF-8 bytes of the euro sign, which are not valid ASCII

In [None]:
# Encoding to ASCII with errors="replace" swaps every character ASCII cannot
# represent for a replacement marker, so the original character is lost.
before = "This is the euro symbol: €"
after = bytes(before, "ascii", errors="replace")
print(str(after, "ascii"))

In [None]:
# Try to read the file without specifying an encoding. Per the original
# note, pandas decodes as UTF-8 by default, and this read raised an error
# because the file is not UTF-8 encoded and its real encoding was not yet
# known.
kickstarter_2016 = pd.read_csv("../input/kickstarter-projects/ks-projects-201612.csv")

In [None]:
# Read the first ten thousand bytes of the file in binary mode and let
# chardet guess the character encoding from that sample.
# NOTE(review): this samples ks-projects-201801.csv, while the reads around
# it use ks-projects-201612.csv — confirm which file is intended.
with open("../input/kickstarter-projects/ks-projects-201801.csv", 'rb') as rawdata:
    leading_bytes = rawdata.read(10000)
result = chardet.detect(leading_bytes)

# Show chardet's guess.
print(result)

So chardet is 73% confident that the right encoding is "Windows-1252". Let's see if that's correct:

In [None]:
# Read the file again, this time with the encoding suggested by chardet.
# NOTE(review): the chardet sample was taken from ks-projects-201801.csv,
# not from this file — confirm the detected encoding applies here.
kickstarter_2016 = pd.read_csv("../input/kickstarter-projects/ks-projects-201612.csv", encoding='Windows-1252')

# Look at the first few rows to check the text decoded sensibly.
kickstarter_2016.head()

In [None]:
# Write the decoded frame back to disk; per the original note, to_csv saves
# as UTF-8 by default.
# NOTE(review): the output name says 201801, but the frame was read from
# ks-projects-201612.csv — confirm which name is intended.
kickstarter_2016.to_csv("ks-projects-201801-utf8.csv")

In [None]:
# A short byte string in an unknown encoding; print it and its type.
sample_entry = b'\xa7A\xa6n'
print(sample_entry)
print('data type:', type(sample_entry))

In [None]:
# Ask chardet to guess the encoding of the raw bytes.
result = chardet.detect(sample_entry)
result

In [None]:
# Decode the raw bytes with the 'big5-tw' codec to get text, then re-encode
# that text as UTF-8 and show the resulting bytes and their type.
before = sample_entry.decode('big5-tw')
new_entry = before.encode("utf-8", errors="replace")
print(new_entry)
print('data type:', type(new_entry))


In [None]:
# Run chardet on the re-encoded bytes to check what it detects now.
result = chardet.detect(new_entry)
result

In [None]:
# Sample the first 100000 bytes of the file in binary mode and let chardet
# guess the character encoding.
with open('../input/fatal-police-shootings-in-the-us/PoliceKillingsUS.csv', 'rb') as rawdata:
    leading_bytes = rawdata.read(100000)
result = chardet.detect(leading_bytes)

print(result)
# Load the file as Windows-1252 (the encoding is hard-coded here rather than
# taken from `result`).
police_killings = pd.read_csv('../input/fatal-police-shootings-in-the-us/PoliceKillingsUS.csv', encoding='Windows-1252')

In [None]:
# Write the decoded frame to a new CSV file in the working directory.
police_killings.to_csv('PoliceKillingsUS_utf.csv')

Inconsistent data entry

In [None]:
import fuzzywuzzy
from fuzzywuzzy import process

In [None]:
# Read in the suicide-attack data, decoding the file as Windows-1252.
suicide_attacks = pd.read_csv("../input/data-cleaning-challenge-inconsistent-data-entry/PakistanSuicideAttacks Ver 11 (30-November-2017).csv", encoding='Windows-1252')

In [None]:
# Preview the first rows.
suicide_attacks.head()

We will clean the `City` column.

In [None]:
# Distinct values of the 'City' column in sorted order, so that
# near-duplicate spellings end up next to each other.
cities = np.sort(suicide_attacks['City'].unique())
cities

There are inconsistencies, e.g. 'Lahore' and 'Lahore ', 'Lakki Marwat' and 'Lakki marwat', etc.

In [None]:
# Normalise the 'City' text in one pass:
#   - lower-casing resolves pairs like 'Lakki Marwat' / 'Lakki marwat'
#   - stripping surrounding whitespace resolves pairs like 'Lahore' / 'Lahore '
suicide_attacks['City'] = suicide_attacks['City'].str.lower().str.strip()

In [None]:
# Distinct values of the 'City' column in sorted order, to re-inspect the
# spellings that remain.
cities = np.sort(suicide_attacks['City'].unique())
cities

**Fuzzy matching:** The process of automatically finding text strings that are very similar to the target string. In general, a string is considered "closer" to another one the fewer characters you'd need to change if you were transforming one string into another. So "apple" and "snapple" are two changes away from each other (add "s" and "n") while "in" and "on" are one change away (replace "i" with "o"). You won't always be able to rely on fuzzy matching 100%, but it will usually end up saving you at least a little time.

Fuzzywuzzy returns a ratio given two strings. The closer the ratio is to 100, the smaller the edit distance between the two strings. Here, we're going to get the ten strings from our list of cities that have the closest distance to "d.i khan".

In [None]:
# Ask fuzzywuzzy for the 10 entries of `cities` that score highest against
# "d.i khan", using the token_sort_ratio scorer.
matches = fuzzywuzzy.process.extract("d.i khan", cities, limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)
# Display the candidates together with their scores.
matches

In [None]:
def replace_matches_in_column(df, column, string_to_match, min_ratio = 90, limit = 10):
    """Overwrite near-duplicate spellings in ``df[column]`` with ``string_to_match``.

    The distinct non-missing values of the column are scored against
    ``string_to_match`` with fuzzywuzzy's ``token_sort_ratio``; every row whose
    value scores at least ``min_ratio`` is replaced by ``string_to_match``.

    Args:
        df: DataFrame to clean. It is modified in place.
        column: Name of the column to normalise.
        string_to_match: Canonical spelling written over every close match.
        min_ratio: Minimum score a value needs in order to be replaced.
            The bound is inclusive (``>=``).
        limit: Maximum number of candidates requested from fuzzywuzzy. The
            default of 10 keeps the previously hard-coded behaviour.

    Returns:
        None. Prints "All done!" when finished.
    """
    # Distinct values to compare against; missing values are never candidates.
    strings = df[column].dropna().unique()

    if len(strings) == 0:
        # Nothing to compare against, so there is nothing to replace.
        print("All done!")
        return

    matches = fuzzywuzzy.process.extract(
        string_to_match,
        strings,
        limit=limit,
        scorer=fuzzywuzzy.fuzz.token_sort_ratio,
    )

    # Each match starts with (value, score); keep the values that score high enough.
    close_matches = [match[0] for match in matches if match[1] >= min_ratio]

    # Select the rows holding any of the close matches and overwrite them.
    rows_with_matches = df[column].isin(close_matches)
    df.loc[rows_with_matches, column] = string_to_match

    print("All done!")  # let the caller know the function has finished

In [None]:
# Collapse every close spelling of "d.i khan" in the 'City' column into that
# single form (default min_ratio is used).
replace_matches_in_column(df=suicide_attacks, column='City', string_to_match="d.i khan")

In [None]:
# Distinct values of the 'City' column in sorted order, to check the result
# of the replacement.
cities = np.sort(suicide_attacks['City'].unique())
cities