# **Data Cleaning Exercise**

In [None]:
#Libraries needed for the exercise, plus the NFL play-by-play data
import pandas as pd
import numpy as np

#Fix the global NumPy seed so any random draws in the notebook are repeatable
np.random.seed(0)

#Load the play-by-play records into a DataFrame
nfl_data = pd.read_csv(
    "../input/nflplaybyplay2009to2016/NFL Play by Play 2009-2017 (v4).csv"
)

In [None]:
#Preview the first five rows of the frame
#Some missing entries are already visible at this point
nfl_data.head(n=5)

In [None]:
#Print pandas' structural summary (index, columns, dtypes, memory usage) of the NFL frame
nfl_data.info()

In [None]:
#Count the missing (NaN) entries in every column
missing_vals = nfl_data.isna().sum()

#Show the counts for the first 15 columns only
missing_vals.iloc[:15]

In [None]:
#What percentage of all cells in the frame is missing?

#np.prod multiplies rows x columns to give the total number of cells
#(np.product was a deprecated alias of np.prod and no longer exists in NumPy 2.0)
total_cells = np.prod(nfl_data.shape)
total_missing = missing_vals.sum()

percent_missing = (total_missing / total_cells) * 100
print('{}%'.format(percent_missing))

**That implies about a quarter of the data is missing. That is a lot**

In [None]:
#There are several ways to handle missing values.
#The quickest (though rarely the recommended one) is to discard everything that has a gap.
#Note that axis='columns' drops whole columns containing a NaN, not rows.
dropped_data = nfl_data.dropna(axis='columns')
dropped_data.head(n=5)

In [None]:
#Verify that no missing values remain in any of the surviving columns
dropped_data.isna().sum()

In [None]:
#Every NaN is gone, but only because each column that contained one was removed.
#Compare the column counts before and after to see how much was lost.
print('Columns of original data: %d \n' % len(nfl_data.columns))
print('Columns with dropped data: %d \n' % len(dropped_data.columns))
print('Amount of lost data: %d' % (len(nfl_data.columns) - len(dropped_data.columns)))

**Let's try to break down our data into a smaller subset for better usability**

In [None]:
#Work on a small slice: the first five rows, columns 'EPA' through 'Season' (inclusive)
subset_nfl_data = nfl_data.head(n=5).loc[:, 'EPA':'Season']
subset_nfl_data

In [None]:
#Replace every missing value in the subset with 0 (returns a new frame; the subset itself is unchanged)
subset_nfl_data.fillna(value=0)

In [None]:
#Back-fill: replace each NaN with the next valid value further down the same column,
#then set anything still missing (no later value in that column) to 0.
#DataFrame.bfill() is used because fillna(method='bfill') is deprecated in recent pandas.
subset_nfl_data.bfill(axis=0).fillna(0)

# **Let's attempt another dataset**

In [None]:
#Load the building-permit data and preview the first five rows
frisco_permits = pd.read_csv(
    '../input/building-permit-applications-data/Building_Permits.csv'
)
frisco_permits.head(n=5)

In [None]:
#Print pandas' structural summary (columns, dtypes, memory usage) of the permits frame
frisco_permits.info()

In [None]:
#Reset the global NumPy seed so results in this section are repeatable
np.random.seed(seed=0)

In [None]:
#Number of missing (NaN) entries per column
missing_values = frisco_permits.isna().sum()
missing_values

In [None]:
#Percentage of all cells in the permits frame that are missing.
#np.prod gives rows x columns (np.product was a deprecated alias, removed in NumPy 2.0).
total_cells = np.prod(frisco_permits.shape)
total_missing_values = missing_values.sum()

percent_missing = (total_missing_values / total_cells) * 100
print('The percentage of missing values is %.2f' % percent_missing + '%')

**At 26.26%, more than a quarter of our data is missing**

In [None]:
#Missing-value counts for two specific columns
frisco_permits.loc[:, ['Street Number Suffix', 'Zipcode']].isna().sum()

In [None]:
#Drop every row that contains at least one missing value
frisco_na_rows_dropped = frisco_permits.dropna(axis='index')
frisco_na_rows_dropped

**Every row in our dataset contains at least one missing value, so dropping rows with NaNs discards all of the data**

In [None]:
#Drop every column that contains at least one missing value
frisco_na_cols_dropped = frisco_permits.dropna(axis='columns')
frisco_na_cols_dropped

In [None]:
#Print pandas' structural summary of the frame that remains after dropping columns with NaNs
frisco_na_cols_dropped.info()

In [None]:
#Report how many columns were removed by the column-wise dropna
print(f'We lost {len(frisco_permits.columns) - len(frisco_na_cols_dropped.columns)} columns')

In [None]:
#Impute missing values by back-filling down each column (next valid value below),
#then replace anything still missing with 0.
#DataFrame.bfill() is used because fillna(method='bfill') is deprecated in recent pandas.
frisco_permits_with_na_imputed = frisco_permits.bfill(axis=0).fillna(0)
frisco_permits_with_na_imputed

In [None]:
#Confirm that no missing values remain after imputation
frisco_permits_with_na_imputed.isna().sum()

# **SCALING AND NORMALIZATION** 

**Scaling in data analysis means changing the range of numeric values (here, so that they fit within 0 to 1) without changing the shape of their distribution**

In [None]:
#Let's import our scaling tool and a few visualization libraries
from mlxtend.preprocessing import minmax_scaling 

import seaborn as sns
import matplotlib.pyplot as plt

#Reset the global NumPy seed so the generated samples below are repeatable
np.random.seed(seed=0)

In [None]:
#Draw 1000 samples from an exponential distribution to practise scaling and normalizing on
original_data = np.random.exponential(scale=1.0, size=1000)
original_data[:10]

In [None]:
#Min-max scale the generated samples (column 0) into the range 0 to 1
scaled_data = minmax_scaling(original_data, columns=[0])
scaled_data[:10]

In [None]:
#Side-by-side histograms: raw samples on the left, min-max scaled samples on the right
fig, ax = plt.subplots(nrows=1, ncols=2)
sns.histplot(original_data, kde=True, alpha=0.5, ax=ax[0])
sns.histplot(scaled_data, kde=True, alpha=0.5, ax=ax[1])
ax[0].set_title('Original Data')
ax[1].set_title('Scaled Data')

**Normalization transforms the data so that its distribution is approximately normal (bell-shaped)**

In [None]:
#Box-Cox normalization lives in scipy.stats
from scipy import stats

#stats.boxcox returns a (transformed_values, fitted_lambda) pair when no lambda is given.
#Keep only the transformed values so normalized_data can be plotted directly.
normalized_data = stats.boxcox(original_data)[0]

In [None]:
#Side-by-side histograms: raw samples on the left, Box-Cox output on the right
fig, ax = plt.subplots(nrows=1, ncols=2)
sns.histplot(original_data, kde=True, alpha=0.5, ax=ax[0])
sns.histplot(normalized_data, kde=True, alpha=0.5, ax=ax[1])
ax[0].set_title('Original Data')
ax[1].set_title('Normalized Data')

**Let us attempt the same processes with another dataset**

In [None]:
#Firstly, we'll import all the tools and libraries we need
import numpy as np
import pandas as pd

from mlxtend.preprocessing import minmax_scaling
from scipy import stats

import seaborn as sns
import matplotlib.pyplot as plt

#Fix the global NumPy seed for repeatability
np.random.seed(0)

#Load the Kickstarter projects data
kickstarters_2017 = pd.read_csv(
    '../input/kickstarter-projects/ks-projects-201801.csv'
)

In [None]:
#Print pandas' structural summary (columns, dtypes, memory usage) of the Kickstarter frame;
#the first rows are shown in the next cell
kickstarters_2017.info()

In [None]:
#Preview the first five Kickstarter rows
kickstarters_2017.head(n=5)

**We will be working with the goal of each campaign (usd_goal_real). We will scale it and plot the data to see the differences before and after scaling**

In [None]:
#Take the usd_goal_real column as a one-column DataFrame
original_data = kickstarters_2017['usd_goal_real'].to_frame()

#Min-max scale that column
scaled_data = minmax_scaling(original_data, columns=['usd_goal_real'])

In [None]:
#Plot the original and scaled data side by side for comparison.
#sns.histplot replaces the deprecated sns.distplot (and matches the earlier plots in this notebook).
#stat='density' keeps the y-axis normalised as distplot did when drawing a KDE, and
#bins=50 mirrors distplot's cap of 50 bins so heavily skewed data does not produce an
#excessive number of automatically chosen bins.
fig, ax = plt.subplots(1, 2, figsize=(15, 6))
sns.histplot(kickstarters_2017.usd_goal_real, ax=ax[0], kde=True, stat='density', bins=50)
ax[0].set_title('Original Data')
#scaled_data is a DataFrame, so suppress the automatic one-entry legend
sns.histplot(scaled_data, ax=ax[1], kde=True, stat='density', bins=50, legend=False)
ax[1].set_title('Scaled Data')

In [None]:
#Print a preview and the value range of the original data vs the scaled data.
#Both objects are one-column DataFrames, so .min()/.max() return a one-element Series;
#.iloc[0] extracts the scalar explicitly because calling float() on a Series is deprecated in pandas.
print('Original Data\nPreview:\n', original_data.head())
print('Minimum value: ', float(original_data.min().iloc[0]),
      '\nMaximum value: ', float(original_data.max().iloc[0]))
print('='*30)
print('Scaled Data\nPreview:\n', scaled_data.head())
print('Minimum value:', float(scaled_data.min().iloc[0]),
      '\nMaximum value:', float(scaled_data.max().iloc[0]))

> **Now, let us scale the goal column and normalize the usd_pledged_real column, respectively.**

In [None]:
#Take the goal column as a one-column DataFrame
original_goal_data = kickstarters_2017['goal'].to_frame()

#Min-max scale that column
scaled_goal_data = minmax_scaling(original_goal_data, columns=['goal'])

In [None]:
#Visualize the goal column before and after scaling.
#sns.histplot replaces the deprecated sns.distplot; kde=True and stat='density' reproduce
#distplot's default histogram-plus-KDE on a density axis, and bins=50 mirrors its 50-bin cap.
#Both inputs are one-column DataFrames, so the automatic legend is suppressed.
fig, ax = plt.subplots(1, 2, figsize=(10, 3))
sns.histplot(original_goal_data, ax=ax[0], kde=True, stat='density', bins=50, legend=False)
ax[0].set_title('Original Goal Data')
sns.histplot(scaled_goal_data, ax=ax[1], kde=True, stat='density', bins=50, legend=False)
ax[1].set_title('Scaled Goal Data')

In [None]:
#Next, normalize the usd_pledged_real column
pledges = kickstarters_2017.usd_pledged_real

#NOTE: Box-Cox only accepts strictly positive data, so it cannot be applied to the
#whole column as-is. Build a boolean mask marking the positive pledges and use it
#to select just those values.
index_pos_pledge = pledges > 0
positive_pledges = pledges.loc[index_pos_pledge]

#stats.boxcox returns (transformed_values, lambda); keep the values and re-attach
#the row labels of positive_pledges so the two Series stay aligned
normalized_pledges = pd.Series(
    data=stats.boxcox(positive_pledges)[0],
    index=positive_pledges.index,
    name='usd_pledged_real',
)

In [None]:
#Visualize the positive pledges before and after Box-Cox normalization.
#sns.histplot replaces the deprecated sns.distplot; kde=True and stat='density' reproduce
#distplot's default histogram-plus-KDE on a density axis, and bins=50 mirrors its 50-bin cap.
fig, ax = plt.subplots(1, 2, figsize=(10, 3))
sns.histplot(positive_pledges, ax=ax[0], kde=True, stat='density', bins=50)
ax[0].set_title('Original Data')
sns.histplot(normalized_pledges, ax=ax[1], kde=True, stat='density', bins=50)
ax[1].set_title('Normalized Data')

In [None]:
#Preview and value range of the positive pledges before normalization
print('Original data\nPreview:\n', positive_pledges.head())
print(
    'Minimum value:', float(positive_pledges.min()),
    '\nMaximum value:', float(positive_pledges.max()),
)
print('='*30)

#Preview and value range after Box-Cox normalization
print('Normalized data\nPreview:\n', normalized_pledges.head())
print(
    'Minimum value:', float(normalized_pledges.min()),
    '\nMaximum value:', float(normalized_pledges.max()),
)

# **PARSING DATES**

In [None]:
#We already have a couple of our needed libraries loaded, let's load one more for this exercise and read in the data

import datetime

#Fix the global NumPy seed for repeatability
np.random.seed(0)

#Load the landslide events catalogue
landslides = pd.read_csv(
    '../input/landslide-events/catalog.csv'
)

In [None]:
#Preview the first five landslide records
landslides.head(n=5)

In [None]:
#Print pandas' structural summary (columns, dtypes, memory usage) of the landslides frame
landslides.info()

In [None]:
#Print the raw 'date' column to see how the dates are written
print(landslides['date'])

In [None]:
#Check which dtype pandas assigned to the 'date' column when reading the CSV
landslides['date'].dtype

In [None]:
#The 'date' column was read as a generic object (string) column rather than as dates.
#Parse it into datetimes and store the result in a new 'new_date' column.
#The format string means month/day/two-digit year.
landslides['new_date'] = pd.to_datetime(landslides['date'], format='%m/%d/%y')

In [None]:
#Inspect the first five values of the parsed column
landslides['new_date'].head(n=5)

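**If the column contains dates entered in more than one format, you can let pandas infer the format instead of specifying it, like this:**

`landslides['new_date'] = pd.to_datetime(landslides['date'], infer_datetime_format=True)` (note: `infer_datetime_format` is deprecated from pandas 2.0 onward, where `format='mixed'` serves this purpose)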

In [None]:
#A datetime column exposes its components (day, month, year, ...) through the .dt accessor.
#Extract the day of the month from the parsed column; the raw string 'date' column
#does not support .dt and would raise an error instead.

day_of_the_month = landslides['new_date'].dt.day
day_of_the_month.head(n=5)

In [None]:
#Drop missing values before plotting
#(the former bare `day_of_the_month.isnull().sum()` line was removed: mid-cell, its result was never shown)
day_of_the_month = day_of_the_month.dropna()

#One bin per possible day of the month (1-31).
#sns.histplot replaces the deprecated sns.distplot and, like distplot(kde=False), shows raw counts.
sns.histplot(day_of_the_month, kde=False, bins=31)

**Lets try our hands at another Date Parsing exercise**

In [None]:
#Fix the seed, load the earthquake catalogue and preview the first five rows
np.random.seed(0)
earthquakes = pd.read_csv(
    '../input/earthquake-database/database.csv'
)
earthquakes.head(n=5)

In [None]:
#Print the raw 'Date' column to see how the dates are written and which dtype it has
print(earthquakes['Date'])

In [None]:
#Parse the Date column into datetimes.
#The dates use a four-digit year (MM/DD/YYYY), so the format needs %Y rather than %y.
#errors='coerce' turns any entry that does not match the format into NaT instead of
#raising, so this cell runs and the non-conforming entries can be tracked down afterwards.
#infer_datetime_format is no longer passed: it is deprecated in pandas 2.x.
earthquakes['new_date'] = pd.to_datetime(earthquakes['Date'], format='%m/%d/%Y', errors='coerce')
earthquakes['new_date'].head()

In [None]:
#Not every entry in the Date column follows the expected date pattern.
#A quick way to spot the odd ones out is to compare the string length of each entry:
#entries written in the same format should all have the same length.

date_len = earthquakes['Date'].str.len()
date_len.value_counts()

In [None]:
#Nearly all entries are 10 characters long, but a few are 24 characters long.
#Find the positions of those 24-character entries and look at the rows.
indices = np.where(date_len == 24)[0]
print('Indices with discrepancies:', indices)
earthquakes.loc[indices]

In [None]:
#With the offending rows identified, overwrite their Date values by hand
#using the same MM/DD/YYYY layout as the rest of the column
corrected_dates = {
    3378: '02/23/1975',
    7512: '04/28/1985',
    20650: '03/11/2011',
}
for row_label, fixed_date in corrected_dates.items():
    earthquakes.loc[row_label, 'Date'] = fixed_date

#Rebuild the parsed column from the corrected strings and check its dtype
earthquakes['new_date'] = pd.to_datetime(earthquakes['Date'], format='%m/%d/%Y')
print(earthquakes['new_date'])

In [None]:
#Extract the day of the month from the parsed dates and plot its distribution
day_of_month = earthquakes['new_date'].dt.day

#Days run from 1 to 31, so use 31 bins (one per day); with 30 bins two days
#share a bin and create an artificial spike
sns.displot(day_of_month, bins=31, kde=True)