# Cleaning Data

In [None]:
#Visual Exploratory
#Bar plots are for discrete data counts, great for multiple variables, especially when one is categorical
#Histograms are for continuous data counts, great for single variables
#Scatter plots display the relationship between two numeric variables
#Columns are variables, rows are observations

#Inspecting
df.head()      #First rows (5 by default)
df.columns     #Column labels
df.info()      #Column dtypes, non-null counts and memory usage
df.describe()  #Summary statistics
df.dtypes      #Data type of each column
df.tail()      #Last rows (5 by default)
df.shape       #(number of rows, number of columns)

df['column'].unique() #Returns the unique values
df['column'].nunique() #Returns the number of unique values
df['column'].value_counts(dropna=False) #Totals frequency of each unique value (NaN included), sorted by highest frequency
df.idxmax() #Index label of each column's maximum; pass axis='columns' for the column label of each row's maximum
df.idxmin() #Index label of each column's minimum; pass axis='columns' for the column label of each row's minimum
df.isin([]) #Boolean DataFrame: True where the element is one of the listed values
            
#Assigning column header
df.columns = ['col1', 'col2']

#Assigning index
pd.read_csv('file.csv', index_col='col') #a file path is required; index_col picks the index column while loading
df.index = ['column_name'] #individual index entries are immutable, but the whole index can be replaced at once (one label per row)
df.index.name = 'new_name'
df.set_index('col', inplace=True)
df.reset_index()['column'] #moves the index back into a regular column, then selects 'column'
df.sort_index()
df.reindex(new_index) #conforms df to the labels in new_index; labels missing from df become NaN rows

#Sorting
df.sort_values('column', ascending=False)

#Changing types
#astype returns a new Series; assign it back to keep the converted values
df.column.astype(str)
df.column.astype('category')
pd.to_numeric(df['column'], errors='coerce') #errors='coerce' turns values that cannot be parsed into NaN
df.values #converts df to numpy array
df.column = pd.Categorical(values=df.column, categories=['Bronze', 'Silver', 'Gold'], ordered=True)

#Duplicate and Missing Data
df.drop_duplicates()
df.dropna(how='any') #drops rows with any NaN; how='all' drops only all-NaN rows; thresh=1000 keeps rows with at least 1000 non-NaN values
df.drop(list_to_drop, axis='columns')

#Filtering 0's and NaN's
#Each call returns one boolean per column; use it as a mask, e.g. df.loc[:, df.all()]
df.all() #True for columns where every value is nonzero...masks out columns with any 0
df.any() #True for columns with at least one nonzero...masks out columns with all 0's

#isnull / notnull are methods and must be called before chaining .all() / .any()
df.isnull().all() #True for columns that are entirely NaN
df.isnull().any() #True for columns with at least one NaN

df.notnull().all() #True for columns with no NaN...masks out columns with any NaNs
df.notnull().any() #True for columns with at least one non-NaN...masks out columns with all NaNs

#Filling NaN
df.column = df.column.fillna('value or function')
df.column = df.column.ffill() #forward fill: propagates the last valid value, so it takes no fill value

#Checking for NaN values w/ asserts
assert pd.notnull(df).all().all()
            
#Filtering and aggregation
df.groupby(['Columns to group by', 'second column to group by'])
df.groupby(df2['column']) #group by values from another frame; then try df['column'].mean()
df.groupby('column')['Column to select']
df.groupby('column')[['Column to select', 'Column to select']].agg(['max', 'sum']) #every listed aggregation is applied to every selected column
#Custom aggregations via dictionaries
#The dictionary keys must match the selected column labels exactly (labels are case-sensitive)
df.groupby('column')[['col1', 'col2']].agg({'col1':'max', 'col2':'sum'})
#or
sets[['year', 'theme_id']].groupby('year', as_index=False).agg({"theme_id": pd.Series.nunique})

In [None]:
#Manipulating DataFrames
#Melting: turns columns into rows
pd.melt(frame=df, id_vars='fixed cols', value_vars='cols to melt',
        var_name='new col name', value_name='new value column name')

pd.melt(df, col_level=0) #when the columns are a MultiIndex, melts using level 0 of the column labels

#Pivoting, turning unique values from a column into new columns
df.pivot(index='col', columns='col', values='columns to use as values')

#Pivot Table...pivot will err when finding duplicate index/column pairs, so we use pivot_table instead;
#duplicates are combined with aggfunc, and the result can have a hierarchical index
df.pivot_table(index='col', columns='col', values='col', aggfunc='how to deal with dups', margins=True)

#Appending: stacks vertically
#DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
pd.concat([df1, df2])

#Concatenating: stacks vertically or horizontally on the index, supports inner and outer joins
pd.concat([df1, df2], ignore_index=True, axis=0, keys=[], join='inner or outer')

#Join: joins on index, supports inner, outer, right and left
df.join(df2, how='left, right, inner, or outer') #joins df2 onto df using the index

#Merging: Data merges on a specific column or group of columns, supports many joins
pd.merge(left=df_1, right=df_2, left_on='df_1_column', right_on='df_2_column')
pd.merge(df, df2, on=['col1', 'col2', 'col3'], how='inner, outer, right or left')

#Merge and order
pd.merge_ordered('same para as above')

In [None]:
#Concatenating many dataframes by using glob
import glob
import pandas as pd

#Create pattern
pattern = '*.csv'
csv_files = glob.glob(pattern)

# Read every matching file into its own DataFrame: frames
# (pd.read_csv already returns a DataFrame, so no extra pd.DataFrame() wrap is needed)
frames = [pd.read_csv(csv_path) for csv_path in csv_files]

# Concatenate frames into a single DataFrame: big_frame
# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when no file matched the pattern
big_frame = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
#String Parsing
#Keep only the first character of every string in a column
df['name'] = df['column_name'].str.get(0)

#Splitting a column into several columns
#Step 1. Split on the delimiter; every cell of the new column then holds a list of pieces
#        (appending .str.get(0) to this line would do steps 1 and 2 in one go)
df['str_split'] = df.type_and_country.str.split('_')

#Step 2. Build the new columns by indexing into the lists produced by the split
df['type'] = df['str_split'].str[0]
df['country'] = df['str_split'].str[1]


#Pattern Matching
# Import the regular expression module
import re
from numpy import nan as NaN  # NumPy 2.0 removed the `NaN` alias; `nan` works on every version

# Compile the pattern: pattern
# Raw strings keep the backslashes for the regex engine instead of Python's string escapes
pattern = re.compile(r'\d{3}-\d{3}-\d{4}')

# See if the pattern matches
result = pattern.match('123-456-7890')
print(bool(result))

# See if the pattern matches
# (it does not: match() anchors at the start of the string, and this one begins with four digits)
result = pattern.match('1123-456-7890')
print(bool(result))

#on one line
bool(re.compile(r"\d{3}-\d{3}-\d{4}").match('123-456-7890'))


#Finding numeric values
matches = re.findall(r'\d+', 'the recipe calls for 10 strawberries and 1 banana')

# Print the matches
print(matches)

#A few more patterns
# Write the second pattern: a dollar sign, any number of digits, a dot, then exactly two digits
pattern2 = bool(re.match(pattern=r'\$\d*\.\d{2}', string='$123.45'))
print(pattern2)

# Write the third pattern: a capital letter followed by any number of word characters
pattern3 = bool(re.match(pattern=r'[A-Z]\w*', string='Australia'))
print(pattern3)

In [None]:
#Using Functions to Clean Data
# Define func
def recode_sex(sex_value):
    """Recode a sex label as a number.

    Args:
        sex_value: The raw label. Only the exact strings 'Male' and
            'Female' are recognised (the comparison is case-sensitive).

    Returns:
        1 for 'Male', 0 for 'Female', and NaN for any other value.
    """
    if sex_value == 'Male':
        return 1
    if sex_value == 'Female':
        return 0
    # float('nan') is the same float value as np.nan but needs no numpy
    # import; this file never binds the name `np`.
    return float('nan')

# Apply the function to the sex column using .apply
# Pass the function itself (no parentheses). Series.apply has no axis argument:
# extra keyword arguments are forwarded to the function, so axis=0 would raise a TypeError
df['sex_recode'] = df.sex_column.apply(recode_sex)

#Using Lambdas to Clean Data
# Write the lambda function using replace
df['total_dollar_replace'] = df.total_dollar.apply(lambda x: x.replace('$', ''))

# Write the lambda function using regular expressions
# (raw string so the backslashes reach the regex engine unchanged)
df['total_dollar_re'] = df.total_dollar.apply(lambda x: re.findall(r'\d+\.\d+', x)[0])

In [None]:
#Arithmetic
df.divide(df2, axis='rows')
df.pct_change() * 100 #percentage change from the previous row
df.add(df2)
df.add(df2, fill_value=0) #a value missing on one side is treated as 0 instead of producing NaN
df.multiply(df2, axis='rows')
df.mean(axis='columns') #This computes the mean of all columns per row.
df.quantile([0.05, 0.95]) #the 5th and 95th percentiles