<font size='6'>**What's included in this notebook**

	1. Load files
	2. Standardize data
		a. Lowercase col
		b. Lowercase str
		c. Remove blank row/duplicates
	3. Merge files
		a. Inconsistent columns
		b. Rename columns
		c. Merge files
	4. Explore data
		a. Check accuracy of dtypes
		b. Review content by eye
	5. Missing data
		a. Check percent of missing values
		b. Remove weak columns
		c. Remove weak rows
	6. Standardize categorical
		a. Check category lists for consistency
		b. Correct cat lists if needed
	7. Fix missing numbers 
		a. Replace missing with mean
	8. Output file to csv

# Load file

In [116]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import quantile_transform
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import OneHotEncoder  ##. better to use dummy from pandas 
from sklearn.preprocessing import PowerTransformer
from scipy.stats import boxcox
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from scipy.stats import boxcox
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
pd.options.display.max_rows = 50
import qgrid
import glob
import math

In [117]:
def load_csv_dataframes(path):
    all_files = glob.glob(path + "/*.csv")
    dataframes_list=[]
    file_count=len(all_files)
    for i in range(file_count):
        temp_df = pd.read_csv(all_files[i])
        dataframes_list.append(temp_df)

    return dataframes_list

# Standardize data

## Lowercase column names

In [118]:
def col_lowercase(df):
    '''Takes one value, dataframe, and changes all columns to lowercase
    For easy comparison and merging of multiple dataframes'''
    df_temp = [i.lower() for i in df.columns]
    return df_temp

## Lowercase all string values

In [119]:
def string_lowercase(df):
    '''Takes one value, dataframe, and changes all values that are strings to all lowercase
    For easy identification of discrete values and for merging of dataframes'''
    df=df.applymap(lambda x:x.lower() if type(x) == str else x)
    return df

## Removing duplicate and blank records

In [120]:
def drop_duplicates_and_blank_lines(df):
    #Removes duplicates if entire row is the same
    df.drop_duplicates(inplace=True)
    #Removes rows if they are completely blank
    df.dropna(how="all",axis=0, inplace=True)
    return df

# Merge files

## Check files for inconsistent columns

In [121]:
def list_unique_columns(df_list):
    #Gather column names from all files in a list

    all_col_names=[]
    temp_col_names=[]
    unmatched_col_names=[]
    unmatched_col_dict={}

    for i in df_list:
        [all_col_names.append(col) for col in i.columns]


    #Remove duplicates to create a unique set of column names
    unique_col_names=list(set(all_col_names))

    #Check which files are missing which column names. Place these in a dictionary with key of file number.
    file_num=1
    for x in df_list:
        [temp_col_names.append(col) for col in x.columns]
        [unmatched_col_names.append(y) for y in unique_col_names if y not in temp_col_names]        
        unmatched_col_dict[file_num]=unmatched_col_names
        unmatched_col_names=[]
        temp_col_names=[]
        file_num+=1

    return unmatched_col_dict

## Rename columns

In [122]:
def rename_columns(df_list):
    continue_loop=True
    wrong_name_list=[]
    right_name_list=[]
    
    while continue_loop==True:
        wrong_name=input('What is the column name that you want to fix? (none if done)')
        if wrong_name.lower()!="none":
            right_name=input('What is the correct column name? (none to cancel)')
            if right_name.lower()!='none':
                wrong_name_list.append(wrong_name)
                right_name_list.append(right_name)
            else:
                continue_loop=False
        else:
            continue_loop=False
    if len(wrong_name_list)>0:
        for i in df_list:
            col_rename_dic=dict(zip(wrong_name_list,right_name_list))
            i.rename(columns=col_rename_dic,inplace=True)
    return df_list


## Merge files

In [123]:
def merge_files(df_list):
    df=pd.concat(df_list,axis=0,ignore_index=True)

# Explore Data

# Missing data

## Check percent of missing values

In [124]:
#Calculate percent of missing values
def count_missing_values(df):
    missing_val=df.isnull().sum()
    all_val=df.count()
    missing_ratio=missing_val/all_val
    return missing_ratio

## Drop columns with too few values

In [125]:
def drop_columns(df,drop_list):
    for i in drop_list:
        df.drop(drop_list, axis = 1, inplace = True) 
        return df

## Drop rows with too many NaNs

In [126]:
#No code as its one line and needs no specific function to run

## Find missing numbers

In [127]:
def get_columns_with_nan(df):
    # This creates a list ## integers_with_nan ## which is a list of numerical columns with blanks in them.  
    col_list=[]
    col_list=df.columns
    integers_with_nan=[]
    for i in col_list:
        if df[i].isnull().values.any()==True:
            if df[i].dtype!='O':
                integers_with_nan.append(i)
    return integers_with_nan

# Standardize categorical

## Check categorical columns for misspellings or issues

In [128]:
# Creates a copy of the dataframe for string-based columns only.  
# Then, creates a dictionary of the column (key) plus array of values (value) to show the results
def check_categoricals(df):
    categorical_df=df.select_dtypes(object)
    unique_dict={col: categorical_df[col].unique() for col in categorical_df}
    return unique_dict

## Standardizing categorical byreplacing values with new ones

In [129]:
def standardize_categorical_names(df,a):
#Requires 'a' to be filled in

# Creates a dictionary 'a' of values to correct.  
# Outer dictionary is the column to correct
# Inner dictionary key is the correct spelling, then list (dictionary value) is the incorrect spellings to find and replace

#This code will run through the dictionary 'a' and update string values in the dataframe
#Using the new string values
    for key in a.keys():
        for key2 in a[key].keys():
            list_values=a[key][key2]
            df.loc[df[str(key)].isin(list_values),key]=str(key2)

# Fix missing numbers

## Replace missing numbers with mean

In [130]:
def replace_nan_integer_with_mean(df,integers_with_nan):
    #Fill missing numbers with the average of the column
    for j in integers_with_nan:
        temp_mean=np.mean(df[j])
        df[j]=df[j].fillna(temp_mean)
    return df

# Output file to csv

In [131]:
def output_file(folder_path,df):
    csv_file_name=input("What do you want to name the new file?")
    output_path=folder_path+ "\\" +csv_file_name+'.csv'
    df.to_csv(output_path)


# Working Space

In [48]:
####1. Load relevant csv files. 
### INPUT -- folder_path for file

# Update folder_path to grab all csv files in folder
folder_path = r'C:\Users\MichaelTaylo_c9zoof3\Documents\GitHub\Ironhack-Labs\People Analytics\Data'

df_list=load_csv_dataframes(folder_path)


####2. Clean and standardize data
for i in df_list:
    
    #a. Lowercase columns
    i.columns=col_lowercase(i)
    
    #b. Lowercase string values
    i.update(string_lowercase(i))
    
    #c. Removes blank rows and duplicate records
    drop_duplicates_and_blank_lines(i)

####3. Merge files
if len(df_list)>1:
    #a. Check which columns are inconsistent across files
    print(list_unique_columns(df_list))

    #b. Rename columns
    df_list=rename_columns(df_list)

    #c. Merge files
    merge_files(df_list)

{1: ['state'], 2: ['state'], 3: ['st']}
What is the column name that you want to fix? (none if done)st
What is the correct column name? (none to cancel)state
What is the column name that you want to fix? (none if done)none
{'customer': array(['rb50392', 'qz44356', 'ai49188', ..., 'td14365', 'up19263',
       'y167826'], dtype=object), 'state': array(['washington', 'arizona', 'nevada', 'california', 'oregon', 'cali',
       'az', 'wa'], dtype=object), 'gender': array([nan, 'f', 'm', 'femal', 'male', 'female'], dtype=object), 'education': array(['master', 'bachelor', 'high school or below', 'college',
       'bachelors', 'doctor'], dtype=object), 'customer lifetime value': array([nan, '697953.59%', '1288743.17%', ..., 8163.890428, 7524.442436,
       2611.836866], dtype=object), 'number of open complaints': array(['1/0/00', '1/2/00', '1/1/00', '1/3/00', '1/5/00', '1/4/00', 0, 2,
       3, 1, 5, 4], dtype=object), 'policy type': array(['personal auto', 'corporate auto', 'special auto'], d

In [100]:
####4. Explore data -- review data by eye

#a. Check data types for accuracy
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9137 entries, 0 to 9136
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   customer                   9137 non-null   object 
 1   state                      9137 non-null   object 
 2   gender                     9015 non-null   object 
 3   education                  9137 non-null   object 
 4   customer lifetime value    9130 non-null   object 
 5   income                     9137 non-null   float64
 6   monthly premium auto       9137 non-null   float64
 7   number of open complaints  9137 non-null   object 
 8   policy type                9137 non-null   object 
 9   vehicle class              9137 non-null   object 
 10  total claim amount         9137 non-null   float64
dtypes: float64(3), object(8)
memory usage: 785.3+ KB


In [101]:
#b. Review content by eye. Check for problems.
qgrid.show_grid(df,grid_options={'forceFitColumns': False})

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': False, 'defa…

In [83]:
####5. Missing data

#a. Check missing value percentage to decide what to do with them
count_missing_values(df)

customer                     0.000000
state                        0.000000
gender                       0.013533
education                    0.000000
customer lifetime value      0.000767
income                       0.000000
monthly premium auto         0.000000
number of open complaints    0.000000
policy type                  0.000000
vehicle class                0.000000
total claim amount           0.000000
dtype: float64

In [None]:
#OPTIONAL 
#b. Drop columns due to high NaN.  Fill in 'drop_list' with columns you want to drop before running it
### INPUT -- drop_list of columns to drop

drop_list=[]

drop_columns(df,drop_list)

In [92]:
#OPTIONAL 
#c. Remove rows with Nans.  Keeps rows with at least so many values.  You can change the 'keep' threshold by updating the 'threshold_percent'
### INPUT -- threshold percent of rows to drop

threshold_percent=.5

drop_threshold=math.floor(len(df.columns)*threshold_percent)
df.dropna(axis=0,thresh=drop_threshold,inplace=True)

5

In [None]:
####6. Standardize Categorical

#a. Get current list of categorical values per column.  Check to see which need to be normalized
check_categoricals(df)

In [None]:
### OPTIONAL 
###b. Provide correct standards so that old values are re-written with new ones
### INPUT -- a dictionary of values

a={
    'sex':
        {'female':['f','femal'],
        'male':['male','m']}
}

standardize_categorical_names(df,a)

In [None]:
####7. Fix missing numbers

In [None]:
###OPTIONAL
###a. Replace blank with mean
### INPUT -- integers_with_nan if you want special list vs all integer columns


# Change missing number with mean.  If you want to apply to all integer columns, use first function below.  
# Otherwise, comment it out and customize the list.

#Gather all integer columns with NaN.
integers_with_nan=get_columns_with_nan(df)
#integers_with_nan=[list]


#Replace missing numbers with mean
df=replace_nan_integer_with_mean(df,integers_with_nan)

In [115]:
####8. Output dataframe to csv file
output_file(folder_path,df)

What do you want to name the new file?test2
