# Prepare environment

## Imports and load libraries

In [94]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import quantile_transform
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import OneHotEncoder  ##. better to use dummy from pandas 
from sklearn.preprocessing import PowerTransformer
from scipy.stats import boxcox
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from scipy.stats import boxcox
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
pd.options.display.max_rows = 50
import qgrid

## Load relevant dataframe from file

### One file with all data <font color='red'>(Need input)

In [95]:
#If you have only one file, then hoose your data source to load below
#It will be named 'df' for use in this notebook

df=sns.load_dataset('titanic')
file_list=[df]

### Multiple files that need to be merged <font color='red'>(Need input)

In [96]:
#If you have multiple files to merge, use this instead.
#Update the list of file names to ensure loops will work on the data in order to merge at the end

file1=pd.read_csv(r'C:\Users\MichaelTaylo_c9zoof3\Documents\GitHub\Ironhack-Labs\Case Studies\Customer Analysis Case Study\Data\file1.csv' names='file1')
file2=pd.read_csv(r'C:\Users\MichaelTaylo_c9zoof3\Documents\GitHub\Ironhack-Labs\Case Studies\Customer Analysis Case Study\Data\file2.csv')
file3=pd.read_csv(r'C:\Users\MichaelTaylo_c9zoof3\Documents\GitHub\Ironhack-Labs\Case Studies\Customer Analysis Case Study\Data\file3.csv')

file_list=[file1,file2,file3]

# Quick clean and standardizing data (e.g. lowercase titles)

## Lowercase column names

In [97]:
def col_lowercase(df):
    '''Takes one value, dataframe, and changes all columns to lowercase
    For easy comparison and merging of multiple dataframes'''
    df_temp = [i.lower() for i in df.columns]
    return df_temp

for i in file_list:
    i.columns=col_lowercase(i)

## Lowercase all string values in file

In [116]:
def string_lowercase(df):
    '''Takes one value, dataframe, and changes all values that are strings to all lowercase
    For easy identification of discrete values and for merging of dataframes'''
    df=df.applymap(lambda x:x.lower() if type(x) == str else x)
    return df

for i in file_list:
    i.update(string_lowercase(i))

Unnamed: 0,customer,st,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,total claim amount,policy type,vehicle class
0,gs98873,arizona,f,bachelor,323912.47%,16061,88,1/0/00,633.600000,personal auto,four-door car
1,cw49887,california,f,master,462680.11%,79487,114,1/0/00,547.200000,special auto,suv
2,my31220,california,f,college,899704.02%,54230,112,1/0/00,537.600000,personal auto,two-door car
3,uh35128,oregon,f,college,2580706.30%,71210,214,1/1/00,1027.200000,personal auto,luxury car
4,wh52799,arizona,f,college,380812.21%,94903,94,1/0/00,451.200000,corporate auto,two-door car
...,...,...,...,...,...,...,...,...,...,...,...
991,hv85198,arizona,m,master,847141.75%,63513,70,1/0/00,185.667213,personal auto,four-door car
992,bs91566,arizona,f,college,543121.91%,58161,68,1/0/00,140.747286,corporate auto,four-door car
993,il40123,nevada,f,college,568964.41%,83640,70,1/0/00,471.050488,corporate auto,two-door car
994,my32149,california,f,master,368672.38%,0,96,1/0/00,28.460568,personal auto,two-door car


## Removing duplicate records

In [8]:
for i in file_list:
    #Removes duplicates if entire row is the same
    i.drop_duplicates(inplace=True)
    #Removes rows if they are completely blank
    i.dropna(how="all",axis=0, inplace=True)

# Merging files

## Check files for unique columns

In [151]:
#Gather column names from all files in a list
all_col_names=[]
temp_col_names=[]
unmatched_col_names=[]
unmatched_col_dict={}

for i in file_list:
    [all_col_names.append(col) for col in i.columns]
        

#Remove duplicates to create a unique set of column names
unique_col_names=list(set(list_col_names))

#Check which files are missing which column names
for x in file_list:
    [temp_col_names.append(col) for col in x.columns]
    [unmatched_col_names.append(y) for y in unique_col_names if y not in temp_col_names]        
    unmatched_col_dict[x.index.name]=unmatched_col_names
    temp_col_names=[]
    unmatched_col_names=[]
unmatched_col_dict

{None: ['st']}

AttributeError: 'DataFrame' object has no attribute 'name'

# Replace missing values

## Replace missing numbers with mean

In [11]:
# This creates a list ## integers_with_nan ## which is a list of numerical columns with blanks in them.  
#The next parts will give you options for how to handle the blanks

col_list=[]
col_list=df.columns
integers_with_nan=[]
for i in col_list:
    if df[i].isnull().values.any()==True:
        if df[i].dtype!='O':
            integers_with_nan.append(i)

In [None]:
#Fill missing numbers with the average of the column
for j in integers_with_nan:
    temp_mean=np.mean(df[j])
    df[j]=df[j].fillna(temp_mean)

# Review data

## Check data types and blanks

In [None]:
df.info()

## Eyeball check data for issues

In [None]:
qgrid.show_grid(df,grid_options={'forceFitColumns': False})

# Fix categorical values

## Check categorical columns for misspellings or issues

In [10]:
# Creates a copy of the dataframe for string-based columns only.  
# Then, creates a dictionary of the column (key) plus array of values (value) to show the results
categorical_df=df.select_dtypes(object)
unique_dict={col: categorical_df[col].unique() for col in categorical_df}
unique_dict

{'sex': array(['male', 'female'], dtype=object),
 'embarked': array(['s', 'c', 'q', nan], dtype=object),
 'class': array(['third', 'first', 'second'], dtype=object),
 'who': array(['man', 'woman', 'child'], dtype=object),
 'deck': array([nan, 'c', 'e', 'g', 'd', 'a', 'b', 'f'], dtype=object),
 'embark_town': array(['southampton', 'cherbourg', 'queenstown', nan], dtype=object),
 'alive': array(['no', 'yes'], dtype=object)}

## Fix categorical values by replacing values with new ones

### Replace a list of terms with one standard version <font color='red'>(Need input)

In [12]:
#Fill in 'a' below

# Creates a dictionary ##a## of values to correct.  
# Outer dictionary is the column to correct
# Inner dictionary key is the correct spelling, then list (dictionary value) is the incorrect spellings to find and replace

a={
    'sex':
        {'female':['f','femal'],
        'male':['male','m']}
}


#This code will run through the dictionary a and update string values in the dataframe
#Using the new string values
for key in a.keys():
    for key2 in a[key].keys():
        list_values=a[key][key2]
        df.loc[df[str(key)].isin(list_values),key]=str(key2)

### Replace terms 1:1 with an alternative

In [None]:
#Fill in these three with correct values, then run to update a column with new values.
original_version=['f','m']          #List in order of terms in column to change
new_version=['female','male']       #List in order of original version list of new terms to use
column_name='gender'                #Column name to use in action

replace_map=dict(zip(original_version,new_version))

df[column_name].replace(replace_map,inplace=True)
