# Prepare environment

## Import libraries

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import quantile_transform
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import OneHotEncoder  ##. better to use dummy from pandas 
from sklearn.preprocessing import PowerTransformer
from scipy.stats import boxcox
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from scipy.stats import boxcox
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
pd.options.display.max_rows = 50
import qgrid

## Load relevant dataframe from file

### One file with all data <font color='red'>(Need input)</font>

In [12]:
#If you have only one file, then choose your data source to load below
#It will be named 'df' for use in this notebook
#'file_list' wraps that single frame so the cleaning loops further down,
#which iterate over 'file_list', also work in the one-file case

df=sns.load_dataset('titanic')
file_list=[df]

### Multiple files that need to be merged <font color='red'>(Need input)</font>

In [13]:
#If you have multiple files to merge, use this cell instead of the single-file cell.
#Set 'data_dir' to the folder that holds the CSVs and list the file names in 'csv_names'.
#The order of 'csv_names' is the order of 'file_list', which the loops below iterate over
#and which is used when the files are merged at the end.

data_dir=r'C:\Users\MichaelTaylo_c9zoof3\Documents\GitHub\Ironhack-Labs\Case Studies\Customer Analysis Case Study\Data'
csv_names=['file1.csv','file2.csv','file3.csv']

#The folder is spelled out once, so pointing the notebook at another location is a one-line change.
#The joined path is identical to writing the full path for each file by hand.
file_list=[pd.read_csv(data_dir + '\\' + csv_name) for csv_name in csv_names]

#Keep the individual names bound to the same frames for any cell that refers to them directly
file1,file2,file3=file_list

# Quick cleaning and standardizing of data (e.g. lowercase column titles)

## Lowercase column names

In [14]:
def col_lowercase(df):
    '''Return the column labels of a dataframe converted to lowercase.

    Useful for easy comparison and merging of multiple dataframes.
    The dataframe itself is not modified: assign the returned list back to
    ``df.columns`` to apply it.

    Parameters
    ----------
    df : object exposing ``.columns`` (e.g. a pandas DataFrame)

    Returns
    -------
    list
        One entry per column, in the original order. String labels are
        lowercased; non-string labels (e.g. integer column names) are
        returned unchanged instead of raising ``AttributeError``.
    '''
    return [label.lower() if isinstance(label, str) else label for label in df.columns]

# Overwrite the column labels of every loaded dataframe with their lowercase form
for frame in file_list:
    lowered_labels = col_lowercase(frame)
    frame.columns = lowered_labels

## Lowercase all string values in file

In [15]:
def string_lowercase(df):
    '''Return a copy of a dataframe with every string value lowercased.

    Useful for easy identification of discrete values and for merging of
    dataframes. Non-string values (numbers, missing values, ...) are passed
    through untouched, and the input dataframe is not modified.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        Same shape, index and columns as ``df``.
    '''
    def _lower_if_str(value):
        # isinstance (rather than an exact type comparison) also covers str subclasses
        return value.lower() if isinstance(value, str) else value

    # DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1).
    # The check is made on the class, not the instance, so that a column that
    # happens to be called 'map' cannot be mistaken for the method.
    if hasattr(pd.DataFrame, 'map'):
        return df.map(_lower_if_str)
    return df.applymap(_lower_if_str)

# Lowercase the string values of every loaded dataframe.
# string_lowercase returns a new frame; update() then writes its values back
# into the original object so that file_list keeps pointing at the same frames.
for frame in file_list:
    lowered = string_lowercase(frame)
    frame.update(lowered)

## Removing duplicate records

In [16]:
# Tidy every dataframe in place:
#   1. discard rows that repeat an earlier row in every column
#   2. discard rows in which every column is blank
for frame in file_list:
    frame.drop_duplicates(inplace=True)
    frame.dropna(axis=0, how="all", inplace=True)

# Merging files

## Check files for unique columns

In [19]:
#Gather column names from all files in a single list (duplicates included)
all_col_names=[col for frame in file_list for col in frame.columns]

#Remove duplicates to create a unique list of column names.
#dict.fromkeys keeps the order of first appearance, so the result is the same
#on every run (a set would give an arbitrary order).
unique_col_names=list(dict.fromkeys(all_col_names))

#Check which files are missing which column names.
#Result: {file number (1-based, in file_list order): [column names absent from that file]}
unmatched_col_dict={}
for file_num,frame in enumerate(file_list,start=1):
    present=set(frame.columns)   #set membership instead of scanning a list for every name
    unmatched_col_dict[file_num]=[col for col in unique_col_names if col not in present]

unmatched_col_dict

{1: [], 2: [], 3: []}

## Space for renaming columns <font color='red'>(Need input)</font>

In [23]:
#For a quick rename, fill in the dictionary as {old column name: new column name}.
#The rename is applied in place to every dataframe in file_list.
column_renames={"st":"state"}

for frame in file_list:
    frame.rename(columns=column_renames,inplace=True)

In [None]:
#If the rename is easier to express as two parallel lists, fill these in instead.
#old_names[k] is renamed to new_names[k], so both must be ORDERED lists of equal length
#(sets have no order and would pair the names arbitrarily).
old_names=['a','b','c']
new_names=['x','y','z']

#rename() needs a dict-like mapping (or a function); a bare zip object is neither,
#and as an iterator it would also be used up after the first dataframe.
zip_file=dict(zip(old_names,new_names))

#Applied in place to every dataframe in file_list
for frame in file_list:
    frame.rename(columns=zip_file,inplace=True)

## Merging of files

In [27]:
#Stack all dataframes in file_list on top of each other (axis=0 = row-wise) into one
#dataframe 'df'; ignore_index=True discards the per-file indexes and renumbers the rows
df=pd.concat(file_list,axis=0,ignore_index=True)

Unnamed: 0,customer,state,gender,education,customer lifetime value,income,monthly premium auto,number of open complaints,policy type,vehicle class,total claim amount
0,rb50392,washington,,master,,0.0,1000.0,1/0/00,personal auto,four-door car,2.704934
1,qz44356,arizona,f,bachelor,697953.59%,0.0,94.0,1/0/00,personal auto,four-door car,1131.464935
2,ai49188,nevada,f,bachelor,1288743.17%,48767.0,108.0,1/0/00,personal auto,two-door car,566.472247
3,ww63253,california,m,bachelor,764586.18%,0.0,106.0,1/0/00,corporate auto,suv,529.881344
4,ga49547,washington,m,high school or below,536307.65%,36357.0,68.0,1/0/00,personal auto,four-door car,17.269323
...,...,...,...,...,...,...,...,...,...,...,...
9132,la72316,california,m,bachelor,23405.98798,71941.0,73.0,0,personal auto,four-door car,198.234764
9133,pk87824,california,f,college,3096.511217,21604.0,79.0,0,corporate auto,four-door car,379.200000
9134,td14365,california,m,bachelor,8163.890428,0.0,85.0,3,corporate auto,four-door car,790.784983
9135,up19263,california,m,college,7524.442436,21941.0,96.0,0,personal auto,four-door car,691.200000


# Replace missing values

## Replace missing numbers with mean

In [11]:
# This creates a list ## integers_with_nan ## which is a list of numerical columns with blanks in them.
# The next parts will give you options for how to handle the blanks.
#
# A column qualifies only if its dtype is genuinely numeric. Testing for
# "not object" alone would also let category or datetime columns through,
# and a category column with blanks cannot be averaged in the mean-fill step.

col_list=df.columns
integers_with_nan=[
    col for col in col_list
    if pd.api.types.is_numeric_dtype(df[col]) and df[col].isnull().values.any()
]

In [None]:
#Replace the blanks of each listed column with that column's own average
for col_name in integers_with_nan:
    column_mean=df[col_name].mean()
    df[col_name]=df[col_name].fillna(column_mean)

# Review data

## Check data types and blanks

In [None]:
#Print the dtype and non-null count of every column to spot wrong types and remaining blanks
df.info()

## Eyeball check data for issues

In [None]:
#Show the dataframe in an interactive qgrid table for a manual look through the data.
#NOTE(review): 'forceFitColumns': False presumably stops the grid from squeezing all
#columns into the visible width - confirm against the qgrid/SlickGrid options.
qgrid.show_grid(df,grid_options={'forceFitColumns': False})

# Fix categorical values

## Check categorical columns for misspellings or issues

In [10]:
# Keep only the object-dtype (string-based) columns in a separate dataframe
categorical_df = df.select_dtypes(include=object)

# Map every such column name to the array of distinct values found in it,
# so misspellings and near-duplicates are easy to spot
unique_dict = {
    col_name: categorical_df[col_name].unique()
    for col_name in categorical_df.columns
}
unique_dict

{'sex': array(['male', 'female'], dtype=object),
 'embarked': array(['s', 'c', 'q', nan], dtype=object),
 'class': array(['third', 'first', 'second'], dtype=object),
 'who': array(['man', 'woman', 'child'], dtype=object),
 'deck': array([nan, 'c', 'e', 'g', 'd', 'a', 'b', 'f'], dtype=object),
 'embark_town': array(['southampton', 'cherbourg', 'queenstown', nan], dtype=object),
 'alive': array(['no', 'yes'], dtype=object)}

## Fix categorical values by replacing values with new ones

### Replace a list of terms with one standard version <font color='red'>(Need input)</font>

In [12]:
#Fill in 'a' below

# 'a' describes the corrections to make:
#   outer key   -> column to correct
#   inner key   -> the correct spelling
#   inner value -> list of spellings to find and replace with the correct one

a = {
    'sex': {
        'female': ['f', 'femal'],
        'male': ['male', 'm'],
    },
}


# Walk through 'a' and overwrite every listed variant in the dataframe
# with the standard spelling it belongs to
for column, corrections in a.items():
    for correct_value, variants in corrections.items():
        needs_fix = df[str(column)].isin(variants)
        df.loc[needs_fix, column] = str(correct_value)

### Replace terms 1:1 with an alternative

In [None]:
#Fill in these three with correct values, then run to update a column with new values.
original_version=['f','m']          #List in order of terms in column to change
new_version=['female','male']       #List in order of original version list of new terms to use
column_name='gender'                #Column name to use in action

#Pair each original term with its replacement: {original: new}
replace_map=dict(zip(original_version,new_version))

#Assign the result back to the column. Calling replace(..., inplace=True) on
#df[column_name] acts on a selection of the dataframe, which warns in recent
#pandas and leaves df unchanged when Copy-on-Write is enabled.
df[column_name]=df[column_name].replace(replace_map)
