# Demonstration of Cleaning Function #

The goal of the cleaning function is to rename the columns so that they are meaningful,
and then to binarize each column's values so that they can easily be summed for statistical
analysis.

## Initial Dataframe ##

In [None]:
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

import re
import os


In [None]:
# Load the raw survey responses.
df = pd.read_csv("../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv")
# Discard the first data row (presumably a descriptive row rather than a
# response -- confirm against the CSV).
df = df.drop(index=df.index[0])
df.head()

## Cleaning Function ##

In [None]:
# One or more digits followed by an underscore, e.g. the "7_" in "Q7_Part_1".
# Written as a raw string: "\d" in a plain literal is an invalid escape
# sequence that newer Python versions warn about.
number_pattern = r"\d+_"

In [None]:
# Re-read the raw responses (same file as the preview cell above).
df = pd.read_csv("../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv")
# Discard the first data row, exactly as in the preview cell.
df = df.drop(index=df.index[0])

def clean_df(df):
    """Rename and binarize the multiple-choice ``*_Part_*`` columns in place.

    For every column whose name contains ``"_Part_"``:

    * the new name is the text before ``"_Part_"`` plus ``"_"`` plus the first
      non-null value found in the column (each such column is assumed to hold
      a single distinct answer text -- only the first one is used);
    * the values are replaced by booleans: ``True`` where a value was present,
      ``False`` where it was missing.

    The whole prefix before ``"_Part_"`` is kept (``"Q26_A_Part_1"`` becomes
    ``"Q26_A_<answer>"``), so lettered sub-questions such as ``Q26_A`` and
    ``Q26_B`` do not collapse onto the same column name.

    Columns that contain no non-null value cannot be named after an answer;
    they are reported with ``couldn't rename <col>`` and left untouched.

    Args:
        df: DataFrame of survey responses. It is modified in place.

    Returns:
        The same ``df`` object, for convenient chaining.
    """
    renames = {}

    for col in df.columns:
        # Only string labels can carry the "_Part_" marker.
        if not isinstance(col, str) or "_Part_" not in col:
            continue

        present = df[col].dropna()
        if present.empty:
            # No value to derive a name from; keep the column as it is.
            print("couldn't rename", col)
            continue

        question, _, _ = col.partition("_Part_")
        renames[col] = question + "_" + str(present.iloc[0])

        df[col] = df[col].notna()

    # Apply all renames at once instead of rebuilding the column index on
    # every loop iteration.
    df.rename(columns=renames, inplace=True)

    return df

# clean_df modifies `df` in place, so the later cells see the cleaned frame
# even though the return value is only used for this preview.
clean_df(df).head(3)

# Clean the colnames #

In [None]:
def clean_colnames(df):
    """Normalize the column names of ``df`` in place.

    Names are lower-cased, then the following literal substitutions are
    applied in order: ``" "`` -> ``"_"``, ``"("`` -> ``""``, ``")"`` -> ``"_"``,
    ``"+"`` -> ``"plus"``, ``"/"`` -> ``"or"``, ``"."`` -> ``"_"``. Finally
    every run of consecutive underscores is collapsed to a single one.

    Binarizing the free-text "other" columns is not done here; see
    ``fix_others``.

    Args:
        df: DataFrame whose column labels are strings. Modified in place.

    Returns:
        The same ``df`` object.
    """
    literal_swaps = (
        (" ", "_"),
        ("(", ""),
        (")", "_"),
        ("+", "plus"),
        ("/", "or"),
        (".", "_"),
    )

    cols = df.columns.str.lower()
    for old, new in literal_swaps:
        # regex=False: several of these characters are regex metacharacters and
        # the default value of `regex` differs between pandas versions.
        cols = cols.str.replace(old, new, regex=False)

    # Collapse any run of underscores. Replacing "__" and then "___" left runs
    # of three or more underscores as "__", because the first pass had already
    # consumed them.
    cols = cols.str.replace(r"_+", "_", regex=True)

    df.columns = cols
    return df

# clean_colnames renames the columns on `df` itself and returns that same object.
df = clean_colnames(df)
df.head()

## Fix the "Other" Columns, which were not converting

In [None]:
def fix_others(df):
    """Binarize, in place, every column whose name contains ``"other"``.

    Each matching column is replaced by booleans: ``True`` where a value was
    present, ``False`` where it was missing.

    Columns that are already boolean are skipped. Calling ``notna()`` on them
    would turn every entry into ``True`` and destroy the information, so the
    skip also makes the function safe to run more than once.

    Args:
        df: DataFrame to modify in place.

    Returns:
        The same ``df`` object.
    """
    # Snapshot labels and dtypes up front; the loop assigns columns on df.
    for col, dtype in zip(list(df.columns), list(df.dtypes)):
        if not isinstance(col, str) or "other" not in col:
            continue
        if dtype == bool:
            continue
        df[col] = df[col].notna()

    return df

# Convert the "other" columns to booleans (True where a value was present).
df = fix_others(df)

In [None]:
# Preview the first rows of the frame after the cleaning steps above.
df.head()

## Download the CSV here or use the dataset ##

In [None]:
# Write the cleaned frame to the working directory, omitting the row index.
df.to_csv("cleaned_kaggle_questionere.csv", index = False)

## Illustrations of Simpler Graphing and processing ##

You can now get proportions of answers easily.

In [None]:
# Average only the numeric/boolean columns. Columns that were not binarized
# above may still hold text, and pandas >= 2.0 raises a TypeError when asked
# to average those instead of silently dropping them.
df.mean(numeric_only=True)

### Use df.filter(like = "") to isolate questions

You can also filter the dataframe by question number to get just the subset of columns that matters to you.

In [None]:
# like= keeps every column whose label contains the substring "q35".
df.filter(like = "q35")

And then you can sum each of those columns to count how many respondents selected each answer.

In [None]:
# Column-wise sum; for boolean columns this counts the True entries
# (assumes the q35 columns were binarized by the cleaning steps above).
df.filter(like = "q35").sum()

In [None]:
question_number = "q35"

# Select the matching columns once; both the bar heights and the tick labels
# are derived from this subset.
question_cols = df.filter(like=question_number)
sums = question_cols.sum()

# x and y are passed by keyword because seaborn 0.12+ no longer accepts them
# positionally. A plain list/array pair also avoids any index alignment
# between the two inputs.
ax = sns.barplot(x=list(sums.index), y=sums.values)

# Drop the question prefix from the labels so only the answer part remains.
names = list(question_cols.columns)
remove_questions_names = [
    str(answer).replace(question_number, "") for answer in names
]

ax.set_xticklabels(
    remove_questions_names,
    rotation=60,
    horizontalalignment='right'
)

plt.title(question_number)
plt.show()